In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

dataset_url = "https://drive.google.com/uc?id=1MDZ8rsTEW8ETdiUXwdBKM9HW-Re92VAX"
df = pd.read_csv(dataset_url)


In [12]:
print(df.describe())
print('§§§§§§§§§§§')
print(df.isnull().sum())
print('§§§§§§§§§§§')
print(df.info())

               year  bank_account  household_size  age_of_respondent
count  23524.000000  23524.000000    23524.000000       23524.000000
mean    2016.975939      0.140792        3.797483          38.805220
std        0.847371      0.347815        2.227613          16.520569
min     2016.000000      0.000000        1.000000          16.000000
25%     2016.000000      0.000000        2.000000          26.000000
50%     2017.000000      0.000000        3.000000          35.000000
75%     2018.000000      0.000000        5.000000          49.000000
max     2018.000000      1.000000       21.000000         100.000000
§§§§§§§§§§§
country                   0
year                      0
uniqueid                  0
bank_account              0
location_type             0
cellphone_access          0
household_size            0
age_of_respondent         0
gender_of_respondent      0
relationship_with_head    0
marital_status            0
education_level           0
job_type                  0
dty

In [9]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,1,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,0,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,1,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,0,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,0,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib



# Drop unnecessary columns

# Encode categorical features
categorical_columns = [
    "country",
    "location_type", # Removed 'bank_account' from this list
    "cellphone_access",
    "gender_of_respondent",
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
]
target_column = "bank_account"  # Assuming this is the target variable

# Encoding the target variable
le = LabelEncoder()
df[target_column] = le.fit_transform(df[target_column])

# Define feature matrix (X) and target vector (y)
X = df.drop(columns=[target_column, 'uniqueid']) # Dropping 'uniqueid' here
y = df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    remainder="passthrough",  # Pass through numerical columns
)

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Set up a pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", rf)])

# Set up GridSearchCV parameters
param_grid = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [10, 20, None],
    "classifier__min_samples_split": [2, 5, 10],
}
grid_search = GridSearchCV(
    pipeline, param_grid, cv=3, scoring="accuracy", verbose=3, n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Evaluate the model
y_pred = grid_search.best_estimator_.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the model for Streamlit
joblib.dump(grid_search.best_estimator_, "rf_model.pkl")
print("Model saved as rf_model.pkl")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Accuracy: 0.8913921360255048
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      4063
           1       0.75      0.30      0.43       642

    accuracy                           0.89      4705
   macro avg       0.83      0.64      0.69      4705
weighted avg       0.88      0.89      0.87      4705

Model saved as rf_model.pkl
