# Modeling Sandbox

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## Using Data where All Nulls were Dropped

I wanted to experiment the same classification models using the dataset where I straight up dropped **all** null values in the original training dataset.

In [13]:
# Load cleaned data
train = pd.read_csv('../data/processed/train_all_nulls_dropped.csv')

In [14]:
df = train.copy()

### Feature Engineering

Before modeling I had to perform the same feature engineering steps I did with the other dataset.

In [15]:
# Encoding the booleans in these two categories to 0 (False)m 1(True), and -1 (NaN) 
df['public_meeting'] = df['public_meeting'].fillna('Unknown')
df['public_meeting'] = df['public_meeting'].replace({'False': 0, 'True': 1, 'Unknown': -1})

df['permit'] = df['permit'].fillna('Unknown')
df['permit'] = df['permit'].replace({'False': 0, 'True': 1, 'Unknown': -1})

In [16]:
# Creating a new feature that represents the age of the pump
df['construction_year'] = df['construction_year'].replace(0, 10000) # Replacing 0 with 10000 to capture invalid pump ages
df['pump_age'] = pd.DatetimeIndex(df['date_recorded']).year - df['construction_year']

invalid_pump_age = df['pump_age'] < 0
df.loc[invalid_pump_age, 'pump_age'] = -1 

In [17]:
# Dictionary matching months to their corresponding seasons
# 0: ShortDry, 1: LongRainy, 2: LongDry, 3: ShortRainy
seasons = {
    1: 0, 2: 0,
    3: 1, 4: 1, 5: 1,
    6: 2, 7: 2, 8: 2, 9: 2, 10: 2,
    11: 3, 12: 3,
}

# Creating the 'season' column
df['season'] = pd.DataFrame({'Month': pd.DatetimeIndex(df['date_recorded']).month})
df['season'] = df['season'].apply(lambda month: seasons[month])

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32543 entries, 0 to 32542
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date_recorded          32543 non-null  object 
 1   funder                 32543 non-null  object 
 2   gps_height             32543 non-null  float64
 3   installer              32543 non-null  object 
 4   longitude              32543 non-null  float64
 5   latitude               32543 non-null  float64
 6   basin                  32543 non-null  object 
 7   region                 32543 non-null  object 
 8   district_code          32543 non-null  int64  
 9   lga                    32543 non-null  object 
 10  population             32543 non-null  float64
 11  public_meeting         32543 non-null  bool   
 12  permit                 32543 non-null  bool   
 13  construction_year      32543 non-null  float64
 14  extraction_type_class  32543 non-null  object 
 15  ma

In [19]:
# Replace 'functional needs repair' with 'non functional'
df['target'] = df['target'].replace('functional needs repair', 'non functional')

In [22]:
df.to_csv('../data/final/train_all_nulls_dropped_final.csv.gz', index=False, compression='gzip')

In [12]:
train_features = df.drop('target', axis=1)
train_target = df['target']

In [13]:
# Unneeded and redundant columns to drop
columns_to_drop = ['id', 'wpt_name', 'num_private', 'subvillage', 'ward', 'recorded_by', 'scheme_name', 
                    'scheme_management', 'water_quality', 'waterpoint_type_group', 'quantity_group', 'region_code', 
                    'extraction_type', 'extraction_type_group', 'payment', 'source_class', 'source_type',
                    'funder', 'installer', 'longitude', 'latitude', 'date_recorded', 'construction_year',
                    'district_code']

In [16]:
columns_to_drop = [col for col in columns_to_drop if col in train_features.columns]

In [18]:
train_features = train_features.drop(columns=columns_to_drop)

In [20]:
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32543 entries, 0 to 32542
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gps_height             32543 non-null  float64
 1   basin                  32543 non-null  object 
 2   region                 32543 non-null  object 
 3   lga                    32543 non-null  object 
 4   population             32543 non-null  float64
 5   public_meeting         32543 non-null  bool   
 6   permit                 32543 non-null  bool   
 7   extraction_type_class  32543 non-null  object 
 8   management             32543 non-null  object 
 9   management_group       32543 non-null  object 
 10  payment_type           32543 non-null  object 
 11  quality_group          32543 non-null  object 
 12  quantity               32543 non-null  object 
 13  source                 32543 non-null  object 
 14  waterpoint_type        32543 non-null  object 
 15  pu

In [21]:
train_target = train_target.map({'functional': 0, 'non functional': 1})
train_target.value_counts()

target
0    18312
1    14231
Name: count, dtype: int64

### Encoding and Training Classification Models

In [22]:
# Import packages for categorical encoding
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from category_encoders import OrdinalEncoder
from category_encoders import CountEncoder
from category_encoders import HashingEncoder
from category_encoders import BackwardDifferenceEncoder
from category_encoders import HelmertEncoder
from category_encoders import CatBoostEncoder
from category_encoders import GLMMEncoder

In [23]:
# Encoding methods
random_state = 42

encoding_methods = {
    'ordinal': OrdinalEncoder(),
    'count': CountEncoder(),
    'hashing': HashingEncoder(n_components=32, drop_invariant=True),
    'backward_difference': BackwardDifferenceEncoder(),
    'Helmert': HelmertEncoder(),
    'CatBoost': CatBoostEncoder(random_state=random_state),
    'GLMM': GLMMEncoder(random_state=random_state)
}

In [24]:
# Classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(n_jobs=-1, random_state=random_state),
    'Decision Tree': DecisionTreeClassifier(random_state=random_state),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=random_state, n_jobs=-1),
}

In [25]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, test_size=0.2, random_state=random_state)

In [26]:
classifier_best_results = {classifier_name: (0, 'encoder') for classifier_name in classifiers.keys()}


for encoding_method, encoder in encoding_methods.items():
    print(f'\n----- Encoding data using {encoding_method} Encoder -----', end='')

    start_time = time.time()

    encoded_X_train = encoder.fit_transform(X_train, y_train)
    encoded_X_test = encoder.transform(X_test)

    end_time = time.time()
    print(f'Done in {round(end_time-start_time, 2)}s')

    for classifier_name, clf_algorithm in classifiers.items():
        print(f'Training {classifier_name} Classifier', end='')

        start_time = time.time()

        clf_algorithm.fit(encoded_X_train, y_train)
        y_pred = clf_algorithm.predict(encoded_X_test)
        acc = accuracy_score(y_test, y_pred)

        end_time = time.time()
        print(f'Done in {round(end_time-start_time, 2)}s')

        previous_acc, _ = classifier_best_results[classifier_name]
        if previous_acc < acc:
            classifier_best_results[classifier_name] = (acc, encoding_method)


for classifier_name, (score, encoding_method) in classifier_best_results.items():
    print(f'\nClassifier: {classifier_name}\tBest Score: {score} on Test Data\t Using {encoding_method} Encoder')


----- Encoding data using ordinal Encoder -----Done in 0.13s
Training Logistic Regression Classifier

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 2.45s
Training Decision Tree ClassifierDone in 0.1s
Training Naive Bayes ClassifierDone in 0.01s
Training Random Forest ClassifierDone in 0.39s

----- Encoding data using count Encoder -----Done in 0.13s
Training Logistic Regression Classifier

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 0.63s
Training Decision Tree ClassifierDone in 0.1s
Training Naive Bayes ClassifierDone in 0.01s
Training Random Forest ClassifierDone in 0.49s

----- Encoding data using hashing Encoder -----Done in 0.27s
Training Logistic Regression Classifier

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 0.65s
Training Decision Tree ClassifierDone in 0.12s
Training Naive Bayes ClassifierDone in 0.01s
Training Random Forest ClassifierDone in 0.39s

----- Encoding data using backward_difference Encoder -----



Done in 0.34s
Training Logistic Regression Classifier

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 0.89s
Training Decision Tree ClassifierDone in 0.23s
Training Naive Bayes ClassifierDone in 0.04s
Training Random Forest ClassifierDone in 0.49s

----- Encoding data using Helmert Encoder -----



Done in 0.32s
Training Logistic Regression ClassifierDone in 0.41s
Training Decision Tree ClassifierDone in 0.3s
Training Naive Bayes ClassifierDone in 0.03s
Training Random Forest ClassifierDone in 0.51s

----- Encoding data using CatBoost Encoder -----Done in 0.1s
Training Logistic Regression Classifier

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 0.66s
Training Decision Tree ClassifierDone in 0.51s
Training Naive Bayes ClassifierDone in 0.01s
Training Random Forest ClassifierDone in 1.24s

----- Encoding data using GLMM Encoder -----Done in 8.98s
Training Logistic Regression Classifier

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done in 0.79s
Training Decision Tree ClassifierDone in 0.11s
Training Naive Bayes ClassifierDone in 0.01s
Training Random Forest ClassifierDone in 0.4s

Classifier: Logistic Regression	Best Score: 0.7531110769703487 on Test Data	 Using GLMM Encoder

Classifier: Decision Tree	Best Score: 0.7807650944845599 on Test Data	 Using GLMM Encoder

Classifier: Naive Bayes	Best Score: 0.7320632969734214 on Test Data	 Using CatBoost Encoder

Classifier: Random Forest	Best Score: 0.8204025195882624 on Test Data	 Using GLMM Encoder


In [27]:
scores = []
pipelines = []

for classifier_name, metric in classifier_best_results.items():
    score, encoding_method = metric
    encoder = encoding_methods[encoding_method]
    encoded_X_train = encoder.fit_transform(X_train, y_train)
    encoded_X_test = encoder.transform(X_test)

    clf = classifiers[classifier_name]
    clf.fit(encoded_X_train, y_train)
    y_pred = clf.predict(encoded_X_test)

    scores.append(score)
    pipelines.append(classifier_name + ' - ' + encoding_method)

    print(f'\n-----Classifier: {classifier_name}\tEncoding: {encoding_method}-----')
    print(classification_report(y_test, y_pred))


-----Classifier: Logistic Regression	Encoding: GLMM-----
              precision    recall  f1-score   support

           0       0.74      0.86      0.80      3649
           1       0.78      0.62      0.69      2860

    accuracy                           0.75      6509
   macro avg       0.76      0.74      0.74      6509
weighted avg       0.76      0.75      0.75      6509


-----Classifier: Decision Tree	Encoding: GLMM-----
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      3649
           1       0.75      0.75      0.75      2860

    accuracy                           0.78      6509
   macro avg       0.78      0.78      0.78      6509
weighted avg       0.78      0.78      0.78      6509


-----Classifier: Naive Bayes	Encoding: CatBoost-----
              precision    recall  f1-score   support

           0       0.71      0.87      0.78      3649
           1       0.77      0.56      0.65      2860

    accuracy      