# Modeling Scratchwork

## Loading in Libraries and Data

In [32]:
# Packages for data cleaning, plotting, and manipulation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# scikit-learn libraries/functions/classes

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, RidgeCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
                             StackingClassifier, ExtraTreesClassifier
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer

In [33]:
# Display setting: remove the cap on how many columns pandas renders, so
# wide DataFrames are never truncated horizontally.
# Comment this out to restore the default (truncated) view.

pd.options.display.max_columns = None

In [34]:
# Importing training data (feature values)
train_val = pd.read_csv('../data/training_set_values.csv')

# Labels are read *with* their `id` column so that row alignment against
# the values file can be verified. The two frames are combined side by
# side by position, which would silently pair rows with the wrong label
# if the files differed in ordering or length.
train_label = pd.read_csv('../data/training_set_labels.csv',
                          usecols = ['id', 'status_group'])

if not train_val['id'].equals(train_label['id']):
    raise ValueError('`id` columns of training_set_values.csv and '
                     'training_set_labels.csv do not match row-for-row')

# Keep only `status_group` (still a one-column DataFrame) to
# avoid duplicating the `id` column
train_label = train_label[['status_group']]

In [35]:
# Place the feature values and the labels side by side (column-wise)
# so that both live in a single DataFrame
df = pd.concat([train_val, train_label], axis = 'columns')

## Preprocessing

### Feature Selection

In [36]:
# Columns removed before modeling: exploratory analysis found each of
# them to be either irrelevant or redundant with a column that is kept.

cols_to_drop = [
    "id",                     # unique row identifier - no predictive value
    "date_recorded",          # superfluous; too many unique records
    "recorded_by",            # a single value across all 59.4k rows
    "funder",                 # high cardinality (same issue as `installer`)
    "installer",              # high cardinality; may be added back in later
    "wpt_name",               # identifier column - not useful for modeling
    "num_private",            # data dictionary gives no details for this column
    "subvillage",             # too many unique values to be informative
    "region_code",            # redundant vis-a-vis the simpler `region`
    "district_code",          # may be added back in later
    "lga",                    # location data (see `ward`)
    "ward",                   # redundant location data (with `lga`)
    "scheme_management",      # may be added back in later
    "scheme_name",            # many nulls; redundant vis-a-vis `scheme_management`
    "extraction_type",        # `extraction_type_class` is used instead
    "extraction_type_group",  # `extraction_type_class` gives the generalized info
    "management",             # see `management_group`
    "management_group",       # may be added back in later
    "payment",                # identical information to `payment_type`
    "water_quality",          # comparable to `quality_group` - redundant
    "quantity_group",         # identical information to `quantity` - redundant
    "source",                 # redundant with the other `source_` columns
    "waterpoint_type_group",  # `waterpoint_type` is used instead
]

In [37]:
# Remove the columns listed in `cols_to_drop`; rebinds `df` to the reduced frame
df = df.drop(labels = cols_to_drop, axis = 'columns').copy()

### Pipeline Creation

Necessary modifications for modeling, to be written into pipelines:

- One-hot encoding:
    - `basin`
    - `extraction_type_class`
    - `payment_type`
    - `permit`
    - `public_meeting`
    - `quality_group`
    - `quantity`
    - `region`
    - `source_type`
    - `source_class`
    - `waterpoint_type`
- Numerical scaling:
    - `population` - (impute zeroes with median?)
    - `gps_height` - impute zeroes with median
    - `latitude` / `longitude` - impute placeholder values (`-2e-08` / `0`) with median
    - `construction_year` - impute zeroes with median (KNN imputing may be revisited later)
    
Our target variable, `status_group`, will also be **label encoded** for readability and consistency:
- `0` = 'functional'
- `1` = 'non functional'
- `2` = 'functional needs repair'

In [38]:
# Subpipe for `latitude`: the placeholder value -2e-08 is treated as
# "missing" and replaced with the column median, then standardized
subpipe_lat      = Pipeline(steps=[('num_impute', SimpleImputer(missing_values = -2.000000e-08,
                                                                strategy = 'median')),
                                   ('ss', StandardScaler())])

# Subpipe for `longitude`: same idea, with 0 as the "missing" placeholder
subpipe_long     = Pipeline(steps=[('num_impute', SimpleImputer(missing_values = 0.000000,
                                                                strategy = 'median')),
                                   ('ss', StandardScaler())])


# Subpipe for the general numeric columns: median-imputes whatever
# `SimpleImputer` considers missing by default (NaN), then standardizes.
# NOTE(review): zeroes are NOT treated as missing here - confirm that is
# intended for columns such as `gps_height` and `population`.
subpipe_num      = Pipeline(steps=[('num_impute', SimpleImputer(strategy = 'median')),
                                   ('ss', StandardScaler())])

# Subpipe for `construction_year`: 0 is treated as "missing" and replaced
# with the column median, then standardized
subpipe_year     = Pipeline(steps=[('num_impute', SimpleImputer(missing_values = 0,
                                                                strategy = 'median')),
                                   ('ss', StandardScaler())])

# Dense one-hot encoder shared by the categorical subpipe.
# `OneHotEncoder`'s `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing `sparse` fails with a
# TypeError on current releases. Try the new name first and fall back to
# the old one so the cell runs on either side of the rename.
try:
    dense_ohe = OneHotEncoder(drop = 'if_binary',
                              sparse_output = False,
                              handle_unknown = 'ignore')
except TypeError:
    dense_ohe = OneHotEncoder(drop = 'if_binary',
                              sparse = False,
                              handle_unknown = 'ignore')

# Subpipe for categorical features including `basin`, `payment_type`:
# most-frequent imputation is applied twice - first to NaN, then to the
# literal string 'unknown' - before one-hot encoding
subpipe_cat      = Pipeline(steps=[('freq_imputer_nan', SimpleImputer(strategy = 'most_frequent')),
                                   ('freq_imputer_unk', SimpleImputer(strategy = 'most_frequent',
                                                                      missing_values = 'unknown')),
                                   ('ohe', dense_ohe)])


# Subpipe for the target column, `status_group`
# NOTE(review): `LabelEncoder` is meant to be called directly on a target
# vector (`fit(y)` / `transform(y)`); wrapped in a `Pipeline` it will
# likely fail when fitted, because pipelines pass `X` and `y` to each
# step - verify before wiring this into any transformer.
subpipe_label    = Pipeline(steps=[('le', LabelEncoder())])

In [39]:
# Numeric features handled by the generic numerical sub-pipeline
num_cols = ['amount_tsh', 'gps_height', 'population']

# Categorical features handled by the categorical sub-pipeline
# (imputation followed by one-hot encoding)
cat_cols = [
    'basin', 'region',
    'payment_type',
    'quantity', 'quality_group',
    'permit', 'public_meeting',
    'extraction_type_class',
    'source_type', 'source_class',
    'waterpoint_type',
]

In [40]:
# Route each group of columns through its own sub-pipeline; any column
# not named below is passed through unchanged
ct = ColumnTransformer(
    transformers = [
        ('subpipe_num',  subpipe_num,  num_cols),
        ('subpipe_year', subpipe_year, ['construction_year']),
        ('subpipe_long', subpipe_long, ['longitude']),
        ('subpipe_lat',  subpipe_lat,  ['latitude']),
        ('subpipe_cat',  subpipe_cat,  cat_cols),
    ],
    remainder = 'passthrough',
)

# The label sub-pipeline is currently left out of `ct`:
# ('subpipe_label', subpipe_label, ['status_group'])

### Train/Test Split and Initial Preparation for ML

In [41]:
# Separate the target column (`y`, a Series of labels) from
# the remaining feature columns (`X`, a DataFrame)

y = df['status_group']
X = df.drop(columns = 'status_group')

In [42]:
# Hold out 33% of the labelled data as an internal test set for
# (eventual) validation; the fixed seed keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.33,
    random_state = 138,
)

## Modeling

### `DummyClassifier`

In [43]:
# Baseline: shared preprocessing followed by a `DummyClassifier`
# left at its default settings
dummy_model_pipe = Pipeline([('ct', ct),
                             ('dummy', DummyClassifier())])

In [44]:
# Fit on the training data, then report accuracy on that same training
# data (`fit` returns the fitted pipeline, so the calls can be chained)
dummy_model_pipe.fit(X_train, y_train).score(X_train, y_train)

0.5420875420875421

scikit-learn's `DummyClassifier` predicts on the training data with an accuracy score of ~0.542, equal to the proportion of the **most frequent class** (`functional`). This is because, as a dummy model, it predicts `functional` (the most frequent value) every time.

We'll be looking to improve on that 54.2% accuracy in future models.

### Simple Models - `LogisticRegression`

#### Simple Model 1 - Default hyperparameters

In [45]:
# Logistic regression with every hyperparameter at its default
# (so `max_iter` stays at 100); only the random seed is fixed
logreg_model_simple = Pipeline([
    ('ct', ct),
    ('logreg', LogisticRegression(random_state = 138)),
])

In [46]:
# Fit, then report training-set accuracy
logreg_model_simple.fit(X_train, y_train).score(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7270968390371375

> First simple model is giving us an accuracy rate of **~72.71%** - and a `ConvergenceWarning`.

##### Simple Model 1.1 - increasing `max_iter`

In [47]:
# Same model as before, but the solver may run for up to 1000
# iterations (`max_iter = 1000`)
logreg_model_more_iter = Pipeline([
    ('ct', ct),
    ('logreg', LogisticRegression(max_iter = 1000,
                                  random_state = 138)),
])

In [48]:
# Fit, then report training-set accuracy
logreg_model_more_iter.fit(X_train, y_train).score(X_train, y_train)

0.7271722197095332

> Increasing `max_iter` from 100 to 1,000 solved the `ConvergenceWarning`; it also resulted in a *slightly* higher accuracy score: **~72.72%**

#### Simple Model 2 - new solver (`saga`)

In [49]:
# Solver switched from 'lbfgs' to 'saga'; `max_iter` is not passed,
# so it falls back to its default value
logreg_model_saga = Pipeline([
    ('ct', ct),
    ('logreg', LogisticRegression(solver = 'saga',
                                  random_state = 138)),
])

In [50]:
# Fit, then report training-set accuracy
logreg_model_saga.fit(X_train, y_train).score(X_train, y_train)



0.727021458364742

> Ever-so-slightly worse accuracy score (**~72.70%**) when the solver was changed from `lbfgs` to `saga`. But *still* getting that `ConvergenceWarning`.

#### Simple Model 3 - `elasticnet` penalty

In [51]:
# Elastic-net penalty (`l1_ratio = 0.6`) on the 'saga' solver.
# `max_iter` is cut to 10: little accuracy is sacrificed, and the
# shorter fit time helps when this model is reused in ensemble methods
logreg_model_saga_elasnet = Pipeline([
    ('ct', ct),
    ('logreg', LogisticRegression(solver = 'saga',
                                  penalty = 'elasticnet',
                                  l1_ratio = 0.6,
                                  max_iter = 10,
                                  random_state = 138)),
])

In [52]:
# Fit, then report training-set accuracy
logreg_model_saga_elasnet.fit(X_train, y_train).score(X_train, y_train)



0.7217699381878486

**~72.18%** with `max_iter = 10` (versus **~72.70%** at the default `max_iter`) - these scores aren't moving very much with manual tuning.

> Note: When we reduce `max_iter` to 10, our accuracy only drops to **~72.18%**. I will use this in the `StackingClassifier` later.

####  Simple Model 4 - Reduced regularization (`C`)

Let's try tuning *one* more hyperparameter in the `LogisticRegression` class before moving on to something else. We'll also drop `max_iter` back down to the default 100 to reduce the amount of processing time needed in future calculations.

In [53]:
# Weaker regularization: `C` is raised to 1e5 (a larger `C` means less
# regularization); elastic-net penalty and 'saga' solver as before
logreg_less_reg = Pipeline([
    ('ct', ct),
    ('logreg', LogisticRegression(solver = 'saga',
                                  penalty = 'elasticnet',
                                  l1_ratio = 0.6,
                                  C = 1e5,
                                  random_state = 138)),
])

In [54]:
# Fit, then report training-set accuracy
logreg_less_reg.fit(X_train, y_train).score(X_train, y_train)



0.7269712045831449

No better when the `C` value is set much higher (`C = 1e5`, versus the default of 1.0), i.e. with far weaker regularization - **~72.70%**.

We're capping out at around **~72.72% accuracy** with various logistic regression models using this set of features. We're also still getting the `ConvergenceWarning`.

### `RandomForestClassifier`

#### RFC Model 1

In [55]:
# First random forest: close to the default hyperparameters, starting
# small with trees capped at `max_depth = 10`
rfc_model_pipe = Pipeline([
    ('ct', ct),
    ('rfc', RandomForestClassifier(random_state = 138,
                                   max_depth = 10)),
])

In [56]:
# Fit, then report training-set accuracy
rfc_model_pipe.fit(X_train, y_train).score(X_train, y_train)

0.7623498668274787

> Accuracy score on the RFC model with near-default parameters (only `max_depth = 10` and the random seed are set) is around **76.23%**, already roughly 3.5 percentage points above our best logistic regression model.

#### RFC Model 2 - `max_features` to `sqrt`

In [57]:
# Second random forest: same depth cap and seed as the first, with
# `max_features` set explicitly to 'sqrt'
rfc_pipe_two = Pipeline([
    ('ct', ct),
    ('rfc', RandomForestClassifier(random_state = 138,
                                   max_depth = 10,
                                   max_features = 'sqrt')),
])

In [58]:
# Fit, then report training-set accuracy
rfc_pipe_two.fit(X_train, y_train).score(X_train, y_train)

0.7623498668274787

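> Identical accuracy score (76.23%) to our first RFC model, even after setting the `max_features` hyperparameter. This is expected: `'sqrt'` is already the default behavior for `RandomForestClassifier`, so both pipelines fit the same model.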

### `StackingClassifier`

#### SC Model 1

In [59]:
# Base learners for the stack, as (name, pipeline) pairs: the
# elastic-net logistic regression and the second random forest
estimators = [('logreg_model', logreg_model_saga_elasnet),
              ('rfc_model', rfc_pipe_two)]

# All other `StackingClassifier` settings are left at their defaults
sc_model_pipe = StackingClassifier(estimators = estimators)

In [60]:
# Fit the stacked model, then report training-set accuracy
sc_model_pipe.fit(X_train, y_train).score(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7882556912407659

Our third logistic regression model - `logreg_model_saga_elasnet` - and our second RFC model - `rfc_pipe_two` - yielded an accuracy rate of **~78.83%** when stacked, our best so far.

It also took **two minutes** to run.

### `ExtraTreesClassifier`

### `GradientBoostingClassifier`

### `XGBoost`

### `GridSearchCV`