In [69]:
#Utility packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Preprocessing packages
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

#Classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [33]:
data = pd.read_csv('water_potability.csv')

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [35]:
#So we know that there are 10 columns. All are numerical.
#The last one is Potability, our target value (1 or 0)
#We see that three of the columns are missing values. We will have to deal with that

In [36]:
data.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [37]:
#That gives us a sense of the distribution of the values.
#We see that some ranges are very narrow and some are very wide.
    #Hence we should use some normalization
#From the above two we know that the data has 3276 rows and 10 columns

In [38]:
data.dropna().shape[0]/data.shape[0]

0.6138583638583639

In [39]:
#Thus we know that simply dropping all of the rows that have a null in them might be problematic.
#We'd be left with only abot 61% of our data.
#So imputing values is a good idea, and we'll want a pipeline to test imputing methods.

In [40]:
#The first step in preprocessing the data is to split it up into training and testing sets.
#We want three sets - training, validation, and testing
#Ultimately the goal is 60% train, 20% validation, and 20% test
#So we will start by splitting off the 60% of training data
#And then we split the "not-train" set in half to generate the validation and testing sets.

In [41]:
train_set, val_set = train_test_split(data, test_size=.4, random_state=42)
valid_set, test_set = train_test_split(val_set, test_size = .5, random_state = 42)

In [42]:
#In order to fit any models we';ll have to separate the features (predictors) fromt he values in each set.
train_set_values = train_set.pop('Potability')
valid_set_values = valid_set.pop('Potability')
test_set_values = test_set.pop('Potability')

In [43]:
#And now we can run it through some machine learning classification algorithms to work out a relationship
    #that will give good prediction accuracy.

In [46]:
#Logistic Regression
#logistic pipe

full_pipe_log = Pipeline([
    ('imputer', SimpleImputer()), #there will, ultimately, be three options here.
        #mean, median, and node
    ('scaler', StandardScaler()), #this is statistical normalization (z - mu)/s.d.
    ('classifier', LogisticRegression())
])

param_grid = {
        'imputer__strategy': ["most_frequent","mean", "median"],
        'classifier__C': [10**10,10**9,10**8],
        'classifier__penalty': ['l1', 'l2']
        }

method_search = GridSearchCV(
    full_pipe_log
    , param_grid
    , cv = 5
    , scoring = 'neg_mean_squared_error'
    , verbose = 4
    , n_jobs = 6)
method_search.fit(train_set, train_set_values)
print(method_search.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:    7.9s


{'imputer__strategy': 'mean', 'regressor__C': 10000000000, 'regressor__penalty': 'l2'}


[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:    8.4s finished


In [47]:
#And then we can test its accuracy using the validation set
pred_values = method_search.predict(valid_set)
ct = np.array(pd.crosstab(pred_values, valid_set_values))
print(pred_values)
print(ct)
print(ct.trace()/ct.sum().sum())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [50]:
#So the logistic regression predicts everything is a zero ("not potable"), and that is correct in 63.97% of cases.

In [53]:
#What about a Support Vector Machine (SVM) Classifier/
full_pipe_SV = Pipeline([
    ('imputer', SimpleImputer()), #there will, ultimately, be three options here.
        #mean, median, and node
    ('scaler', StandardScaler()), #this is statistical normalization (z - mu)/s.d.
    ('classifier', SVC()) #will need to specify kernel type (linear, polynomial, RBF)
])
#Also specify regularization term C for how much error is OK
#Also specify gamma for tightness of fit
param_grid = {
        'imputer__strategy': ["most_frequent","mean", "median"],
        'classifier__C': [10**1,10**0,10**-1]
        }

method_search = GridSearchCV(
    full_pipe_SV
    , param_grid
    , cv = 5
    , scoring = 'neg_mean_squared_error'
    , verbose = 4
    , n_jobs = 6)
method_search.fit(train_set, train_set_values)
print(method_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:    4.5s


{'imputer__strategy': 'mean', 'regressor__C': 1}


[Parallel(n_jobs=6)]: Done  45 out of  45 | elapsed:    5.7s finished


In [54]:
#And we can check the performance by corsstabuation again
import numpy as np
pred_values = method_search.predict(valid_set)
ct = np.array(pd.crosstab(pred_values, valid_set_values))
print(pred_values)
print(ct)
print(ct.trace()/ct.sum().sum())

[1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1
 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 

In [55]:
#Not everything is zero anymore.
#Accuracy is still only 69.62% That's only about 5% better than the logistic regression.

In [57]:
#What about K Nearest Neighbors? Maybe that would give better results?
full_pipe_KNN = Pipeline([
    ('imputer', SimpleImputer()), #there will, ultimately, be three options here.
        #mean, median, and node
    ('scaler', StandardScaler()), #this is statistical normalization (z - mu)/s.d.
    ('classifier', KNeighborsClassifier())  #need to specify number of neighbors. Do this in the grid search
])

param_grid = {
        'imputer__strategy': ["most_frequent","mean", "median"],
        'classifier__n_neighbors': [3,4,5],
        'classifier__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'classifier__weights':['uniform','distance'],
        'classifier__leaf_size':[1,2,3,4,5],
        'classifier__p':[1,2],
        }

method_search = GridSearchCV(
    full_pipe_KNN
    , param_grid
    , cv = 5
    , scoring = 'neg_mean_squared_error'
    , verbose = 4
    , n_jobs = 6)
method_search.fit(train_set, train_set_values)
print(method_search.best_params_)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  14 tasks      | elapsed:    1.7s
[Parallel(n_jobs=6)]: Done 108 tasks      | elapsed:    5.1s
[Parallel(n_jobs=6)]: Done 600 tasks      | elapsed:   14.5s
[Parallel(n_jobs=6)]: Done 1284 tasks      | elapsed:   24.0s
[Parallel(n_jobs=6)]: Done 2168 tasks      | elapsed:   38.0s


{'imputer__strategy': 'median', 'regressor__algorithm': 'ball_tree', 'regressor__leaf_size': 1, 'regressor__n_neighbors': 4, 'regressor__p': 2, 'regressor__weights': 'uniform'}


[Parallel(n_jobs=6)]: Done 2700 out of 2700 | elapsed:   50.1s finished


In [58]:
pred_values = method_search.predict(valid_set)
ct = np.array(pd.crosstab(pred_values, valid_set_values))
print(pred_values)
print(ct)
print(ct.trace()/ct.sum().sum())

[0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1
 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [59]:
#That's worse than the Support Vector Classifier
#66.26%

In [61]:
#Let's try a random forest classifier
full_pipe_RF = Pipeline([
    ('imputer', SimpleImputer()), #there will, ultimately, be three options here.
        #mean, median, and node
    ('scaler', StandardScaler()), #this is statistical normalization (z - mu)/s.d.
    ('classifier', RandomForestClassifier())  
])
#max depth

param_grid = {
        'imputer__strategy': ["most_frequent","mean", "median"],
        'classifier__n_estimators': [800,1000,1200],
        'classifier__min_samples_leaf':[1,2],
        'classifier__criterion': ['gini', 'entropy']
        }


method_search = GridSearchCV(
    full_pipe_RF
    , param_grid
    , cv = 5
    , scoring = 'neg_mean_squared_error'
    , verbose = 4
    , n_jobs = 6)
method_search.fit(train_set, train_set_values)
print(method_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   24.9s
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  2.8min
[Parallel(n_jobs=6)]: Done 180 out of 180 | elapsed:  6.8min finished


{'imputer__strategy': 'median', 'regressor__criterion': 'gini', 'regressor__min_samples_leaf': 1, 'regressor__n_estimators': 1200}


In [63]:
pred_values = method_search.predict(valid_set)
ct = np.array(pd.crosstab(pred_values, valid_set_values))
print(pred_values)
print(ct)
print(ct.trace()/ct.sum().sum())

[0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 1
 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 

In [64]:
#69.31%
#So random forest is the best I've see so far. Still less than 70 percent.

In [66]:
#How about XGBoost?
full_pipe_XGB = Pipeline([
    ('imputer', SimpleImputer()), #there will, ultimately, be three options here.
        #mean, median, and node
    ('scaler', StandardScaler()), #this is statistical normalization (z - mu)/s.d.
    ('classifier', XGBClassifier())  
])
#max depth

param_grid = {
        'imputer__strategy': ["most_frequent","mean", "median"],
        'classifier__learning_rate': [.07,.05,.03],
        'classifier__max_depth': [30,35,40],
        'classifier__n_estimators': [40,50,60]
        }


method_search = GridSearchCV(
    full_pipe_XGB
    , param_grid
    , cv = 5
    , scoring = 'neg_mean_squared_error'
    , verbose = 4
    , n_jobs = 6)
method_search.fit(train_set, train_set_values)
print(method_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:    3.7s
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:   29.3s
[Parallel(n_jobs=6)]: Done 209 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 380 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done 405 out of 405 | elapsed:  2.4min finished


{'imputer__strategy': 'mean', 'regressor__learning_rate': 0.05, 'regressor__max_depth': 35, 'regressor__n_estimators': 50}


In [67]:
pred_values = method_search.predict(valid_set)
ct = np.array(pd.crosstab(pred_values, valid_set_values))
print(pred_values)
print(ct)
print(ct.trace()/ct.sum().sum())

[0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 0 1
 0 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0
 0 1 0 1 1 0 0 0 0 0 1 1 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 0
 0 1 1 0 1 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0
 1 0 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 

In [None]:
#68.55%. No improvement.

In [75]:
#Does Light GBM boost do any better?
full_pipe_LGB = Pipeline([
    ('imputer', SimpleImputer()), #there will, ultimately, be three options here.
        #mean, median, and node
    ('scaler', StandardScaler()), #this is statistical normalization (z - mu)/s.d.
    ('classifier', LGBMClassifier())  
])
#max depth

param_grid = {
        'imputer__strategy': ["most_frequent","mean", "median"],
        'classifier__num_leaves': [10,30],
        'classifier__max_depth': [16,32],
        'classifier__learning_rate': [.01,.05],
        'classifier__n_estimators': [20,50],
        'classifier__reg_alpha': [0,.05,.1],
        'classifier__reg_lambda': [0,.05,.1],
        }


method_search = GridSearchCV(
    full_pipe_LGB
    , param_grid
    , cv = 5
    , scoring = 'neg_mean_squared_error'
    , verbose = 4
    , n_jobs = 6)
method_search.fit(train_set, train_set_values)
print(method_search.best_params_)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  14 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done 260 tasks      | elapsed:    3.0s
[Parallel(n_jobs=6)]: Done 752 tasks      | elapsed:    9.5s
[Parallel(n_jobs=6)]: Done 1178 tasks      | elapsed:   21.8s
[Parallel(n_jobs=6)]: Done 1828 tasks      | elapsed:   32.0s


{'classifier__learning_rate': 0.05, 'classifier__max_depth': 16, 'classifier__n_estimators': 50, 'classifier__num_leaves': 30, 'classifier__reg_alpha': 0.1, 'classifier__reg_lambda': 0.1, 'imputer__strategy': 'median'}


[Parallel(n_jobs=6)]: Done 2160 out of 2160 | elapsed:   39.6s finished


In [None]:
#{'classifier__learning_rate': 0.05, 
    #'classifier__max_depth': 16, 
    #'classifier__n_estimators': 50, 
    #'classifier__num_leaves': 30, 
    #'classifier__reg_alpha': 0.1, 
    #'classifier__reg_lambda': 0.1, 
    #'imputer__strategy': 'median'}
#Even with the speed of LBGMClassifier this takes a while if the parameter grid is too large.
#Alternatively that means that I can test more parameter sets.
#But maybe I should just start using optuna all the time and give up on gridsearch

In [76]:
pred_values = method_search.predict(valid_set)
ct = np.array(pd.crosstab(pred_values, valid_set_values))
print(pred_values)
print(ct)
print(ct.trace()/ct.sum().sum())

[0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 1
 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0
 1 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 1 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
#68.70% Marginally better than XGBoost.
#SVC is still best.