In [1]:
from google.colab import drive  
drive._mount('/content/drive')  

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# importing important libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier



In [3]:
x = pd.read_csv("/content/drive/MyDrive/Covid Prediction AI Project/X.csv")
y = pd.read_csv("/content/drive/MyDrive/Covid Prediction AI Project/Y.csv")

In [4]:
x

Unnamed: 0.1,Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,Female,Male,Age above 60,Abroad,Contact with confirmed
0,0,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,0
2,2,0,1,0,0,0,0,1,0,0,0
3,3,1,0,0,0,0,1,0,0,0,0
4,4,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
520011,520011,1,0,0,0,0,0,1,0,0,1
520012,520012,1,0,0,0,1,1,0,0,0,0
520013,520013,0,0,0,0,0,0,1,0,0,0
520014,520014,0,1,0,0,0,0,1,0,0,0


In [5]:
y

Unnamed: 0.1,Unnamed: 0,Result
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
520011,520011,1
520012,520012,1
520013,520013,1
520014,520014,1


In [6]:
x.drop(columns=['Unnamed: 0'], inplace=True)
y.drop(columns=['Unnamed: 0'], inplace=True)

## Splitting the Dataset into Training, Validation and Testing Data.

In [7]:
covid_df = pd.concat([x,y],axis=1)
covid_df

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,Female,Male,Age above 60,Abroad,Contact with confirmed,Result
0,0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
520011,1,0,0,0,0,0,1,0,0,1,1
520012,1,0,0,0,1,1,0,0,0,0,1
520013,0,0,0,0,0,0,1,0,0,0,1
520014,0,1,0,0,0,0,1,0,0,0,1


In [8]:
train_df,valid_df = train_test_split(covid_df,test_size=0.20,random_state=1)
train_df,test_df = train_test_split(train_df,test_size=0.10,random_state=1)

In [9]:
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

(374410, 11)
(104004, 11)
(41602, 11)


In [10]:
train_df = train_df.reset_index()
valid_df = valid_df.reset_index()
test_df = test_df.reset_index()

In [11]:
train_df.drop(columns=['index'],inplace=True)
valid_df.drop(columns=['index'],inplace=True)
test_df.drop(columns=['index'],inplace=True)

In [12]:
independent_cols = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache',
                    'Female', 'Male', 'Age above 60', 'Abroad', 'Contact with confirmed']

target_col = ['Result']

In [13]:
x_train = train_df[independent_cols].copy()
y_train = train_df[target_col].copy()

x_valid = valid_df[independent_cols].copy()
y_valid = valid_df[target_col].copy()

x_test = test_df[independent_cols].copy()
y_test = test_df[target_col].copy()

## HyperParameter Tuning for RandomForest Classifier
The main parameters used by a Random Forest Classifier are:
- criterion = the function used to evaluate the quality of a split.
- max_depth = maximum number of levels allowed in each tree.
- max_features = maximum number of features considered when splitting a node.
- min_samples_leaf = minimum number of samples which can be stored in a tree leaf.
- min_samples_split = minimum number of samples necessary in a node to cause node splitting.
- n_estimators = number of trees in the ensemble.

### 1. HyperParameter Tuning - Randomized Search CV.

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:
rf=RandomForestClassifier()
skfold=StratifiedKFold(n_splits=3)
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=30,cv=skfold, verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(x_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 65.8min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 130.3min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
                   error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_sp...
                   pa

In [None]:
rf_randomcv.best_params_

{'criterion': 'gini',
 'max_depth': 340,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 14,
 'n_estimators': 2000}

In [None]:
best_random_grid=rf_randomcv.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
y_pred=best_random_grid.predict(x_valid)
print(confusion_matrix(y_valid,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_valid,y_pred)))
print("Classification report: {}".format(classification_report(y_valid,y_pred)))

[[47345  4718]
 [10581 41360]]
Accuracy Score 0.8528998884658282
Classification report:               precision    recall  f1-score   support

           0       0.82      0.91      0.86     52063
           1       0.90      0.80      0.84     51941

    accuracy                           0.85    104004
   macro avg       0.86      0.85      0.85    104004
weighted avg       0.86      0.85      0.85    104004



In [None]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [None]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [None]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, x_train, y_train, cv = skfold).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

  0%|          | 0/80 [00:00<?, ?it/s, best loss: ?]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  1%|▏         | 1/80 [03:34<4:42:27, 214.52s/it, best loss: -0.5300552838608027]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  2%|▎         | 2/80 [05:19<3:14:55, 149.94s/it, best loss: -0.6564301153067627]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  4%|▍         | 3/80 [05:25<1:48:12, 84.32s/it, best loss: -0.6564301153067627] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  5%|▌         | 4/80 [07:48<2:16:20, 107.64s/it, best loss: -0.6564301153067627]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  6%|▋         | 5/80 [10:47<2:46:30, 133.21s/it, best loss: -0.6564301153067627]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  8%|▊         | 6/80 [13:21<2:53:13, 140.46s/it, best loss: -0.6564301153067627]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



  9%|▉         | 7/80 [13:23<1:55:36, 95.02s/it, best loss: -0.6564301153067627] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 10%|█         | 8/80 [15:45<2:12:11, 110.16s/it, best loss: -0.6564301153067627]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 11%|█▏        | 9/80 [18:44<2:35:30, 131.41s/it, best loss: -0.6564301153067627]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 12%|█▎        | 10/80 [23:26<3:27:49, 178.14s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 14%|█▍        | 11/80 [26:56<3:36:02, 187.86s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 15%|█▌        | 12/80 [27:32<2:40:20, 141.48s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 16%|█▋        | 13/80 [27:33<1:50:37, 99.06s/it, best loss: -0.8322453913080935] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 18%|█▊        | 14/80 [27:35<1:16:41, 69.71s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 19%|█▉        | 15/80 [30:09<1:43:01, 95.09s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 20%|██        | 16/80 [30:10<1:11:22, 66.91s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 21%|██▏       | 17/80 [34:39<2:13:46, 127.41s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 22%|██▎       | 18/80 [36:31<2:07:02, 122.95s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 24%|██▍       | 19/80 [38:52<2:10:23, 128.26s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 25%|██▌       | 20/80 [40:21<1:56:25, 116.42s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 26%|██▋       | 21/80 [42:53<2:05:06, 127.23s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 28%|██▊       | 22/80 [44:58<2:02:16, 126.49s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 29%|██▉       | 23/80 [47:26<2:06:15, 132.90s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 30%|███       | 24/80 [48:22<1:42:40, 110.00s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 31%|███▏      | 25/80 [52:08<2:12:49, 144.89s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 32%|███▎      | 26/80 [52:15<1:32:56, 103.27s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 34%|███▍      | 27/80 [55:35<1:56:53, 132.32s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 35%|███▌      | 28/80 [57:16<1:46:28, 122.86s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 36%|███▋      | 29/80 [1:01:21<2:15:36, 159.54s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 38%|███▊      | 30/80 [1:02:48<1:54:58, 137.97s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 39%|███▉      | 31/80 [1:03:36<1:30:26, 110.75s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 40%|████      | 32/80 [1:03:43<1:03:42, 79.63s/it, best loss: -0.8322453913080935] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 41%|████▏     | 33/80 [1:05:53<1:14:24, 95.00s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 42%|████▎     | 34/80 [1:05:59<52:22, 68.31s/it, best loss: -0.8322453913080935]  

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 44%|████▍     | 35/80 [1:09:00<1:16:32, 102.06s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 45%|████▌     | 36/80 [1:11:39<1:27:19, 119.08s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 46%|████▋     | 37/80 [1:13:22<1:21:52, 114.24s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 48%|████▊     | 38/80 [1:16:14<1:32:11, 131.71s/it, best loss: -0.8322453913080935]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 49%|████▉     | 39/80 [1:17:55<1:23:31, 122.23s/it, best loss: -0.838431132087098] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 50%|█████     | 40/80 [1:18:47<1:07:30, 101.26s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 51%|█████▏    | 41/80 [1:19:53<58:57, 90.70s/it, best loss: -0.838431132087098]   

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 52%|█████▎    | 42/80 [1:20:58<52:31, 82.93s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 54%|█████▍    | 43/80 [1:22:26<52:02, 84.40s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 55%|█████▌    | 44/80 [1:27:58<1:35:17, 158.82s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 56%|█████▋    | 45/80 [1:30:18<1:29:15, 153.03s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 57%|█████▊    | 46/80 [1:30:20<1:01:03, 107.74s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 59%|█████▉    | 47/80 [1:31:26<52:22, 95.24s/it, best loss: -0.838431132087098]   

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 60%|██████    | 48/80 [1:35:09<1:11:12, 133.53s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 61%|██████▏   | 49/80 [1:38:02<1:15:13, 145.60s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 62%|██████▎   | 50/80 [1:38:08<51:52, 103.74s/it, best loss: -0.838431132087098]  

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 64%|██████▍   | 51/80 [1:38:11<35:25, 73.29s/it, best loss: -0.838431132087098] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 65%|██████▌   | 52/80 [1:39:03<31:12, 66.88s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 66%|██████▋   | 53/80 [1:42:29<48:55, 108.72s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 68%|██████▊   | 54/80 [1:45:00<52:35, 121.36s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 69%|██████▉   | 55/80 [1:45:45<41:01, 98.44s/it, best loss: -0.838431132087098] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 70%|███████   | 56/80 [1:50:08<59:09, 147.90s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 71%|███████▏  | 57/80 [1:52:39<56:59, 148.66s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 72%|███████▎  | 58/80 [1:52:40<38:19, 104.54s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 74%|███████▍  | 59/80 [1:52:48<26:28, 75.63s/it, best loss: -0.838431132087098] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 75%|███████▌  | 60/80 [1:55:18<32:35, 97.77s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 76%|███████▋  | 61/80 [1:58:48<41:41, 131.66s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 78%|███████▊  | 62/80 [1:59:30<31:23, 104.66s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 79%|███████▉  | 63/80 [2:03:30<41:08, 145.18s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 80%|████████  | 64/80 [2:07:20<45:28, 170.56s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 81%|████████▏ | 65/80 [2:07:21<29:58, 119.89s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 82%|████████▎ | 66/80 [2:09:42<29:24, 126.03s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 84%|████████▍ | 67/80 [2:11:50<27:27, 126.72s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 85%|████████▌ | 68/80 [2:13:57<25:21, 126.79s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 86%|████████▋ | 69/80 [2:16:03<23:11, 126.53s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 88%|████████▊ | 70/80 [2:18:01<20:39, 123.93s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 89%|████████▉ | 71/80 [2:18:52<15:18, 102.06s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 90%|█████████ | 72/80 [2:18:59<09:49, 73.75s/it, best loss: -0.838431132087098] 

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 91%|█████████▏| 73/80 [2:21:03<10:20, 88.59s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 92%|█████████▎| 74/80 [2:24:35<12:33, 125.63s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 94%|█████████▍| 75/80 [2:26:24<10:03, 120.76s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 95%|█████████▌| 76/80 [2:28:43<08:25, 126.35s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 96%|█████████▋| 77/80 [2:29:59<05:32, 110.99s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 98%|█████████▊| 78/80 [2:33:09<04:29, 134.75s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



 99%|█████████▉| 79/80 [2:34:54<02:05, 125.90s/it, best loss: -0.838431132087098]

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)

  estimator.fit(X_train, y_train, **fit_params)



100%|██████████| 80/80 [2:39:13<00:00, 119.42s/it, best loss: -0.838431132087098]


{'criterion': 1,
 'max_depth': 1030.0,
 'max_features': 3,
 'min_samples_leaf': 0.0007591618968705617,
 'min_samples_split': 0.21837298391834725,
 'n_estimators': 2}

In [None]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
None
300


In [None]:
best['min_samples_leaf']

0.0007591618968705617

In [None]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(x_train,y_train)
predictionforest = trainedforest.predict(x_valid)
print(confusion_matrix(y_valid,predictionforest))
print(accuracy_score(y_valid,predictionforest))
print(classification_report(y_valid,predictionforest))
acc5 = accuracy_score(y_valid,predictionforest)

  """


[[43223  8840]
 [ 8166 43775]]
0.8364870581900696
              precision    recall  f1-score   support

           0       0.84      0.83      0.84     52063
           1       0.83      0.84      0.84     51941

    accuracy                           0.84    104004
   macro avg       0.84      0.84      0.84    104004
weighted avg       0.84      0.84      0.84    104004



## 3. HyperParameter Tuning - Genetic Algorithms

In [15]:
!pip install TPOT

Collecting TPOT
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[?25l[K     |███▊                            | 10 kB 20.4 MB/s eta 0:00:01[K     |███████▌                        | 20 kB 24.6 MB/s eta 0:00:01[K     |███████████▎                    | 30 kB 24.2 MB/s eta 0:00:01[K     |███████████████                 | 40 kB 26.4 MB/s eta 0:00:01[K     |██████████████████▉             | 51 kB 29.3 MB/s eta 0:00:01[K     |██████████████████████▋         | 61 kB 31.6 MB/s eta 0:00:01[K     |██████████████████████████▎     | 71 kB 33.0 MB/s eta 0:00:01[K     |██████████████████████████████  | 81 kB 35.1 MB/s eta 0:00:01[K     |████████████████████████████████| 87 kB 6.4 MB/s 
[?25hCollecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting deap>=1.2
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K     |████████████████████████████████| 160 kB 47.0 MB/s 
Collecting up

In [14]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:
tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 3, scoring = 'accuracy')
tpot_classifier.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


Optimization Progress:   0%|          | 0/84 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8536898044783531

Generation 2 - Current best internal CV score: 0.8536898044783531

Generation 3 - Current best internal CV score: 0.8536898044783531

Generation 4 - Current best internal CV score: 0.8536898044783531


TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.
