In [1]:
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from stephentools import get_file
from stephentools import data_d3
from stephentools import analyse_feature_importance
from stephentools import visualize_decision_tree
from sklearn.preprocessing import StandardScaler

In [2]:
df = get_file()
df,X,y,X_train, X_test, y_train, y_test=data_d3()
rs=42

In [3]:
model_2 = MLPClassifier(max_iter=250, random_state=rs)
model_2.fit(X_train, y_train)

print("Train accuracy:", model_2.score(X_train, y_train))
print("Test accuracy:", model_2.score(X_test, y_test))

y_pred = model_2.predict(X_test)
print(classification_report(y_test, y_pred))

print(model_2)

Train accuracy: 0.8616317530319736
Test accuracy: 0.799055967633176
              precision    recall  f1-score   support

       False       0.79      0.92      0.85       940
        True       0.82      0.58      0.68       543

    accuracy                           0.80      1483
   macro avg       0.80      0.75      0.77      1483
weighted avg       0.80      0.80      0.79      1483

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=250, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)


In [4]:
model_z = MLPClassifier(hidden_layer_sizes=165, max_iter=200, solver='adam', random_state=42, activation='tanh',alpha= 0.05)
model_z.fit(X_train, y_train)

print("Train accuracy:", model_z.score(X_train, y_train))
print("Test accuracy:", model_z.score(X_test, y_test))

y_pred = model_z.predict(X_test)
print(classification_report(y_test, y_pred))

print(model_z)

Train accuracy: 0.8748621830209482
Test accuracy: 0.8091706001348618
              precision    recall  f1-score   support

       False       0.82      0.89      0.86       940
        True       0.78      0.66      0.72       543

    accuracy                           0.81      1483
   macro avg       0.80      0.78      0.79      1483
weighted avg       0.81      0.81      0.81      1483

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=165, learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)




In [17]:
params = {'hidden_layer_sizes': [(x,) for x in range(1,20, 1)],'activation': ['tanh'],
    'solver': ['adam'], 'alpha':[0.05]}

cv_sel_model = GridSearchCV(param_grid=params, estimator=MLPClassifier(random_state=rs),return_train_score=True, cv=10, n_jobs=-1)
cv_sel_model.fit(X_train_sel_model, y_train)

print("Train accuracy:", cv_sel_model.score(X_train_sel_model, y_train))
print("Test accuracy:", cv_sel_model.score(X_test_sel_model, y_test))

y_pred = cv_sel_model.predict(X_test_sel_model)
print(classification_report(y_test, y_pred))

print(cv_sel_model.best_params_)

Train accuracy: 0.804851157662624
Test accuracy: 0.7929871881321645
              precision    recall  f1-score   support

       False       0.82      0.86      0.84       940
        True       0.73      0.68      0.71       543

    accuracy                           0.79      1483
   macro avg       0.78      0.77      0.77      1483
weighted avg       0.79      0.79      0.79      1483

{'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (16,), 'solver': 'adam'}


In [12]:
import pickle
with open('DT.pickle', 'rb') as f:
    dt_best,roc_index_dt_best, fpr_dt_best, tpr_dt_best = pickle.load(f)

print(dt_best)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=17,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


In [13]:
analyse_feature_importance(dt_best, X.columns)

covid19_symptoms : 0.3441404242821123
income_med : 0.18735999132581302
worried : 0.10684367064060209
working_travel critical : 0.05893549513836796
health_worker : 0.029681197797928313
house_count : 0.028700386981849273
insurance : 0.023567387237388056
risk_mortality : 0.022105521176654413
race_white : 0.021172434763189578
weight : 0.019046383166260582
contacts_count : 0.017111972739106446
covid19_contact : 0.01633985712469371
height : 0.015814759630596104
age_70_80 : 0.011635449823777757
age_60_70 : 0.01103832427484473
immigrant : 0.010870020919422537
country_BR : 0.010800754351478778
age_20_30 : 0.00917021340207108
country_US : 0.008079498762168525
age_40_50 : 0.0077680328706121425


In [14]:
from sklearn.feature_selection import SelectFromModel

selectmodel = SelectFromModel(dt_best, prefit=True)
X_train_sel_model = selectmodel.transform(X_train)
X_test_sel_model = selectmodel.transform(X_test)

print(X_train_sel_model.shape)

(3628, 21)


In [18]:
model_z = MLPClassifier(hidden_layer_sizes=16, max_iter=200, solver='adam', random_state=42, activation='tanh',alpha= 0.05)
cv_sel_model.fit(X_train_sel_model, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=200,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_st...
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'activation': ['tanh'], 'alpha': [0.05],
                         'hidden_layer_sizes': [(1,), (2,), (3,), (4,), (5,),
       

In [19]:
print("Train accuracy:", cv_sel_model.score(X_train_sel_model, y_train))
print("Test accuracy:", cv_sel_model.score(X_test_sel_model, y_test))

y_pred = cv_sel_model.predict(X_test_sel_model)
print(classification_report(y_test, y_pred))

Train accuracy: 0.804851157662624
Test accuracy: 0.7929871881321645
              precision    recall  f1-score   support

       False       0.82      0.86      0.84       940
        True       0.73      0.68      0.71       543

    accuracy                           0.79      1483
   macro avg       0.78      0.77      0.77      1483
weighted avg       0.79      0.79      0.79      1483



In [20]:
X_train_sel_model

array([[146, 56, False, ..., 1, 1, 0],
       [192, 100, True, ..., 1, 1, 0],
       [168, 68, True, ..., 1, 0, 0],
       ...,
       [166, 80, True, ..., 1, 1, 0],
       [178, 52, False, ..., 1, 0, 0],
       [170, 68, True, ..., 0, 0, 1]], dtype=object)

In [21]:
model_z = MLPClassifier(hidden_layer_sizes=165, max_iter=1000, solver='adam', random_state=42, activation='tanh',alpha= 0.05)
model_z.fit(X_train, y_train)

print("Train accuracy:", model_z.score(X_train, y_train))
print("Test accuracy:", model_z.score(X_test, y_test))

y_pred = model_z.predict(X_test)
print(classification_report(y_test, y_pred))

print(model_z)

Train accuracy: 0.863561190738699
Test accuracy: 0.7936614969656103
              precision    recall  f1-score   support

       False       0.78      0.94      0.85       940
        True       0.85      0.53      0.65       543

    accuracy                           0.79      1483
   macro avg       0.81      0.74      0.75      1483
weighted avg       0.80      0.79      0.78      1483

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=165, learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)


In [34]:
y_pred

array([ True,  True, False, ..., False, False,  True])

In [7]:
final=pd.DataFrame(y_pred)

In [36]:
df

Unnamed: 0,height,weight,insurance,immigrant,contacts_count,house_count,public_transport_count,worried,covid19_positive,covid19_symptoms,...,smoking_quit5,smoking_vape,smoking_yesheavy,smoking_yeslight,smoking_yesmedium,working_home,working_never,working_stopped,working_travel critical,working_travel non critical
0,184,80,True,True,8,5,0,4,True,False,...,0,0,0,0,0,0,0,1,0,0
1,164,84,True,True,8,1,0,4,True,False,...,0,0,0,0,0,0,0,1,0,0
2,170,64,True,True,10,1,0,4,True,False,...,0,0,0,0,0,0,0,1,0,0
3,132,124,True,True,8,10,0,4,True,False,...,1,0,0,0,0,0,0,1,0,0
4,184,118,True,True,8,3,0,4,True,True,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5106,166,66,True,True,1,2,0,4,False,False,...,0,0,0,0,0,0,1,0,0,0
5107,168,70,True,False,2,3,0,4,False,False,...,0,0,0,0,0,0,1,0,0,0
5108,182,82,True,True,3,3,0,4,False,False,...,0,0,0,0,0,0,1,0,0,0
5109,184,80,True,True,0,3,0,3,False,False,...,0,0,0,0,0,0,0,1,0,0


In [42]:
final

Unnamed: 0,0
0,True
1,True
2,False
3,False
4,True
...,...
1478,True
1479,False
1480,False
1481,False


In [41]:
X

Unnamed: 0,height,weight,insurance,immigrant,contacts_count,house_count,public_transport_count,worried,covid19_symptoms,covid19_contact,...,smoking_quit5,smoking_vape,smoking_yesheavy,smoking_yeslight,smoking_yesmedium,working_home,working_never,working_stopped,working_travel critical,working_travel non critical
0,184,80,True,True,8,5,0,4,False,False,...,0,0,0,0,0,0,0,1,0,0
1,164,84,True,True,8,1,0,4,False,False,...,0,0,0,0,0,0,0,1,0,0
2,170,64,True,True,10,1,0,4,False,False,...,0,0,0,0,0,0,0,1,0,0
3,132,124,True,True,8,10,0,4,False,False,...,1,0,0,0,0,0,0,1,0,0
4,184,118,True,True,8,3,0,4,True,True,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5106,166,66,True,True,1,2,0,4,False,False,...,0,0,0,0,0,0,1,0,0,0
5107,168,70,True,False,2,3,0,4,False,False,...,0,0,0,0,0,0,1,0,0,0
5108,182,82,True,True,3,3,0,4,False,False,...,0,0,0,0,0,0,1,0,0,0
5109,184,80,True,True,0,3,0,3,False,False,...,0,0,0,0,0,0,0,1,0,0


In [5]:
o = df.drop('covid19_positive',axis=1)
v = df['covid19_positive']
from sklearn.model_selection import train_test_split
Big,Test, real, actual =train_test_split(o, v, test_size=0.29, stratify=y, random_state=rs) 

In [11]:
Test['Real'] = actual
Test['Prediction']=final


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
Test

Unnamed: 0,level_0,index,height,weight,insurance,immigrant,contacts_count,house_count,public_transport_count,worried,...,smoking_yesheavy,smoking_yeslight,smoking_yesmedium,working_home,working_never,working_stopped,working_travel critical,working_travel non critical,Real,Prediction
0,0,1512,158,98,True,True,4,2,0,4,...,0,0,0,0,0,0,1,0,True,True
1,1,1071,170,70,True,True,5,3,1,3,...,0,0,0,0,1,0,0,0,,True
2,2,5047,174,94,True,True,5,3,0,5,...,0,0,0,0,1,0,0,0,,False
3,3,2921,184,106,True,False,7,4,0,3,...,0,0,0,0,0,1,0,0,,False
4,4,266,154,130,True,True,4,5,0,4,...,0,0,0,0,0,0,1,0,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478,1478,1399,148,92,True,True,4,2,0,4,...,0,0,0,0,0,1,0,0,,True
1479,1479,374,188,114,True,True,6,2,0,4,...,0,0,0,0,1,0,0,0,True,False
1480,1480,2946,190,98,False,True,4,2,1,2,...,0,0,0,0,1,0,0,0,True,False
1481,1481,3765,170,84,True,True,3,3,0,3,...,0,0,0,0,0,0,1,0,True,True


In [13]:
Test=Test[Test['Prediction']==True]

In [14]:
Test.to_excel('NN_Generalization.xlsx','r')

In [15]:
Test

Unnamed: 0,level_0,index,height,weight,insurance,immigrant,contacts_count,house_count,public_transport_count,worried,...,smoking_yesheavy,smoking_yeslight,smoking_yesmedium,working_home,working_never,working_stopped,working_travel critical,working_travel non critical,Real,Prediction
0,0,1512,158,98,True,True,4,2,0,4,...,0,0,0,0,0,0,1,0,True,True
1,1,1071,170,70,True,True,5,3,1,3,...,0,0,0,0,1,0,0,0,,True
4,4,266,154,130,True,True,4,5,0,4,...,0,0,0,0,0,0,1,0,,True
5,5,1662,174,92,True,True,2,2,0,4,...,0,0,0,0,1,0,0,0,,True
7,7,1207,164,60,True,True,3,4,0,4,...,0,0,0,0,0,1,0,0,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,1474,470,170,92,True,True,10,11,0,4,...,0,0,0,0,0,1,0,0,True,True
1477,1477,1570,188,112,True,True,21,11,0,4,...,0,0,0,0,0,0,1,0,,True
1478,1478,1399,148,92,True,True,4,2,0,4,...,0,0,0,0,0,1,0,0,,True
1481,1481,3765,170,84,True,True,3,3,0,3,...,0,0,0,0,0,0,1,0,True,True


In [26]:
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
sns.displot(Test['height'])

AttributeError: module 'seaborn' has no attribute 'displot'

In [28]:
sns.histplot(x='height',data=Test)

AttributeError: module 'seaborn' has no attribute 'histplot'