In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate


%matplotlib inline

In [2]:
names = ['index',
         'location','country','gender',
        'age','vis_wuhan','from_wuhan',
        'symptom1','symptom2','symptom3',
        'symptom4', 'symptom5', 'symptom6', 'diff_sym_hos','result']
df=pd.read_csv("data.csv",index_col=0,skiprows=1,names=names)

In [3]:
Y=df.result.values
X=df[['gender',
        'age','vis_wuhan','from_wuhan',
        'symptom1','symptom2','symptom3',
        'symptom4', 'symptom5', 'symptom6', 'diff_sym_hos']]


In [4]:
xTrain, xTest, yTrain, yTest=train_test_split(X,Y,test_size=0.1,random_state=3, shuffle=True)
print(len(xTrain))
print(len(X))

776
863


In [5]:
# Use Kfolds to split the data into 10 folds
x_train_data_sets=list()
y_train_data_sets=list()
kf=KFold(n_splits=5, random_state=3, shuffle=True)
for train_index,trainset_index in kf.split(xTrain,yTrain):
    X_train, X_trainset = xTrain.iloc[train_index], xTrain.iloc[trainset_index]
    x_train_data_sets.append(X_trainset)
    y_train, y_trainset = yTrain[train_index], yTrain[trainset_index]
    y_train_data_sets.append(y_trainset)
x_train_data_sets



[       gender   age  vis_wuhan  from_wuhan  symptom1  symptom2  symptom3  \
 index                                                                      
 764         0  49.4          0           0        14        31        19   
 195         0  45.0          1           1         6         7        19   
 58          0  32.0          1           0        14        31        19   
 535         1  70.0          0           0        14        31        19   
 842         1  49.4          0           0        14        31        19   
 ...       ...   ...        ...         ...       ...       ...       ...   
 622         2  49.4          0           0        14        31        19   
 581         1  60.0          0           1        14        31        19   
 423         0  46.0          0           0        14        31        19   
 119         1  44.0          1           0        14        31        19   
 643         2  49.4          0           0        14        31        19   

# KNN

In [6]:
acc = []
for i in range(5):
    print("TRAINING SET ", i)
    x = x_train_data_sets[i]  # transposing it
    y = y_train_data_sets[i]
    acc = []
    for i in range(1,27):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(x,y)
        y_pred = knn.predict(xTest)
        acc.append(metrics.accuracy_score(yTest,y_pred))
    print(max(acc)," at knn ",(acc.index(max(acc))+1))
   

TRAINING SET  0
0.9540229885057471  at knn  1
TRAINING SET  1
0.9425287356321839  at knn  3
TRAINING SET  2
0.9310344827586207  at knn  2
TRAINING SET  3
0.9310344827586207  at knn  1
TRAINING SET  4
0.9310344827586207  at knn  9


In [11]:
# the Best acc at k = 1

knn_optimal= KNeighborsClassifier(n_neighbors=17)
knn_optimal.fit(x_train_data_sets[0],y_train_data_sets[0])
y_pred = knn_optimal.predict(xTest)



In [12]:
knn_confusion_metrics = confusion_matrix(yTest,y_pred)

In [13]:
knn_confusion_metrics

array([[76,  3],
       [ 6,  2]], dtype=int64)

In [14]:
f1_score(yTest,y_pred)

0.3076923076923077

# Naive

In [24]:
from sklearn.naive_bayes import GaussianNB
acc=[]
gb=GaussianNB()
cross=cross_validate(gb,xTrain,yTrain,cv=8,return_estimator=True)
cross['estimator'][0]

GaussianNB()

In [None]:
#Looking on the 
x_train_s=pd.concat([xTrain[ :int((5*len(xTrain))/8) ] , xTrain[ int((6*len(xTrain))/8): ]])
y_train_s=np.concatenate([yTrain[ :int((5*len(xTrain))/8) ] , yTrain[ int((6*len(xTrain))/8): ]])
naive_optimal = GaussianNB()
naive_optimal.fit(x_train_s,y_train_s)
y_pred=naive_optimal.predict(xTrain[ int((5*len(xTrain))/8):int((6*len(xTrain))/8) ])
print("acc : ",metrics.accuracy_score(yTrain[ int((5*len(xTrain))/8):int((6*len(xTrain))/8) ],y_pred))

naive_confusion_metrics = confusion_matrix(yTrain[ int((5*len(xTrain))/8):int((6*len(xTrain))/8) ],y_pred)
naive_f1_score=f1_score(yTrain[ int((5*len(xTrain))/8):int((6*len(xTrain))/8) ],y_pred)
print("Confusion Metrics : ",naive_confusion_metrics)
print("F1 Score : ",naive_f1_score)


In [17]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(random_state=0,solver='lbfgs', max_iter=1000)
logistic.fit(xTrain,yTrain)
y_pred=logistic.predict(xTest)
logistic_confusion_metrics = confusion_matrix(yTest,y_pred)
logistic_f1_score=f1_score(yTest,y_pred)
print("acc : ",metrics.accuracy_score(yTest,y_pred))
print("Confusion Metrics : ",logistic_confusion_metrics)
print("F1 Score : ",logistic_f1_score)

acc :  0.9310344827586207
Confusion Metrics :  [[76  3]
 [ 3  5]]
F1 Score :  0.625
