In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection  
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus
import random
import numpy as np
import random
import pickle
import pandas as pd
import matplotlib.pyplot as plt

# STEP 1: LOADING THE DATASET AND PRINTING IT

In [16]:
dataSet = pd.read_csv("data.csv")
print (dataSet)

     Unnamed: 0  location  country  gender   age  vis_wuhan  from_wuhan  \
0             0       104        8       1  66.0          1           0   
1             1       101        8       0  56.0          0           1   
2             2       137        8       1  46.0          0           1   
3             3       116        8       0  60.0          1           0   
4             4       116        8       1  58.0          0           0   
..          ...       ...      ...     ...   ...        ...         ...   
858         858        48        3       2  24.0          0           0   
859         859         0        0       2  35.0          0           0   
860         860         3        1       1  49.4          0           0   
861         861        24        9       1  49.4          0           0   
862         862        15       27       1  70.0          0           0   

     symptom1  symptom2  symptom3  symptom4  symptom5  symptom6  diff_sym_hos  \
0          14     

In [17]:
one_hot_encoded_data = pd.get_dummies(dataSet, columns = ['location', 'country', 'gender', 'symptom1', 'symptom2', 'symptom3', 'symptom4', 'symptom5', 'symptom6'])
df = pd.DataFrame(one_hot_encoded_data)
first_column = df.pop('result')
df.insert(277, 'result', first_column)
print(df)

     Unnamed: 0   age  vis_wuhan  from_wuhan  diff_sym_hos  location_0  \
0             0  66.0          1           0             8           0   
1             1  56.0          0           1             0           0   
2             2  46.0          0           1            13           0   
3             3  60.0          1           0             0           0   
4             4  58.0          0           0             0           0   
..          ...   ...        ...         ...           ...         ...   
858         858  24.0          0           0             0           0   
859         859  35.0          0           0             0           1   
860         860  49.4          0           0             0           0   
861         861  49.4          0           0             0           0   
862         862  70.0          0           0             0           0   

     location_1  location_2  location_3  location_4  ...  symptom4_10  \
0             0           0           

# STEP 2: SCALE THE FEATURES AND SPLIT THE DATASET INTO X & Y

In [18]:
scaler = MinMaxScaler()
# transform data
scaled = scaler.fit_transform(df)
x = scaled[0:864, 1:276]
y = scaled[0:864, 277]
print("X:", x)
print("Y:", y)

X: [[0.68085106 1.         0.         ... 0.         1.         0.        ]
 [0.57446809 0.         1.         ... 0.         1.         0.        ]
 [0.46808511 0.         1.         ... 0.         1.         0.        ]
 ...
 [0.50425532 0.         0.         ... 0.         1.         0.        ]
 [0.50425532 0.         0.         ... 0.         1.         0.        ]
 [0.72340426 0.         0.         ... 0.         1.         0.        ]]
Y: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

# STEP 3: SPLIT THE DATA INTO TRAIN, VALIDATION, AND TEST

In [19]:
x_temp, xTest, y_temp, yTest = train_test_split(x,y,train_size=0.8, random_state=5 ,shuffle=True)
xTrain, xValidation, yTrain, yValidation =  train_test_split(x_temp,y_temp,train_size=0.75, random_state=5 ,shuffle=True)
print(len(xTrain))
print(len(xValidation))
print(len(xTest))

517
173
173


# STEP 4: TRAIN THE MODEL AND VALIDATE IT

In [20]:
clf = DecisionTreeClassifier(criterion="entropy")

# Train Decision Tree Classifer
clf = clf.fit(xTrain,yTrain)

#Predict the response for validation dataset
y_pred = clf.predict(xValidation)         
print(y_pred)
print ("Accuracy", metrics.accuracy_score(yValidation, y_pred))

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0.]
Accuracy 0.9479768786127167


# STEP 5: SHOW THE DECISION TREE

In [21]:
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = scaled[0,1:276],class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png("Covid-19.png")
Image(graph.create_png())

InvocationException: Program terminated with status: 1. stderr follows: 'C:\Users\UNiversal' is not recognized as an internal or external command,
operable program or batch file.


# STEP 6: MEASURE ACCURACY ON TEST DATA

In [None]:
print(clf.score(xTest, yTest))

# STEP 7: CALCULATING THE PERFORMANCE METRICS

In [None]:
precision = precision_score(yTest, y_pred)
print('Precision: %f' % precision)
recall = recall_score(yTest, y_pred)
print('Recall: %f' % recall)
f1 = f1_score(yTest, y_pred)
print('F1 score: %f' % f1)
auc = roc_auc_score(yTest, y_pred)
print('ROC AUC: %f' % auc)
plot_confusion_matrix(clf, xTest, yTest)
plt.show()

# STEP 8: GRAPHING THE ROC CURVE

In [None]:
fpr, tpr, threshold = metrics.roc_curve(yTest, y_pred)
roc_auc = metrics.auc(fpr, tpr)


plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()