In [None]:
#import libraries and configuration
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn import metrics

from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from keras.optimizers import Adam
from tensorflow import keras

import sys
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
#retrieve data
df = pd.read_csv('latestdata2.csv')

In [None]:
#check the number of rows and columns in the df
rows, columns = df.shape
print('Number of rows: ', rows)
print('Number of columns: ', columns)

In [None]:
#check the columns names and data types
print(df.dtypes)

In [None]:
#View first five rows of dataset
print(df.head())

In [None]:
#check for missing data
print(df.isnull().sum())

#Data Cleaning - Column Removal

In [None]:
#copy dataframe to new dataframe for cleaning
raw = df.copy()

In [None]:
# Remove rows that are repeated header lines embedded in the data, i.e. rows
# whose 'age' cell literally contains the text 'age'.
# The filter is built from `raw` itself rather than from index labels looked up
# in `df`, so the cell is idempotent: re-running it no longer raises KeyError
# for labels that were already dropped. `.copy()` keeps `raw` an independent
# frame so later column assignments do not act on a slice.
raw = raw.loc[raw['age'] != 'age'].copy()

In [None]:
# Columns that are not used by the analysis: unnamed spill-over columns,
# identifiers, location fields, free text and provenance links.
irrelevant_columns = [
    "Unnamed: 3",
    "Unnamed: 21",
    "Unnamed: 22",
    "Unnamed: 23",
    "Unnamed: 24",
    "Unnamed: 25",
    "Unnamed: 26",
    "id",
    "location",
    "country",
    "If_onset_approximated",
    "case_in_country",
    "reporting date",
    "summary",
    "source",
    "link",
]

# Remove them all in a single call, still mutating `raw` in place.
raw.drop(columns=irrelevant_columns, inplace=True)

In [None]:
#print the modified dataset information
# The original column count is read from the untouched `df` instead of the
# notebook-global `columns`: later cells rebind that name (to an Index and to
# the widths of other frames), which made this cell fail or report a wrong
# number when re-run out of order.
nrows, ncolumns = raw.shape
print(df.shape[1] - ncolumns, 'irrelevant coulumns detected & removed.')
print('updated number of rows: ', nrows)
print('updated number of columns: ', ncolumns)

In [None]:
# For every column, print how many distinct values it holds and list them
for column_name in raw.columns:
    distinct_values = raw[column_name].unique()
    print('Column: ', column_name)
    print('Unique Values: ', len(distinct_values))
    print(distinct_values)
    print('######################################################')

Data Cleaning

Fixing uniqueness in columns

In [None]:
# Ordered (column, regex, replacement) clean-up rules. Order matters: rules for
# the same column are applied one after another, each seeing the result of the
# previous one.
value_fixes = [
    # death: the date-like part of a value is replaced by '1'
    ('death', r'([0-9]+(/[0-9]+)+)', '1'),
    # recovered: shorten four-digit years to two digits
    ('recovered', r'/2020+', '/20'),
    # recovered: rewrite the 1899 year to '/20'
    ('recovered', r'/1899', '/20'),
    # hosp_visit_date: repair the entry with a missing day ('2//2020')
    ('hosp_visit_date', r'2//2020+', '2/05/2020'),
    # hosp_visit_date: shorten four-digit years to two digits
    ('hosp_visit_date', r'/2020+', '/20'),
    ('hosp_visit_date', r'/2019+', '/19'),
]

for column_name, pattern, replacement in value_fixes:
    raw[column_name] = raw[column_name].replace(regex=pattern, value=replacement)

Fix missing data for categorical data 

In [None]:
# Fill missing values of the categorical columns (gender, from Wuhan, symptom)
# with the most frequent value of the respective column
for column_name in ('gender', 'from Wuhan', 'symptom'):
    most_common = raw[column_name].mode()[0]
    raw[column_name] = raw[column_name].fillna(most_common)

Fix missing data for date data

In [None]:
# Fill missing exposure / hospital-visit dates with the most frequent value of
# the same column
for date_column in ('exposure_start', 'exposure_end', 'hosp_visit_date'):
    raw[date_column] = raw[date_column].fillna(raw[date_column].mode()[0])

# Fix missing data - recovered
# 'recovered' mixes the flags '0' and '1' with real dates. A bare '1' is
# replaced by the most frequent *date*. The mode is therefore taken over the
# date values only: taken over the whole column it can be the '0' flag, which
# would turn every '1' into '0'.
# NOTE(review): '0' is read as "not recovered" and '1' as "recovered, date
# unknown", based on how later cells test this column - confirm.
recovery_dates = raw.loc[~raw['recovered'].isin(['0', '1']), 'recovered'].dropna()
if not recovery_dates.empty:
    raw['recovered'] = raw['recovered'].replace('1', recovery_dates.mode()[0])

Fix missing data for numerical data

In [None]:
# Make sure 'age' is numeric before averaging; text that cannot be parsed
# becomes NaN and is imputed below like any other missing age.
raw['age'] = pd.to_numeric(raw['age'], errors='coerce')

meanAge = raw['age'].mean()

# Replace NaNs in column age with the (truncated) mean of the same column.
# The result is assigned back to the frame: `raw['age'].fillna(..., inplace=True)`
# is chained in-place assignment, which pandas deprecates and which has no
# effect on `raw` under copy-on-write.
# If every age is missing the mean is NaN and nothing can be imputed.
if pd.notna(meanAge):
    meanAge = int(meanAge)
    raw['age'] = raw['age'].fillna(meanAge)

In [None]:
# Horizontal box plot of the age column
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Basic Plot')
age_values = raw['age']
ax.boxplot(age_values, vert=False)

In [None]:
#check for missing values
print(raw.isnull().sum())
raw.head()

Dealing with outliers

In [None]:
#using the interquartile range rule to deal with outliers

def impute_outliers_IQR(raw):
    """Replace IQR outliers of a numeric Series with the Series median.

    A value counts as an outlier when it lies above the largest observation
    that is still within ``Q3 + 1.5 * IQR`` or below the smallest observation
    that is still within ``Q1 - 1.5 * IQR``.

    Parameters
    ----------
    raw : pandas.Series
        Numeric values to clean.

    Returns
    -------
    numpy.ndarray
        Same length as ``raw``; outliers are replaced by ``raw.median()``,
        every other value is kept as is.
    """
    first_quartile = raw.quantile(0.25)
    third_quartile = raw.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5 * spread
    low_fence = first_quartile - 1.5 * spread

    # Largest / smallest observations that still sit inside the fences
    upper = raw[~(raw > high_fence)].max()
    lower = raw[~(raw < low_fence)].min()

    replacement = raw.median()
    outside = (raw > upper) | (raw < lower)
    return np.where(outside, replacement, raw)

raw['age'] = impute_outliers_IQR(raw['age'])

In [None]:
raw['age'].describe()

Create the outcome column

In [None]:
# outcome: 0 when the death column holds the string '0', 1 for any other value
raw['outcome'] = raw['death'].ne('0').astype('int64')

Convert gender to numerical value

In [None]:
# sex: 0 when gender is 'male', 1 for any other value
raw['sex'] = raw['gender'].ne('male').astype('int64')

End of Data Cleaning

In [None]:
raw.head()

In [None]:
# Persist the cleaned data; all columns are written by default.
raw.to_csv('latestdata2_clean.csv', index=False)

# Show the remaining column names. They are printed directly instead of being
# stored in the global `columns`, which an earlier cell uses as an integer
# column count; rebinding it to an Index broke that cell on re-run.
print(raw.columns)

Clean Data for part 1

In [None]:
df_clean = raw.copy()

In [None]:
df_clean.head()

Correlation Analysis

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly avoids the error recent pandas versions raise when corr() meets
# text columns (older versions dropped those silently).
correlation = df_clean.select_dtypes(include='number').corr()

#to see in graph
fig, ax = plt.subplots(figsize=(11, 9))
# Draw on the axes created above; plt.show() takes no Axes argument.
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
plt.show()

#to see in console: correlation of every numeric column (first column of the
#frame excluded, as before) with the outcome
print(df_clean[df_clean.columns[1:]].select_dtypes(include='number').corr()['outcome'])

In [None]:
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.catplot(x='outcome', y='age', data=df_clean)

In [None]:
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.catplot(x='outcome', y='sex', data=df_clean)

In [None]:
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.catplot(x='outcome', y='from Wuhan', data=df_clean)

In [None]:
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.catplot(x='outcome', y='visiting Wuhan', data=df_clean)

In [None]:
# Features projected by the PCA
pca_features = ['age', 'visiting Wuhan', 'from Wuhan', 'sex']

# normalize the data - StandardScaler standardizes each feature onto unit
# scale (mean = 0 and variance = 1) so that no feature dominates the
# principal components merely because of its range
scaler = StandardScaler()
df_pca = scaler.fit_transform(df_clean[pca_features])

#check whether the normalized data has a mean of zero and a standard deviation of one.
# Reported per feature (axis=0): a single figure over the whole matrix can look
# right even when an individual column is not standardized.
print('mean of normalized data: ', df_pca.mean(axis=0))
print('standard deviation of normalized data: ', df_pca.std(axis=0))

#perform PCA
pca = PCA(n_components=2)
PCA_val = pca.fit_transform(df_pca)
data_PCA = pd.DataFrame(data=PCA_val, columns=['Principal Component 1', 'Principal Component 2'])

fig, ax = plt.subplots()
ax.scatter(data_PCA['Principal Component 1'], data_PCA['Principal Component 2'], s=5)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid()

In [None]:
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

Bayes Net

In [None]:
#Question 2A
#P(Symptoms_onset=Date|visiting Wuhan=1)

# Keep only the patients who visited Wuhan. The mask is built from and applied
# to the same frame; the previous version applied a mask built from the cleaned
# data to the unrelated original `df`, which only works while both frames
# happen to share exactly the same index.
df_bayesA = df_clean[df_clean['visiting Wuhan'] == 1].copy()

#Number of patients in the reduced data set
visited_count = len(df_bayesA)

#get count of N/A in reduced data set
sumNa = df_bayesA['symptom_onset'].isna().sum()

#P(Symptoms_onset=Date|visiting Wuhan=1) = 1 - count(N/A) / rows
# (undefined, reported as NaN, when nobody in the data visited Wuhan)
probability = 1 - (sumNa / visited_count) if visited_count else float('nan')

print('P(Symptoms_onset=Date|visiting Wuhan=1) = ',probability )

In [None]:
#P(recovered=Date|Symptoms_onset=Date,visiting Wuhan=1)

# Condition on both events:
#  * visiting Wuhan = 1 - mask built from and applied to the same frame
#    (previously it was applied to the unrelated original `df`);
#  * symptom_onset is known - rows are removed with dropna(subset=...); calling
#    dropna(inplace=True) on the column alone never removed rows from the frame.
df_bayesB = (
    df_clean[df_clean['visiting Wuhan'] == 1]
    .dropna(subset=['symptom_onset'])
    .copy()
)

#Number of patients in the reduced data set
conditioned_count = len(df_bayesB)

#get count of recovered = 0 in reduced data set
# 'recovered' holds text ('0' or a date), so comparing it to the integer 0 never
# matched. Numeric coercion turns '0' into 0 and date strings into NaN.
recovered_flag = pd.to_numeric(df_bayesB['recovered'], errors='coerce')
sumFalsePatient = (recovered_flag == 0).sum()

#P(recovered=Date|Symptoms_onset=Date,visiting Wuhan=1) = 1 - count(not recovered) / rows
# (undefined, reported as NaN, when no row satisfies the condition)
probability = 1 - (sumFalsePatient / conditioned_count) if conditioned_count else float('nan')

print('P(recovered=Date|Symptoms_onset=Date,visiting Wuhan=1) = ',probability )

In [None]:
#Question 2C
#P(outcome=1|visiting Wuhan=1)

# Keep only the patients who visited Wuhan. The mask is built from and applied
# to the same frame (previously it was applied to the unrelated original `df`).
df_bayesC = df_clean[df_clean['visiting Wuhan'] == 1].copy()

#Number of patients in the reduced data set
visited_count = len(df_bayesC)

#get count of death in reduced data set
sumDeath = (df_bayesC['outcome'] == 1).sum()

#P(outcome=1|visiting Wuhan=1) = count(death) / rows
# (undefined, reported as NaN, when nobody in the data visited Wuhan)
probability = (sumDeath / visited_count) if visited_count else float('nan')

print('P(outcome=death|visiting Wuhan=1) = ',probability )

In [None]:
#Questoion 2D
df_bayesD = df_clean.copy()

def getInterval(date1,date2):
    """Return the absolute number of days between two ``m/d/yy`` dates.

    Parameters
    ----------
    date1, date2 : object
        Date values; each is converted with ``str()`` and parsed with the
        format ``'%m/%d/%y'``. The caller passes the recovery date first and
        the hospital-visit date second.

    Returns
    -------
    int
        ``abs(date1 - date2)`` in days, or ``0`` when either value is ``'0'``
        or cannot be parsed as a date (for example NaN or malformed text).
    """
    # Normalise first so that the '0' marker is recognised whether it arrives
    # as text or as a number.
    date1 = str(date1)
    date2 = str(date2)
    if date1 == '0' or date2 == '0':
        return 0

    try:
        recoveredDate = datetime.strptime(date1, '%m/%d/%y')
        admittedDate = datetime.strptime(date2, '%m/%d/%y')
    except ValueError:
        # Only parsing failures are treated as "no interval"; a bare `except`
        # would also swallow KeyboardInterrupt and genuine programming errors.
        return 0

    return abs((recoveredDate - admittedDate).days)

# Keep only the patients who visited Wuhan. The mask is built from and applied
# to the same frame (previously it was applied to the unrelated original `df`).
df_bayesD = df_bayesD[df_bayesD['visiting Wuhan'] == 1].copy()

# Days between recovery and hospital visit for every remaining patient
df_bayesD['recovery_interval'] = [
    getInterval(recovered, hosp_visit)
    for recovered, hosp_visit in zip(df_bayesD['recovered'], df_bayesD['hosp_visit_date'])
]

# Intervals excluded from the average:
#   0   - getInterval's result when a date is missing or unparsable
#   342 - NOTE(review): presumably the artefact of a placeholder recovery date;
#         confirm the origin of this value
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
EXCLUDED_INTERVALS = [0, 342]
df_bayesD['recovery_interval'] = df_bayesD['recovery_interval'].replace(EXCLUDED_INTERVALS, np.nan)

AverageRecoveryInterval = df_bayesD['recovery_interval'].mean()
print('The average recovery time for a person who visited Wuhan is', AverageRecoveryInterval)

Machine Learning

In [None]:
#Question 3A
# Predict the outcome from age, sex and the two Wuhan flags with a 3-nearest-
# neighbour classifier.

df_KNNA = df_clean.copy()

# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df_KNNA[['age', 'sex', 'visiting Wuhan', 'from Wuhan']],
    df_KNNA['outcome'],
    test_size=0.25,
    random_state=42,
)
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print('Classification Quality using KNN for K=3: ')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
#Question 3B
# Predict a patient's age from the remaining attributes with a 3-nearest-
# neighbour regressor.

df_KNNB = df_clean.copy()

# The target ('age') must not be among the features: with it included the
# model simply looks the answer up and the error metrics are meaningless.
# A fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_KNNB[['sex', 'visiting Wuhan', 'from Wuhan', 'outcome']],
    df_KNNB['age'],
    test_size=0.25,
    random_state=42,
)
reg = KNeighborsRegressor(n_neighbors=3)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

print('Quality of Prediction using Regression:')
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# KMeans and matplotlib are already imported in the first cell of the notebook;
# only the elbow visualizer is specific to this cell.
from yellowbrick.cluster import KElbowVisualizer

cluster_features = ['age', 'sex', 'visiting Wuhan', 'from Wuhan', 'outcome']

df_KNNC = df_clean.copy()

# Silhouette score for K-means over k = 2 .. 29.
# KMeans starts from random centroids; a fixed random_state keeps the curve and
# the clusters below identical between runs.
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k=(2,30),metric='silhouette', timings= True)
visualizer.fit(df_KNNC[cluster_features])        # Fit the data to the visualizer
visualizer.show()

# Project the features onto two principal components for plotting. The result
# gets its own name instead of overwriting the DataFrame `df_KNNC`.
pca = PCA(2)
points_2d = pca.fit_transform(df_KNNC[cluster_features])

#Initialize the class object
kmeans = KMeans(n_clusters= 3, random_state=42)

#predict the labels of clusters.
label = kmeans.fit_predict(points_2d)

print(label)

#Getting unique labels
u_labels = np.unique(label)

#plotting the results: one colour per cluster
for i in u_labels:
    plt.scatter(points_2d[label == i , 0] , points_2d[label == i , 1] , label = i)

plt.legend()
plt.show()

In [75]:
#Question 4C
# Tune a random forest (max_depth, max_features) with 10-fold cross-validated
# grid search on the training split.

df_gridSearch = df_clean.copy()

feature_columns = ['age', 'sex', 'visiting Wuhan', 'from Wuhan']

#Splitting Data into test and train set
X_train, X_test, y_train, y_test = train_test_split(
    df_gridSearch[feature_columns],
    df_gridSearch['outcome'],
    test_size=0.25,
    random_state = 42,
)

# Fit the scaler on the training data only, then apply the same transformation
# to both splits so the test set stays comparable with what the model saw.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#building the random forest classifier (seeded so the search is reproducible)
rfc = RandomForestClassifier(random_state=42)

# max_features can only range from 1 to the number of features. The previous
# grid (0..13) contained values that are invalid for four features, so many
# of the 700 fits were wasted or failed.
n_features = X_train.shape[1]
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': list(range(1, n_features + 1))}]

#Initializing GridSearchCV() object
grid_results = GridSearchCV(rfc, forest_params, cv = 10, scoring='accuracy')

#fitting GridSearchCV() object with hyperparameters
grid_results.fit(X_train, y_train)

# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))

Best: 0.9447003914483589, using {'max_depth': 10, 'max_features': 2}
