# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler , Normalizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.stats import norm
from scipy import stats
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Loading Data

In [None]:
# Read the dataset from a path relative to the notebook's working directory.
covid_data_1 = pd.read_csv(r"data/Cleaned-Data.csv")

# Render every column when a DataFrame is displayed. Uses the public pd.set_option
# entry point rather than reaching through the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns',None)

# EDA

## `Getting to know data`

In [None]:
# Show a caption, then the frame itself.
display("Peeking into Data")
display(covid_data_1)

## `Size of data`

In [None]:
# Caption followed by the row and column counts taken from DataFrame.shape.
display("Shape of dataset")
row_count, column_count = covid_data_1.shape
print("Rows:", row_count, "\nColumns:", column_count)

## `NULL Values`

In [None]:
# Number of missing values in each column.
null_counts = covid_data_1.isnull().sum()
display("NULL Values")
display(null_counts)

In [None]:
# Summary statistics as produced by DataFrame.describe().
summary_stats = covid_data_1.describe()
display("Description")
display(summary_stats)

In [None]:
# Print the frame's summary via DataFrame.info().
covid_data_1.info()

## `Checking distribution of data`

In [None]:
#covid_data_1 = covid_data_1.drop('Country',axis=1)
# Plot the distribution of every column except 'Country'. The drop is not in-place,
# so covid_data_1 itself still contains 'Country' after this cell.
# NOTE(review): sns.distplot is deprecated in seaborn (histplot/displot replace it) and
# is documented for 1-D input; confirm the intended plot before migrating this call.
sns.distplot(covid_data_1.drop('Country',axis=1))

In [None]:
# For each column, list its distinct values and how many there are.
for column_name, column_values in covid_data_1.items():
    distinct = column_values.unique()
    print("\nColumn Name:",column_name,"-->",distinct,"-->Unique Count",len(distinct))

In [None]:
# Labels of every column whose name contains 'Severity_'.
severity_columns = covid_data_1.columns[covid_data_1.columns.str.contains('Severity_', regex=False)]

In [None]:
# Turn each 0/1 severity flag into a readable label: 1 -> the severity name, 0 -> 'No'.
# The result is assigned back to the column. Calling replace(inplace=True) on
# covid_data_1[col] is chained assignment, which stops updating the frame once
# pandas Copy-on-Write is active (and the warning about it is silenced in this notebook).
severity_labels = {
    'Severity_None': 'None',
    'Severity_Mild': 'Mild',
    'Severity_Moderate': 'Moderate',
    'Severity_Severe': 'Severe',
}
for severity_col, severity_label in severity_labels.items():
    covid_data_1[severity_col] = covid_data_1[severity_col].replace({1: severity_label, 0: 'No'})

In [None]:
# One Python list per row holding that row's values from the severity columns.
severity_rows = covid_data_1.loc[:, severity_columns].to_numpy().tolist()
covid_data_1['Condition'] = severity_rows

In [None]:
def removing(list1):
    """Join the entries of ``list1`` that are not the placeholder ``"No"``.

    Parameters
    ----------
    list1 : iterable of str
        Labels for one row, e.g. ``['No', 'Mild', 'No', 'No']``.

    Returns
    -------
    str
        The distinct non-``"No"`` labels concatenated in first-seen order;
        ``''`` when every entry is ``"No"`` or the input is empty.
    """
    # dict.fromkeys de-duplicates like set() did but keeps insertion order, so the
    # result no longer depends on string-hash ordering when several labels survive.
    labels = dict.fromkeys(list1)
    labels.pop("No", None)
    return ''.join(labels)

In [None]:
# Collapse each row's severity labels into the single label that is not "No".
# The value is rebuilt from the severity columns rather than from the current
# 'Condition' column, so re-running this cell gives the same result instead of
# feeding already-collapsed strings back through removing().
covid_data_1['Condition'] = [
    removing(row_labels)
    for row_labels in covid_data_1[severity_columns].values.tolist()
]

## `Grouping by severity`

In [None]:
# Column labels grouped by the substring they contain.
all_columns = covid_data_1.columns
age_columns = all_columns[all_columns.str.contains('Age_', regex=False)]
gender_columns = all_columns[all_columns.str.contains('Gender_', regex=False)]
contact_columns = all_columns[all_columns.str.contains('Contact_', regex=False)]

In [None]:
# Column sums of the age / gender / contact columns for each value of 'Severity_None'.
by_severity_none = covid_data_1.groupby(['Severity_None'])
No_risk_age, No_risk_gender, No_risk_contact = (
    by_severity_none[column_group].sum()
    for column_group in (age_columns, gender_columns, contact_columns)
)

In [None]:
# Column sums of the age / gender / contact columns for each value of 'Severity_Mild'.
by_severity_mild = covid_data_1.groupby(['Severity_Mild'])
Low_risk_age, Low_risk_gender, Low_risk_contact = (
    by_severity_mild[column_group].sum()
    for column_group in (age_columns, gender_columns, contact_columns)
)

In [None]:
# Column sums of the age / gender / contact columns for each value of 'Severity_Moderate'.
by_severity_moderate = covid_data_1.groupby(['Severity_Moderate'])
Moderate_risk_age, Moderate_risk_gender, Moderate_risk_contact = (
    by_severity_moderate[column_group].sum()
    for column_group in (age_columns, gender_columns, contact_columns)
)

In [None]:
# Column sums of the age / gender / contact columns for each value of 'Severity_Severe'.
by_severity_severe = covid_data_1.groupby(['Severity_Severe'])
Severe_risk_age, Severe_risk_gender, Severe_risk_contact = (
    by_severity_severe[column_group].sum()
    for column_group in (age_columns, gender_columns, contact_columns)
)

In [None]:
# Bar chart of how many rows fall under each 'Condition' label. The column is passed
# by keyword: newer seaborn releases take `data` as the first positional argument,
# so a positional Series is no longer interpreted as `x`.
sns.countplot(x='Condition', data=covid_data_1)

# Preprocessing

In [None]:
# Remove the 'Country' column. Rebinding the name (instead of inplace=True) together
# with errors="ignore" keeps the cell re-runnable: a second run is a no-op rather
# than a KeyError.
covid_data_1 = covid_data_1.drop(columns="Country", errors="ignore")

In [None]:
# Remove the individual severity columns now that 'Condition' carries the label.
# Rebinding with errors="ignore" keeps the cell re-runnable: a second run is a no-op
# rather than a KeyError.
covid_data_1 = covid_data_1.drop(columns=severity_columns, errors="ignore")

In [None]:
# Row-wise total over column positions 0-4 plus positions 6-9.
# NOTE(review): position 5 is skipped and the selection is purely positional, so it
# depends on the CSV's column order; confirm which columns these positions are.
leading_total = covid_data_1.iloc[:, 0:5].sum(axis=1)
trailing_total = covid_data_1.iloc[:, 6:10].sum(axis=1)
covid_data_1['Symptoms_Score'] = leading_total.add(trailing_total)

In [None]:
# (rows, columns) of the frame at this point in the preprocessing.
covid_data_1.shape

In [None]:
from sklearn import preprocessing

# Fit the encoder on the 'Condition' labels, then overwrite the column with the
# integer code of each label.
le = preprocessing.LabelEncoder().fit(covid_data_1['Condition'])
covid_data_1['Condition'] = le.transform(covid_data_1['Condition'])

In [None]:
# Display the frame as it stands after the preprocessing cells.
covid_data_1

# Feature Engineering

In [None]:
from pylab import rcParams

# Figure size used for the heatmap (and any later figures).
rcParams.update({'figure.figsize': (13, 18)})

# Pairwise correlations, then the k columns ranked highest by their correlation
# with 'Condition'.
corrmat = covid_data_1.corr()
k = 22
cols = corrmat.nlargest(k, 'Condition')['Condition'].index

# Correlation matrix restricted to the selected columns (variables are columns).
cm = np.corrcoef(covid_data_1[cols].values, rowvar=False)

sns.set(font_scale=1.25)
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
hm = sns.heatmap(cm, **heatmap_options)
plt.show()

# Model

In [None]:
# Target is the encoded 'Condition' column; the features are every other column.
y = covid_data_1['Condition']
X = covid_data_1.drop(columns=['Condition'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

- Random Forest

In [None]:
# Disabled cell: the grid-search setup below is wrapped in a string literal, so the
# code inside it is not executed.
'''from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)'''

In [None]:
# Disabled cell: the grid-search fit is wrapped in a string literal and is not executed.
'''# Fit the grid search to the data
grid_search.fit(X_train, y_train)'''

In [None]:
# Disabled cell: reporting of the grid-search result, kept as a string literal and not executed.
'''print('Best Parameters',grid_search.best_params_)
best_grid = grid_search.best_estimator_
print('\n Best Estimator',best_grid)'''

In [None]:
# Text record of a grid-search result, stored as a string literal; the random forest
# constructed further down uses these same parameter values.
"""Best Parameters {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}
Best Estimator RandomForestClassifier(max_depth=4, max_features='sqrt')"""

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with explicit hyper-parameters. random_state is fixed (same seed as the
# train/test split) so repeated runs build the same forest and report the same scores.
rfc1=RandomForestClassifier(criterion= 'gini', max_depth= 4, max_features= 'sqrt', n_estimators= 100, random_state=42)

In [None]:
# Train the random forest on the training split.
rfc1.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out test features.
pred=rfc1.predict(X_test)

In [None]:
# Echo the predictions.
pred

In [None]:
from sklearn.metrics import accuracy_score
# `pred` was produced from X_test, so this is hold-out (test) accuracy, not a
# cross-validation score; the message says so.
print("Accuracy for Random Forest on test data: ",accuracy_score(y_test,pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the random-forest predictions against the test labels.
confusion_matrix(y_true=y_test, y_pred=pred)

- CatBoost

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost classifier configured with iterations=200.
model = CatBoostClassifier(iterations=200)

In [None]:
# Positions of the columns whose dtype is not float; they are passed to CatBoost as
# `cat_features` when the model is fitted. `np.float` was only an alias of the builtin
# `float` and has been removed from NumPy (1.24+), so compare against `float` directly.
categorical_var = np.where(X_train.dtypes != float)[0]
print('\nCategorical Variables indices : ',categorical_var)

In [None]:
# Fit CatBoost on the training split, passing the column positions in categorical_var
# as cat_features; plot=False is passed explicitly.
model.fit(X_train,y_train,cat_features = categorical_var,plot=False)

In [None]:
# Predict on the same data the model was trained on.
predict_train = model.predict(X_train)
print('\nTarget on train data',predict_train)

In [None]:
# Accuracy of the training-set predictions against the training labels.
accuracy_train = accuracy_score(y_true=y_train, y_pred=predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)

In [None]:
# Predict labels for the held-out test features.
predict_test = model.predict(X_test)
print('\nTarget on test data',predict_test) 

# Accuracy Score on test dataset
accuracy_test = accuracy_score(y_test,predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)

- Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Rebinds `model` (previously the CatBoost classifier) to a logistic regression fitted
# on the training split; the trailing expression echoes the fitted estimator.
model = LogisticRegression(solver = 'lbfgs').fit(X_train, y_train)
model

In [None]:
# Use whatever estimator `model` currently refers to to predict labels for the test features.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose prediction in y_pred matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Disabled cell: the k-nearest-neighbours fit is wrapped in a string literal and is not executed.
'''from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train, y_train)'''

In [None]:
# Disabled cell (string literal, not executed).
'''y_pred_knn = knn.predict(X_test)'''

In [None]:
# Disabled cell (string literal, not executed).
'''from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_knn)'''

In [None]:
# Disabled cell: the linear SVM fit is wrapped in a string literal and is not executed.
'''from sklearn.svm import SVC

svm = SVC(kernel='linear',C=0.025, random_state=101)

svm.fit(X_train, y_train)'''

In [None]:
# Disabled cell (string literal, not executed). The estimator created in the disabled
# SVM cell is named `svm`, so the call refers to `svm` rather than the undefined `svc`;
# otherwise re-enabling these cells would fail with a NameError.
'''y_pred_svc = svm.predict(X_test)'''

In [None]:
# Disabled cell (string literal, not executed).
'''from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_svc)'''

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes fitted on the training split; the trailing expression
# echoes the fitted estimator.
mb = MultinomialNB().fit(X_train, y_train)
mb

In [None]:
# Naive Bayes predictions for the held-out test features.
y_pred_mb = mb.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose naive Bayes prediction matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_mb)

- Neural network

In [None]:
# `to_categorical` is imported from its public location; the `keras.utils.np_utils`
# module path is not available in current Keras releases.
from keras.utils import to_categorical

# One-hot encode the training labels into 4 classes. The ndim guard makes the cell
# safe to re-run: once y_train is already a 2-D one-hot array it is left untouched
# instead of being encoded a second time.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes = 4)
y_train.shape

In [None]:
from keras.layers import Input,InputLayer, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout
from keras.models import Sequential,Model
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint,LearningRateScheduler
import keras
from keras import backend as K

In [None]:
# Rebinds `model` to a fully connected network: five Dense(128, relu) layers
# followed by a Dense(4, softmax) output layer.
model = keras.models.Sequential()
#model.add(keras.layers.Flatten())
for hidden_index in range(5):
    model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(4, activation='softmax'))

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 32; the returned value is echoed as the cell output.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=10, verbose=1)

In [None]:
# Network output for the test features, reduced to the index of the largest value
# in each row.
class_scores = model.predict(X_test)
y_pred = class_scores.argmax(axis=1)

In [None]:
# Echo the predicted class indices.
y_pred