In [None]:
import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.model_selection import KFold

from sklearn.model_selection import RandomizedSearchCV

from mlxtend.preprocessing import minmax_scaling

In [None]:
# Directory that holds the CSV files; edit this one constant to run the
# notebook against a different location.
DATA_DIR = "C:/Users/virat/Desktop/7150"

# index_col=0 turns the first CSV column into the DataFrame index.
train = pd.read_csv(f"{DATA_DIR}/train_data.csv", index_col=0)
test = pd.read_csv(f"{DATA_DIR}/test_data.csv", index_col=0)

In [None]:
# Preview the first rows instead of rendering the whole frame
train.head()

In [None]:
# Column names with their non-null counts and dtypes
train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the data
train.describe()

In [None]:
# Count the missing values in each column
train.isna().sum()

In [None]:
# If the .csv file was opened in Excel, the stay bucket "11-20" gets rewritten
# as the date "Nov-20"; map it back in both frames.
train, test = (frame.replace('Nov-20', '11-20') for frame in (train, test))

In [None]:
# Fill missing values with each column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# and leaves `train` untouched when pandas Copy-on-Write is active.
for col in ['Bed Grade', 'City_Code_Patient']:
    # dropna() so the mode is taken over observed values only.
    train[col] = train[col].fillna(statistics.mode(train[col].dropna()))

In [None]:
# The identifier columns are of no use for prediction, so leave them out.
# NOTE(review): the CSVs are read with index_col=0 -- if case_id is the first
# CSV column it is already the index and this drop would raise KeyError; confirm.
train = train.drop(columns=['case_id', 'patientid'])

In [None]:
# Collapse the 11 length-of-stay buckets into 3 classes:
#   0 = up to 20 days, 1 = 21-50 days, 2 = more than 50 days.
# Both spellings of the open-ended bucket are mapped directly, and the result
# is assigned back to the column (a chained replace(inplace=True) on a column
# selection is deprecated and does nothing under pandas Copy-on-Write).
stay_classes = {
    '0-10': 0, '11-20': 0,
    '21-30': 1, '31-40': 1, '41-50': 1,
    '51-60': 2, '61-70': 2, '71-80': 2, '81-90': 2, '91-100': 2,
    'More than 100 Days': 2, '>100': 2,
}
train['Stay'] = train['Stay'].replace(stay_classes)

In [None]:
# Partition the columns by dtype: object columns are treated as categorical,
# everything else as numerical.
cat_cols = [col for col in train.columns if train[col].dtypes == 'object']
num_cols = [col for col in train.columns if not train[col].dtypes == 'object']

In [None]:
# Bar chart of value counts for every categorical column.
# The grid is sized from the number of columns (two plots per row) so the
# cell keeps working if the column list grows or shrinks.
n_rows = max(1, -(-len(cat_cols) // 2))  # ceiling division
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 4 * n_rows), squeeze=False)
flat_axes = axes.flatten()
for ax, col in zip(flat_axes, cat_cols):
    # Pass the Series as x=: a bare positional argument is read as `data`
    # by current seaborn releases.
    sns.countplot(x=train[col], ax=ax)
    ax.set_title(col)
# Hide grid cells that have no column to show.
for unused_ax in flat_axes[len(cat_cols):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Density histogram with a KDE overlay for every numerical column.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot and
# matches the histplot calls used further down in this notebook.
n_rows = max(1, -(-len(num_cols) // 2))  # ceiling division, two plots per row
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows), squeeze=False)
flat_axes = axes.flatten()
for ax, col in zip(flat_axes, num_cols):
    sns.histplot(train[col], kde=True, stat="density", ax=ax)
    ax.set_title(col)
# Hide grid cells that have no column to show.
for unused_ax in flat_axes[len(num_cols):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# These columns are stored as numbers but behave like category codes, so move
# them from the numerical list to the categorical list.
recast_as_categorical = ['Bed Grade', 'City_Code_Hospital', 'City_Code_Patient']
cat_cols.extend(recast_as_categorical)

for moved_col in recast_as_categorical:
    num_cols.remove(moved_col)

In [None]:
# Replace every categorical column with integer codes. The single encoder is
# refit per column, so afterwards `le` only remembers the last column's classes.
le = LabelEncoder()
train[cat_cols] = train[cat_cols].apply(le.fit_transform)

In [None]:
# Standardize the numerical feature columns (zero mean, unit variance).
# 'Stay' is the prediction target: once its buckets are mapped to integers it
# can end up in num_cols, and scaling it would turn the class labels into
# continuous floats that the classifiers cannot be trained on.
scale_cols = [col for col in num_cols if col != 'Stay']
ss = StandardScaler()
train[scale_cols] = ss.fit_transform(train[scale_cols].values)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
plt.figure(figsize=(12, 12))
corr_matrix = train.corr()
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')

In [None]:
# Distribution of three columns, each labelled with its skewness.
num_data = train[['Available Extra Rooms in Hospital', 'Bed Grade', 'Admission_Deposit']]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.tight_layout(pad=5.0)

# Iterate over a flattened copy so the loop variable does not overwrite the
# axes array itself.
flat_axes = axes.flatten()
for ax, n in zip(flat_axes, num_data.columns.tolist()):
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
    sns.histplot(num_data[n].dropna(), kde=True, stat="density", ax=ax,
                 label="Skewness : %.2f" % (num_data[n].skew()))
    ax.set_title(n, fontsize=14)
    ax.legend(loc='best')

# The 2x2 grid has one more cell than there are columns; hide the spare one.
for unused_ax in flat_axes[len(num_data.columns):]:
    unused_ax.set_visible(False)

In [None]:
# Separate the features (X) from the target column 'Stay' (y)
X = train.drop(columns='Stay')
y = train['Stay']

In [None]:
# Box plot of Age for each Stay class, to look for outliers
sns.boxplot(data=train, x='Stay', y='Age')

In [None]:
# Compare the Age column before and after min-max scaling to [0, 1].
# minmax_scaling is documented for DataFrames and NumPy arrays; a 1-D Series
# is handed over as an ndarray so the helper can reshape it into a single
# column (addressed as column 0).
scaled_data = minmax_scaling(train['Age'].to_numpy(), columns=[0])

fig, ax = plt.subplots(1, 2, figsize=(15, 3))
sns.histplot(train['Age'], ax=ax[0], kde=True, legend=False)
ax[0].set_title("Original Data")
sns.histplot(scaled_data, ax=ax[1], kde=True, legend=False)
ax[1].set_title("Scaled data")
plt.show()

In [None]:
# Compare the Stay column before and after a Box-Cox transform.
# Box-Cox only accepts strictly positive input, so the column is shifted so
# that its smallest value becomes 1 before transforming.
# (`stats` is scipy.stats, imported at the top of the notebook.)
positive_stay = train['Stay'] - train['Stay'].min() + 1
normalized_data = stats.boxcox(positive_stay)

fig, ax = plt.subplots(1, 2, figsize=(15, 3))
sns.histplot(train['Stay'], ax=ax[0], kde=True, legend=False)
ax[0].set_title("Original Data")
# boxcox returns (transformed values, fitted lambda); plot the values.
sns.histplot(normalized_data[0], ax=ax[1], kde=True, legend=False)
ax[1].set_title("Normalized data")
plt.show()

In [None]:
# Build a train/test split from K-fold indices.
# Change n_splits (e.g. 2, 3 or 5) to see how the fold count affects results.
kf = KFold(n_splits=2)

# Only the last fold's indices are kept, so everything that follows works
# from a single X_train/X_test pair.
# NOTE(review): this is one hold-out split, not a cross-validated score --
# confirm that is intended.
train_index, test_index = list(kf.split(X))[-1]
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [None]:
# Running Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

confusion = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion)
cm_display.plot()
plt.show()

print(precision_score(y_test,y_pred, average='macro'))
print(recall_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='micro'))

In [None]:
# Running Random Forest
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

confusion = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion)
cm_display.plot()
plt.show()

print(precision_score(y_test,y_pred, average='macro'))
print(recall_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='micro'))

In [None]:
# Running K-Nearest Neighbours
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

confusion = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion)
cm_display.plot()
plt.show()

print(precision_score(y_test,y_pred, average='macro'))
print(recall_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='micro'))

In [None]:
# Running Gradient Boosting, this is the slowest
gb = GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

confusion = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion)
cm_display.plot()
plt.show()

print(precision_score(y_test,y_pred, average='macro'))
print(recall_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='micro'))

In [None]:
# Running Extreme Gradient Boosting
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

confusion = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion)
cm_display.plot()
plt.show()

print(precision_score(y_test,y_pred, average='macro'))
print(recall_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='macro'))
print(f1_score(y_test,y_pred, average='micro'))

In [None]:
# Regular 80/20 hold-out split, stratified on the target and seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit all five classifiers on the current X_train/X_test split and record
# each one's test accuracy. The tree ensembles are seeded so that the
# comparison is repeatable across runs.
modells = [LogisticRegression(max_iter=1000), RandomForestClassifier(random_state=42), KNeighborsClassifier(),
           GradientBoostingClassifier(random_state=42), XGBClassifier()]

name = ['LogisticRegression', 'RandomForestClassifier', 'KNeighborsClassifier',
        'GradientBoostingClassifier', 'XGBClassifier']

models = dict(zip(name, modells))
accuracy_scores = []  # same order as `name`
for key, value in models.items():
    value.fit(X_train, y_train)
    y_pred = value.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print(key)
    print(accuracy)

In [None]:
# Bar chart comparing the accuracy of the five models.
# The labels are listed in the same order in which the scores were appended.
model_labels = ['LR', 'RF', 'KNN', 'GBC', 'XGB']
sns.barplot(x=model_labels, y=accuracy_scores)

In [None]:
# Hyperparameter tuning for Logistic Regression
model1 = LogisticRegression(max_iter=1000)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

params1 = dict(solver=solvers, penalty=penalty, C=c_values)
# The search samples 5 of the 15 combinations at random; random_state fixes
# which ones, so the reported best score is repeatable.
grid_search1 = RandomizedSearchCV(estimator=model1, cv=3, param_distributions=params1,
                                  n_iter=5, random_state=42)
grid_result1 = grid_search1.fit(X, y)

print("Best: %f using %s" % (grid_result1.best_score_, grid_result1.best_params_))

In [None]:
# Hyperparameter tuning for Random Forest
# Both the forest and the search are seeded so reruns give the same result.
model2 = RandomForestClassifier(random_state=42)
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

params2 = dict(n_estimators=n_estimators, max_features=max_features)
# 5 of the 6 combinations are sampled at random.
grid_search2 = RandomizedSearchCV(estimator=model2, cv=3, param_distributions=params2,
                                  n_iter=5, random_state=42)
grid_result2 = grid_search2.fit(X, y)

print("Best: %f using %s" % (grid_result2.best_score_, grid_result2.best_params_))

In [None]:
# Hyperparameter tuning for K-Nearest Neighbours
model3 = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

params3 = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# random_state fixes which 5 combinations are sampled, so reruns are comparable.
grid_search3 = RandomizedSearchCV(estimator=model3, cv=3, param_distributions=params3,
                                  n_iter=5, random_state=42)
grid_result3 = grid_search3.fit(X, y)

print("Best: %f using %s" % (grid_result3.best_score_, grid_result3.best_params_))

In [None]:
# Hyperparameter tuning for Gradient Boosting (very slow to run)
# Both the booster and the search are seeded so reruns give the same result.
model4 = GradientBoostingClassifier(random_state=42)
n_estimators2 = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

params4 = dict(learning_rate=learning_rate, n_estimators=n_estimators2, subsample=subsample, max_depth=max_depth)
# 5 of the 81 combinations are sampled at random.
grid_search4 = RandomizedSearchCV(estimator=model4, cv=3, param_distributions=params4,
                                  n_iter=5, random_state=42)
grid_result4 = grid_search4.fit(X, y)

print("Best: %f using %s" % (grid_result4.best_score_, grid_result4.best_params_))

In [None]:
# Hyperparameter tuning for Extreme Gradient Boosting
model5 = XGBClassifier(n_estimators=1000)
# 'Stay' is merged into three classes, so a multiclass objective is listed
# here rather than a binary one.
objective = ['multi:softprob']
max_depth2 = [3, 4, 5, 6]
min_child_weight = [1, 5, 10, 12]
subsample2 = [0.6, 0.8, 1.0]
colsample_bytree = [0.6, 0.8, 1.0]
gamma = [0.5, 1, 1.5, 2]

params5 = dict(objective=objective, max_depth=max_depth2, min_child_weight=min_child_weight, subsample=subsample2,
            colsample_bytree=colsample_bytree, gamma=gamma)
# random_state fixes which 5 combinations are sampled, so reruns are comparable.
grid_search5 = RandomizedSearchCV(estimator=model5, cv=3, param_distributions=params5,
                                  n_iter=5, random_state=42)
grid_result5 = grid_search5.fit(X, y)

print("Best: %f using %s" % (grid_result5.best_score_, grid_result5.best_params_))

In [None]:
# Experiment: benchmark many classifiers at once with lazypredict.
# Reported by the author as too heavy to run here (it crashes the machine).
from lazypredict.Supervised import LazyClassifier
# NOTE(review): load_breast_cancer is not used in this cell, and
# train_test_split is already imported at the top of the notebook.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# NOTE: this rebinds `models`, replacing the name -> estimator dict built in
# the model-comparison cell.
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)