#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#### Importing the data

In [2]:
dataset = pd.read_csv('train_sah1.csv', usecols=['Age', "Sex","Hypertension", "Diabetes","WFNS grade ","CT grade Fischer","Radiological Vasospasm", "Ischemic deficits", "Outcome at Discharge", "Outcome at 3 months"] )
dataset.head()  # To visualize the data

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
0,56,M,Y,N,1,1.0,Y,Y,3,1.0
1,54,M,Y,N,1,1.0,N,N,5,5.0
2,66,M,Y,N,2,3.0,Y,N,5,5.0
3,46,M,Y,Y,1,1.0,N,N,5,5.0
4,67,F,N,N,2,2.0,N,N,5,5.0


#### Now to convert the Yes/No to 1/0

In [3]:
dataset["Hypertension"] = dataset["Hypertension"].map({'Y':1 ,'N':0})
dataset["Diabetes"] = dataset["Diabetes"].map({'Y':1. ,'N':0.})
dataset["Radiological Vasospasm"] = dataset["Radiological Vasospasm"].map({'Y':1. ,'N':0.})
dataset["Ischemic deficits"] = dataset["Ischemic deficits"].map({'Y':1. ,'N':0.})
dataset["Sex"] = dataset["Sex"].map({'F':1. ,'M':0.})
dataset.head() #Checking again

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
0,56,0.0,1.0,0.0,1,1.0,1.0,1.0,3,1.0
1,54,0.0,1.0,0.0,1,1.0,0.0,0.0,5,5.0
2,66,0.0,1.0,0.0,2,3.0,1.0,0.0,5,5.0
3,46,0.0,1.0,1.0,1,1.0,0.0,0.0,5,5.0
4,67,1.0,0.0,0.0,2,2.0,0.0,0.0,5,5.0


#### Normalizing the age

In [4]:
# Assuming same lines from your example
cols_to_norm = ['Age',]
dataset[cols_to_norm] = dataset[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
display(dataset)

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
0,0.666667,0.0,1.0,0.0,1,1.0,1.0,1.0,3,1.0
1,0.638889,0.0,1.0,0.0,1,1.0,0.0,0.0,5,5.0
2,0.805556,0.0,1.0,0.0,2,3.0,1.0,0.0,5,5.0
3,0.527778,0.0,1.0,1.0,1,1.0,0.0,0.0,5,5.0
4,0.819444,1.0,0.0,0.0,2,2.0,0.0,0.0,5,5.0
...,...,...,...,...,...,...,...,...,...,...
646,0.625000,1.0,0.0,1.0,2,3.0,1.0,1.0,3,4.0
647,0.861111,1.0,1.0,1.0,2,1.0,0.0,0.0,4,5.0
648,0.569444,0.0,0.0,0.0,3,4.0,1.0,1.0,3,3.0
649,1.000000,1.0,0.0,0.0,1,1.0,0.0,0.0,5,5.0


In [5]:
# copy the data
df_min_max_scaled = dataset.copy()
# apply normalization techniques
column = 'Age'
df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())    
  


# apply normalization techniques 
column = 'CT grade Fischer'
df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())    
  


column = 'WFNS grade '
df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())    
  
# view normalized data
display(df_min_max_scaled)

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
0,0.666667,0.0,1.0,0.0,0.000000,0.25,1.0,1.0,3,1.0
1,0.638889,0.0,1.0,0.0,0.000000,0.25,0.0,0.0,5,5.0
2,0.805556,0.0,1.0,0.0,0.333333,0.75,1.0,0.0,5,5.0
3,0.527778,0.0,1.0,1.0,0.000000,0.25,0.0,0.0,5,5.0
4,0.819444,1.0,0.0,0.0,0.333333,0.50,0.0,0.0,5,5.0
...,...,...,...,...,...,...,...,...,...,...
646,0.625000,1.0,0.0,1.0,0.333333,0.75,1.0,1.0,3,4.0
647,0.861111,1.0,1.0,1.0,0.333333,0.25,0.0,0.0,4,5.0
648,0.569444,0.0,0.0,0.0,0.666667,1.00,1.0,1.0,3,3.0
649,1.000000,1.0,0.0,0.0,0.000000,0.25,0.0,0.0,5,5.0


#### Checking the missing values in the data

In [6]:
checking = df_min_max_scaled.describe()
checking

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
count,651.0,651.0,640.0,651.0,651.0,636.0,650.0,651.0,651.0,641.0
mean,0.597628,0.50384,0.642188,0.420891,0.276498,0.507862,0.347692,0.262673,3.894009,3.971919
std,0.167725,0.50037,0.479732,0.494082,0.273025,0.264607,0.476604,0.440425,1.221403,1.259647
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,0.5,0.0,0.0,0.0,0.0,0.25,0.0,0.0,3.0,3.0
50%,0.625,1.0,1.0,0.0,0.333333,0.5,0.0,0.0,4.0,4.0
75%,0.722222,1.0,1.0,1.0,0.333333,0.75,1.0,1.0,5.0,5.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0


#### Removing the columns with the missing data( inbuilt function of "dropna" is used )

In [7]:
print('Original dataset:')
print(dataset)
print('\n')

# Default configuration drops rows having at least 1 missing value
print('dataset after dropping the rows having missing values:')
print(dataset.dropna())

Original dataset:
          Age  Sex  Hypertension  Diabetes  WFNS grade   CT grade Fischer  \
0    0.666667  0.0           1.0       0.0            1               1.0   
1    0.638889  0.0           1.0       0.0            1               1.0   
2    0.805556  0.0           1.0       0.0            2               3.0   
3    0.527778  0.0           1.0       1.0            1               1.0   
4    0.819444  1.0           0.0       0.0            2               2.0   
..        ...  ...           ...       ...          ...               ...   
646  0.625000  1.0           0.0       1.0            2               3.0   
647  0.861111  1.0           1.0       1.0            2               1.0   
648  0.569444  0.0           0.0       0.0            3               4.0   
649  1.000000  1.0           0.0       0.0            1               1.0   
650  0.736111  1.0           0.0       0.0            3               3.0   

     Radiological Vasospasm  Ischemic deficits  Outcome a

#### We observe that only 615 rows have complete data

In [9]:
data_copy = dataset.copy()
data_copy.dropna(inplace = True)

In [10]:
data_copy

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
0,0.666667,0.0,1.0,0.0,1,1.0,1.0,1.0,3,1.0
1,0.638889,0.0,1.0,0.0,1,1.0,0.0,0.0,5,5.0
2,0.805556,0.0,1.0,0.0,2,3.0,1.0,0.0,5,5.0
3,0.527778,0.0,1.0,1.0,1,1.0,0.0,0.0,5,5.0
4,0.819444,1.0,0.0,0.0,2,2.0,0.0,0.0,5,5.0
...,...,...,...,...,...,...,...,...,...,...
646,0.625000,1.0,0.0,1.0,2,3.0,1.0,1.0,3,4.0
647,0.861111,1.0,1.0,1.0,2,1.0,0.0,0.0,4,5.0
648,0.569444,0.0,0.0,0.0,3,4.0,1.0,1.0,3,3.0
649,1.000000,1.0,0.0,0.0,1,1.0,0.0,0.0,5,5.0


In [13]:
data_copy.describe()

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits,Outcome at Discharge,Outcome at 3 months
count,615.0,615.0,615.0,615.0,615.0,615.0,615.0,615.0,615.0,615.0
mean,0.598306,0.492683,0.653659,0.398374,1.837398,2.058537,0.35122,0.269919,3.871545,3.96748
std,0.167412,0.500353,0.476191,0.489962,0.830741,1.061537,0.47774,0.444279,1.223316,1.262946
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
25%,0.5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,3.0
50%,0.625,0.0,1.0,0.0,2.0,2.0,0.0,0.0,4.0,4.0
75%,0.722222,1.0,1.0,1.0,2.0,3.0,1.0,1.0,5.0,5.0
max,1.0,1.0,1.0,1.0,4.0,4.0,1.0,1.0,5.0,5.0


#### Now defining the train dataset

In [14]:
df_train=data_copy.drop(["Outcome at Discharge", "Outcome at 3 months"], axis=1)
df_train_y=data_copy["Outcome at 3 months"]
X=df_train.values
Y=df_train_y.values

In [15]:
Y=Y-1

In [16]:
Y[:5]

array([0., 4., 4., 4., 4.])

#### dividing the train dataset into train and test

In [17]:
# from sklearn.model_selection import train_test_split
# X_train, X_test,Y_train, Y_test = train_test_split(X, Y , test_size=0.1, random_state=1)

In [18]:
# Y_test= Y_test.astype(int)
Y= Y.astype(int)

In [19]:
Y

array([0, 4, 4, 4, 4, 4, 4, 3, 2, 4, 2, 2, 0, 3, 3, 4, 4, 4, 4, 4, 4, 0,
       4, 3, 2, 2, 1, 4, 2, 3, 4, 2, 0, 4, 4, 2, 4, 4, 2, 2, 1, 4, 4, 4,
       4, 3, 3, 4, 3, 3, 3, 4, 3, 1, 4, 2, 2, 4, 4, 4, 3, 4, 3, 2, 4, 3,
       4, 4, 1, 3, 2, 4, 4, 3, 2, 4, 4, 2, 1, 4, 0, 4, 4, 4, 4, 4, 4, 3,
       0, 4, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 3, 2, 2, 3, 1, 4, 2, 3, 4, 2,
       4, 4, 4, 4, 4, 2, 2, 1, 4, 4, 1, 4, 4, 3, 3, 4, 4, 3, 3, 3, 4, 3,
       1, 1, 2, 1, 4, 1, 2, 3, 4, 2, 4, 3, 4, 4, 0, 3, 3, 0, 2, 4, 4, 4,
       3, 2, 4, 4, 2, 1, 4, 2, 2, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 1, 4,
       2, 4, 4, 4, 4, 4, 2, 4, 2, 1, 4, 4, 4, 1, 4, 4, 3, 4, 4, 3, 3, 3,
       4, 3, 1, 2, 2, 1, 1, 2, 3, 4, 0, 2, 4, 4, 4, 3, 0, 4, 4, 4, 3, 0,
       4, 0, 4, 4, 4, 4, 4, 3, 0, 4, 2, 2, 3, 3, 4, 4, 0, 2, 4, 4, 4, 0,
       4, 1, 4, 0, 4, 4, 4, 4, 4, 3, 0, 2, 3, 3, 3, 0, 4, 1, 4, 4, 0, 2,
       1, 4, 0, 4, 4, 4, 4, 4, 4, 3, 4, 4, 2, 0, 3, 3, 4, 4, 4, 4, 2, 4,
       0, 4, 4, 4, 4, 3, 0, 4, 2, 2, 3, 3, 4, 4, 3,

In [20]:
c_y_train=[0]*5
for i in range(len(list(Y))):
    c_y_train[Y[i]]+=1   
c_y_train

[44, 46, 98, 125, 302]

In [25]:
# c_y_test=[0]*5
# for i in range(len(list(Y_test))):
#     c_y_test[Y_test[i]]+=1   
# c_y_test

In [21]:
class_weight = {}
for i in range(len(c_y_train)):
    class_weight[i]= (sum(c_y_train) - c_y_train[i]) / sum(c_y_train)
class_weight

{0: 0.9284552845528455,
 1: 0.9252032520325203,
 2: 0.8406504065040651,
 3: 0.7967479674796748,
 4: 0.5089430894308943}

In [23]:
# X_train[1]

#### Now applying the Logistic Regression model

In [24]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
classifier_LR = LogisticRegression(class_weight=class_weight)
classifier_LR.fit(X,Y)
# Y_pred_LR = classifier_LR.predict(X_test)
# print(confusion_matrix(Y_test, Y_pred_LR))
# print(classification_report(Y_test, Y_pred_LR))
# print("Accuracy: ",accuracy_score(Y_test, Y_pred_LR)*100 ,"%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight={0: 0.9284552845528455, 1: 0.9252032520325203,
                                 2: 0.8406504065040651, 3: 0.7967479674796748,
                                 4: 0.5089430894308943})

### TEST

In [25]:
dataset_test = pd.read_csv('test_sah1.csv', usecols=['Age', "Sex","Hypertension", "Diabetes","WFNS grade ","CT grade Fischer","Radiological Vasospasm", "Ischemic deficits"] )
dataset_test.head()  # To visualize the data
dataset_test.describe

<bound method NDFrame.describe of      Age Sex Hypertension Diabetes  WFNS grade   CT grade Fischer  \
0     62   F            Y        Y            2                 2   
1     55   F            N        N            1                 3   
2     23   F            Y        N            3                 4   
3     54   F            Y        Y            2                 3   
4     29   M            Y        N            1                 2   
..   ...  ..          ...      ...          ...               ...   
135   62   M            Y        N            2                 1   
136   55   M            Y        N            2                 3   
137   40   M            Y        Y            2                 0   
138   55   F            Y        N            1                 1   
139   43   M            Y        N            1                 1   

    Radiological Vasospasm Ischemic deficits  
0                        N                 N  
1                        N                 

In [26]:
dataset_test["Hypertension"] = dataset_test["Hypertension"].map({'Y':1 ,'N':0})
dataset_test["Diabetes"] = dataset_test["Diabetes"].map({'Y':1. ,'N':0.})
dataset_test["Radiological Vasospasm"] = dataset_test["Radiological Vasospasm"].map({'Y':1. ,'N':0.})
dataset_test["Ischemic deficits"] = dataset_test["Ischemic deficits"].map({'Y':1. ,'N':0.})
dataset_test["Sex"] = dataset_test["Sex"].map({'F':1. ,'M':0.})
dataset_test.head() #Checking again

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits
0,62,1.0,1,1.0,2,2,0.0,0.0
1,55,1.0,0,0.0,1,3,0.0,0.0
2,23,1.0,1,0.0,3,4,1.0,1.0
3,54,1.0,1,1.0,2,3,1.0,1.0
4,29,0.0,1,0.0,1,2,1.0,0.0


In [27]:
lst=dataset_test.columns
for i in lst:
    print(i, ":", dataset_test[i].unique())

Age : [62 55 23 54 29 45 43 56 49 70 42 60 46 32 69 41 72 52 65 67 36 47 50 59
 26 71 53 44 63 33 35 38 66 48 51 58 24 39 40 34 57 64 76 31 21 37 61 68
 17]
Sex : [1. 0.]
Hypertension : [1 0]
Diabetes : [1. 0.]
WFNS grade  : [2 1 3 4]
CT grade Fischer : [2 3 4 1 0]
Radiological Vasospasm : [0. 1.]
Ischemic deficits : [0. 1.]


In [28]:
df_test = dataset_test.copy()
  
# apply normalization techniques by Column 1
column = 'CT grade Fischer'
df_test[column] = (df_test[column] - df_test[column].min()) / (df_test[column].max() - df_test[column].min())    
  
# view normalized data
display(df_test)


Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits
0,62,1.0,1,1.0,2,0.50,0.0,0.0
1,55,1.0,0,0.0,1,0.75,0.0,0.0
2,23,1.0,1,0.0,3,1.00,1.0,1.0
3,54,1.0,1,1.0,2,0.75,1.0,1.0
4,29,0.0,1,0.0,1,0.50,1.0,0.0
...,...,...,...,...,...,...,...,...
135,62,0.0,1,0.0,2,0.25,0.0,0.0
136,55,0.0,1,0.0,2,0.75,0.0,0.0
137,40,0.0,1,1.0,2,0.00,0.0,0.0
138,55,1.0,1,0.0,1,0.25,0.0,0.0


In [30]:
column = 'WFNS grade '
df_test[column] = (df_test[column] - df_test[column].min()) / (df_test[column].max() - df_test[column].min())    


column = 'Age'
df_test[column] = (df_test[column] - df_test[column].min()) / (df_test[column].max() - df_test[column].min())    
  
# view normalized data
display(df_test)

Unnamed: 0,Age,Sex,Hypertension,Diabetes,WFNS grade,CT grade Fischer,Radiological Vasospasm,Ischemic deficits
0,0.762712,1.0,1,1.0,0.333333,0.50,0.0,0.0
1,0.644068,1.0,0,0.0,0.000000,0.75,0.0,0.0
2,0.101695,1.0,1,0.0,0.666667,1.00,1.0,1.0
3,0.627119,1.0,1,1.0,0.333333,0.75,1.0,1.0
4,0.203390,0.0,1,0.0,0.000000,0.50,1.0,0.0
...,...,...,...,...,...,...,...,...
135,0.762712,0.0,1,0.0,0.333333,0.25,0.0,0.0
136,0.644068,0.0,1,0.0,0.333333,0.75,0.0,0.0
137,0.389831,0.0,1,1.0,0.333333,0.00,0.0,0.0
138,0.644068,1.0,1,0.0,0.000000,0.25,0.0,0.0


In [31]:
P = df_test.values

In [32]:
y_predict= classifier_LR.predict(P)

In [33]:
y_predict

array([4, 4, 0, 2, 1, 4, 4, 1, 4, 2, 4, 0, 3, 2, 3, 4, 4, 0, 4, 4, 2, 4,
       2, 4, 4, 0, 4, 4, 0, 4, 4, 0, 4, 4, 0, 4, 2, 4, 2, 4, 2, 0, 1, 2,
       4, 4, 0, 4, 1, 1, 1, 4, 0, 2, 1, 2, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4,
       4, 2, 4, 2, 2, 4, 4, 4, 1, 4, 4, 2, 1, 4, 4, 4, 4, 4, 2, 1, 4, 4,
       4, 2, 3, 4, 4, 2, 1, 4, 4, 2, 2, 1, 2, 4, 4, 4, 0, 2, 4, 4, 4, 1,
       1, 4, 4, 4, 4, 1, 1, 0, 4, 4, 2, 0, 2, 1, 4, 3, 1, 2, 2, 3, 4, 2,
       4, 4, 4, 4, 4, 4, 4, 4])

In [39]:
y_predict = y_predict.reshape(y_predict.shape[0],1)
print(y_predict.shape)

(140, 1)


In [40]:
df_test["5 class classification_LR"]= list(y_predict)
df_test.to_csv("./Solution_1.csv")

#### SVM

In [37]:
from sklearn.svm import SVC
classifier_SVC = SVC(gamma = 0.01, C = 100, class_weight = class_weight)#, probability=True)
classifier_SVC.fit(X,Y)
Y_pred_SVC = classifier_SVC.predict(P)
#print(classification_report(Y_test,Y_pred_SVC))
#print("Accuracy: ",accuracy_score(Y_test, Y_pred_SVC)*100, " %")
# submission['Survived'] = Y_pred_SVC
# submission.to_csv('submission.csv',index=False)

In [38]:
Y_pred_SVC

array([4, 4, 0, 2, 1, 4, 4, 1, 4, 2, 4, 0, 1, 2, 1, 4, 4, 0, 4, 4, 2, 4,
       2, 4, 4, 0, 4, 2, 0, 4, 4, 0, 4, 4, 0, 4, 2, 4, 2, 4, 2, 0, 1, 2,
       4, 4, 0, 4, 1, 1, 1, 4, 0, 0, 1, 2, 4, 4, 4, 2, 0, 4, 4, 4, 4, 4,
       4, 2, 4, 2, 2, 4, 4, 4, 1, 4, 4, 2, 1, 4, 4, 4, 4, 4, 2, 1, 4, 4,
       4, 2, 1, 4, 4, 2, 1, 4, 4, 0, 2, 1, 2, 4, 4, 4, 0, 2, 4, 4, 4, 1,
       1, 4, 4, 4, 4, 1, 1, 0, 4, 4, 2, 0, 2, 1, 4, 1, 1, 2, 2, 1, 4, 2,
       4, 4, 4, 4, 4, 4, 4, 4])

In [44]:
df_test["5 class classification_SVM"]= list(Y_pred_SVC)
df_test.to_csv("./Solution_1.csv")

#### KNN

In [45]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
classifier_KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_KNN.fit(X,Y)
Y_pred_KNN = classifier_KNN.predict(P)

In [46]:
Y_pred_KNN

array([4, 4, 0, 0, 2, 3, 4, 4, 4, 0, 4, 0, 1, 1, 2, 4, 4, 0, 4, 4, 0, 4,
       4, 4, 4, 0, 3, 4, 0, 4, 4, 0, 4, 4, 0, 4, 2, 4, 2, 4, 0, 0, 2, 0,
       4, 3, 0, 4, 1, 4, 2, 4, 0, 0, 2, 3, 4, 4, 4, 4, 0, 3, 3, 4, 4, 4,
       4, 0, 4, 0, 0, 4, 4, 3, 2, 4, 4, 2, 1, 4, 4, 3, 4, 4, 3, 1, 3, 4,
       3, 2, 2, 4, 4, 2, 4, 4, 3, 0, 0, 1, 0, 4, 4, 3, 0, 0, 3, 4, 4, 1,
       2, 4, 4, 4, 3, 1, 2, 0, 3, 3, 2, 0, 0, 1, 4, 1, 2, 0, 2, 2, 4, 0,
       4, 3, 3, 4, 4, 3, 4, 4])

In [47]:
df_test["5 class classification_KNN"]= list(Y_pred_KNN)
df_test.to_csv("./Solution_1.csv")