In [19]:
#importing the library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings(action='ignore')



models={'Logistic Regression':LogisticRegression(),
       'K Nearest Neighbors':KNeighborsClassifier(),
       'Decision Tree':DecisionTreeClassifier(),
       'Support Vector Machine':SVC(),
       'Support Vector Machine(Linear)':SVC(),
       'Neural Network': MLPClassifier(),
        'Random Forest':RandomForestClassifier(),
       'XG Boost':XGBClassifier(),
       'Light Gbm': LGBMClassifier(),
       'Cat Boost':CatBoostClassifier()}

# Loading the Dataset

In [20]:
df=pd.read_csv('/kaggle/input/memory-test-on-drugged-islanders-data/Islander_data.csv')
#showing the dataset
df

Unnamed: 0,first_name,last_name,age,Happy_Sad_group,Dosage,Drug,Mem_Score_Before,Mem_Score_After,Diff
0,Bastian,Carrasco,25,H,1,A,63.5,61.2,-2.3
1,Evan,Carrasco,52,S,1,A,41.6,40.7,-0.9
2,Florencia,Carrasco,29,H,1,A,59.7,55.1,-4.6
3,Holly,Carrasco,50,S,1,A,51.7,51.2,-0.5
4,Justin,Carrasco,52,H,1,A,47.0,47.1,0.1
...,...,...,...,...,...,...,...,...,...
193,Jacob,Novak,52,H,3,T,71.3,74.3,3.0
194,Teo,Steiner,41,S,3,T,72.5,70.4,-2.1
195,Alexander,Takahashi,54,S,3,T,30.8,33.1,2.3
196,Alexandere,Takahashi,40,H,3,T,53.6,53.8,0.2


# Checking for Missing Values

In [21]:
df.isna().sum()

first_name          0
last_name           0
age                 0
Happy_Sad_group     0
Dosage              0
Drug                0
Mem_Score_Before    0
Mem_Score_After     0
Diff                0
dtype: int64

# Getting the Statistical Information

In [22]:
df.describe()

Unnamed: 0,age,Dosage,Mem_Score_Before,Mem_Score_After,Diff
count,198.0,198.0,198.0,198.0,198.0
mean,39.530303,1.989899,57.967677,60.922222,2.954545
std,12.023099,0.818504,15.766007,18.133851,10.754603
min,24.0,1.0,27.2,27.1,-40.4
25%,30.0,1.0,46.525,47.175,-3.175
50%,37.0,2.0,54.8,56.75,1.7
75%,48.0,3.0,68.4,73.25,5.925
max,83.0,3.0,110.0,120.0,49.0


# Preprocessing Function

In [23]:
{column:len(df[column].unique()) for column in df.columns}

{'first_name': 139,
 'last_name': 18,
 'age': 45,
 'Happy_Sad_group': 2,
 'Dosage': 3,
 'Drug': 3,
 'Mem_Score_Before': 162,
 'Mem_Score_After': 151,
 'Diff': 142}

In [24]:
def onehot_encode(df,column):
    df=df.copy()
    dummies=pd.get_dummies(df[column],prefix=column)
    if len(df[column].unique())==2:
        dummies=dummies.drop(dummies.columns[0],axis=1)
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

In [30]:
def preprocess_inputs(df):
    df=df.copy()
    
    
    onehot_columns=['first_name','last_name','Happy_Sad_group']
    
    for column in onehot_columns:
        df=onehot_encode(df,column)
    
    
    #splitting between target and features
    
    y=df['Drug']
    y=y.replace({'A':0, 'S':1,'T':2})
    x=df.drop('Drug',axis=1)
    
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)
    
    scaler=StandardScaler()
    
    scaler.fit(x_train)
    
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns)
    
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)
    
    
    return x_train,x_test,y_train,y_test

In [31]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
x_train

Unnamed: 0,age,Dosage,Mem_Score_Before,Mem_Score_After,Diff,first_name_Aaron,first_name_Adam,first_name_Ai,first_name_Akane,first_name_Akira,...,last_name_Lopez,last_name_McCarthy,last_name_Morin,last_name_Novak,last_name_Price,last_name_Rodriguez,last_name_Steiner,last_name_Summers,last_name_Takahashi,Happy_Sad_group_S
0,-0.946166,-0.026491,0.154939,0.154143,0.035695,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,2.976095,-1.029418
1,-0.946166,-0.026491,-0.894387,-0.813238,-0.069315,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,-0.336011,-1.029418
2,-1.206787,-1.245060,0.068524,-0.027575,-0.155233,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,3.100868,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,-0.336011,-1.029418
3,0.965052,-0.026491,1.136367,0.881014,-0.183872,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,-0.336011,0.971423
4,0.096316,-1.245060,-0.091961,-0.150502,-0.126593,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,3.785939,-0.248069,-0.336011,-1.029418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,-0.946166,-1.245060,0.963537,1.495648,1.181262,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,3.785939,-0.248069,-0.336011,0.971423
134,-0.164304,-1.245060,0.494427,-0.209293,-1.138511,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,-0.336011,0.971423
135,1.138799,-1.245060,0.482082,-0.032920,-0.804387,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,-0.336011,0.971423
136,-0.251178,1.192079,0.210491,1.356687,2.097716,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.322490,-0.248069,0.0,-0.085436,-0.121268,-0.085436,-0.264135,-0.248069,-0.336011,-1.029418


# Training the Model

In [32]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name)
    print(model.score(x_test,y_test))

Logistic Regression
0.4166666666666667
K Nearest Neighbors
0.38333333333333336
Decision Tree
0.36666666666666664
Support Vector Machine
0.38333333333333336
Support Vector Machine(Linear)
0.38333333333333336
Neural Network
0.36666666666666664
Random Forest
0.31666666666666665
XG Boost
0.35
Light Gbm
0.3333333333333333
Learning rate set to 0.071525
0:	learn: 1.0777837	total: 57.2ms	remaining: 57.1s
1:	learn: 1.0622449	total: 58.8ms	remaining: 29.3s
2:	learn: 1.0490117	total: 60.5ms	remaining: 20.1s
3:	learn: 1.0316727	total: 62ms	remaining: 15.4s
4:	learn: 1.0147616	total: 63.5ms	remaining: 12.6s
5:	learn: 1.0026400	total: 64.9ms	remaining: 10.7s
6:	learn: 0.9934489	total: 65.8ms	remaining: 9.33s
7:	learn: 0.9847611	total: 67.1ms	remaining: 8.32s
8:	learn: 0.9736933	total: 68.5ms	remaining: 7.54s
9:	learn: 0.9589419	total: 69.9ms	remaining: 6.92s
10:	learn: 0.9494421	total: 71.4ms	remaining: 6.42s
11:	learn: 0.9392697	total: 72.8ms	remaining: 5.99s
12:	learn: 0.9309446	total: 74.3ms	rema