In [5]:
#importing library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
#loading the dataset
df=pd.read_csv('../input/memory-test-on-drugged-islanders-data/Islander_data.csv')
#showing the dataset
df

Unnamed: 0,first_name,last_name,age,Happy_Sad_group,Dosage,Drug,Mem_Score_Before,Mem_Score_After,Diff
0,Bastian,Carrasco,25,H,1,A,63.5,61.2,-2.3
1,Evan,Carrasco,52,S,1,A,41.6,40.7,-0.9
2,Florencia,Carrasco,29,H,1,A,59.7,55.1,-4.6
3,Holly,Carrasco,50,S,1,A,51.7,51.2,-0.5
4,Justin,Carrasco,52,H,1,A,47.0,47.1,0.1
...,...,...,...,...,...,...,...,...,...
193,Jacob,Novak,52,H,3,T,71.3,74.3,3.0
194,Teo,Steiner,41,S,3,T,72.5,70.4,-2.1
195,Alexander,Takahashi,54,S,3,T,30.8,33.1,2.3
196,Alexandere,Takahashi,40,H,3,T,53.6,53.8,0.2


In [7]:
#getting informatin about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   first_name        198 non-null    object 
 1   last_name         198 non-null    object 
 2   age               198 non-null    int64  
 3   Happy_Sad_group   198 non-null    object 
 4   Dosage            198 non-null    int64  
 5   Drug              198 non-null    object 
 6   Mem_Score_Before  198 non-null    float64
 7   Mem_Score_After   198 non-null    float64
 8   Diff              198 non-null    float64
dtypes: float64(3), int64(2), object(4)
memory usage: 14.0+ KB


In [18]:
#creating one_hot encode 
def onehot_encode(df,column):
    df=df.copy()
    dummies=pd.get_dummies(df[column],prefix=column)
    if len(df[column].unique())==2:
        dummies=dummies.drop(dummies.columns[0],axis=1)
        
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

In [11]:
#getting length of unique value in each column
{column:len(x[column].unique()) for column in x.columns}

{'first_name': 139,
 'last_name': 18,
 'age': 45,
 'Happy_Sad_group': 2,
 'Dosage': 3,
 'Drug': 3,
 'Mem_Score_Before': 162,
 'Mem_Score_After': 151,
 'Diff': 142}

In [22]:
#preprocessing 
def preprocess_inputs(df):
    df=df.copy()
    #One hot encode categorical features
    for column in ['first_name','last_name','Happy_Sad_group']:
        df=onehot_encode(df,column)
    #Split df into x and y
    y=df['Drug']
    x=df.drop('Drug',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)
    #scaling x
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.transform(x_test),columns=x.columns,index=x_test.index)
    
    return x_train,x_test,y_train,y_test

In [23]:
#calling the function
x_train,x_test,y_train,y_test=preprocess_inputs(df)
x_train

Unnamed: 0,age,Dosage,Mem_Score_Before,Mem_Score_After,Diff,first_name_Aaron,first_name_Adam,first_name_Ai,first_name_Akane,first_name_Akira,...,last_name_Lopez,last_name_McCarthy,last_name_Morin,last_name_Novak,last_name_Price,last_name_Rodriguez,last_name_Steiner,last_name_Summers,last_name_Takahashi,Happy_Sad_group_S
124,-0.302247,1.206716,0.249183,-0.151850,-0.594735,0.0,-0.121268,0.0,-0.121268,0.0,...,2.761340,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
97,0.909251,0.025675,1.221038,1.038471,-0.092208,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
42,1.428464,0.025675,0.438505,-0.684661,-1.707473,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,3.785939,-0.336011,1.0
17,-1.167603,-1.155366,0.413262,0.698379,0.518003,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,11.704700,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,-1.0
5,-0.215712,-1.155366,0.564720,-0.117841,-0.989577,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,-0.215712,-1.155366,-0.665875,-0.752679,-0.244761,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
137,1.947677,-1.155366,2.432701,2.370498,0.293661,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,-1.0
72,1.082322,-1.155366,0.552099,0.069210,-0.675498,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
140,-0.475319,-1.155366,-0.533349,-1.115444,-1.007525,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0


In [28]:
#storing models into a dictionary

models={"LogisticRegression":LogisticRegression(),
":DecisionTreeClassifier":DecisionTreeClassifier(),
":LinearSVC":LinearSVC(),
":SVC":SVC(),
":MLPClassifier":MLPClassifier(),
":RandomForestClassifier":RandomForestClassifier(),
":GradientBoostingClassifier":GradientBoostingClassifier(),
"XGBClassifier(eval_metrics=":XGBClassifier(eval_metrics='mlogloss'),
"LGBMClassifier":LGBMClassifier(),
":CatBoostClassifier":CatBoostClassifier(verbose=0)}

In [29]:
#traning the models
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+"trained")
    

LogisticRegressiontrained
:DecisionTreeClassifiertrained
:LinearSVCtrained
:SVCtrained
:MLPClassifiertrained
:RandomForestClassifiertrained
:GradientBoostingClassifiertrained
Parameters: { "eval_metrics" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(eval_metrics=trained
LGBMClassifiertrained
:CatBoostClassifiertrained


In [26]:
y_train

124    S
97     S
42     A
17     A
5      A
      ..
133    T
137    T
72     S
140    T
37     A
Name: Drug, Length: 138, dtype: object

In [30]:
#Getting the Results
for name,model in models.items():
    print(name,':{:2f}%'.format(model.score(x_test,y_test)*100))

LogisticRegression :41.666667%
:DecisionTreeClassifier :51.666667%
:LinearSVC :40.000000%
:SVC :43.333333%
:MLPClassifier :36.666667%
:RandomForestClassifier :46.666667%
:GradientBoostingClassifier :50.000000%
XGBClassifier(eval_metrics= :38.333333%
LGBMClassifier :38.333333%
:CatBoostClassifier :48.333333%
