In [145]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

In [146]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,1,67,0,3,0,37.205177,0,12.215677,7.780544,6.43389,...,1,0,6.009376,0,0,0,1,1,0,XXXConfid
1,2,65,1,0,0,35.141843,1,17.111404,6.645284,1.112379,...,0,0,7.519209,0,0,0,0,1,0,XXXConfid
2,3,62,0,1,1,17.875103,0,13.525546,9.585769,4.266008,...,0,0,8.573933,0,0,0,0,0,0,XXXConfid
3,4,67,0,0,1,37.503437,1,19.952014,1.953946,6.797333,...,0,0,6.21753,0,0,0,0,1,0,XXXConfid
4,5,65,1,0,2,29.187863,1,0.533209,8.75957,6.364302,...,1,0,5.193683,1,0,0,0,1,0,XXXConfid


In [147]:
df_train.drop(['DoctorInCharge'],axis = 1, inplace = True)
df_train.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,1,67,0,3,0,37.205177,0,12.215677,7.780544,6.43389,...,9.986441,1,0,6.009376,0,0,0,1,1,0
1,2,65,1,0,0,35.141843,1,17.111404,6.645284,1.112379,...,6.197277,0,0,7.519209,0,0,0,0,1,0
2,3,62,0,1,1,17.875103,0,13.525546,9.585769,4.266008,...,9.572719,0,0,8.573933,0,0,0,0,0,0
3,4,67,0,0,1,37.503437,1,19.952014,1.953946,6.797333,...,2.487042,0,0,6.21753,0,0,0,0,1,0
4,5,65,1,0,2,29.187863,1,0.533209,8.75957,6.364302,...,7.521358,1,0,5.193683,1,0,0,0,1,0


In [148]:
x_train = df_train.drop(['PatientID','Diagnosis'], axis = 1)
y_train = df_train['Diagnosis']

In [149]:
numerical_column = []
for col in df_train.drop(['PatientID'], axis = 1).columns:
    if df_train[col].nunique() > 4:
        numerical_column.append(col)
print(f"Numerical columns:", numerical_column)

Numerical columns: ['Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL']


In [150]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,DoctorInCharge
0,1505,80,0,0,0,30.680705,0,13.278229,4.251053,5.654801,...,0.197431,0,0,4.469644,0,1,0,0,0,XXXConfid
1,1506,89,0,3,1,39.463034,0,9.811292,8.81995,0.43402,...,8.246968,0,0,2.01894,0,1,0,0,1,XXXConfid
2,1507,71,1,1,2,27.241423,0,0.244764,1.943318,4.353874,...,4.046998,0,0,9.940631,0,0,0,0,0,XXXConfid
3,1508,76,0,0,2,18.530132,0,9.621769,1.753789,6.938617,...,5.480984,0,0,3.219415,0,0,0,1,1,XXXConfid
4,1509,65,0,0,2,37.706252,1,5.207424,1.716515,3.480592,...,1.826698,1,0,0.069671,0,1,1,0,1,XXXConfid


In [151]:
df_test.drop(['DoctorInCharge'], axis = 1, inplace = True)
df_test

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
0,1505,80,0,0,0,30.680705,0,13.278229,4.251053,5.654801,...,4.770631,0.197431,0,0,4.469644,0,1,0,0,0
1,1506,89,0,3,1,39.463034,0,9.811292,8.819950,0.434020,...,19.696107,8.246968,0,0,2.018940,0,1,0,0,1
2,1507,71,1,1,2,27.241423,0,0.244764,1.943318,4.353874,...,22.295973,4.046998,0,0,9.940631,0,0,0,0,0
3,1508,76,0,0,2,18.530132,0,9.621769,1.753789,6.938617,...,1.929911,5.480984,0,0,3.219415,0,0,0,1,1
4,1509,65,0,0,2,37.706252,1,5.207424,1.716515,3.480592,...,21.915696,1.826698,1,0,0.069671,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,2145,66,1,1,1,36.221206,1,16.733257,2.221013,1.659351,...,12.225805,6.902869,0,1,7.989620,0,0,0,0,0
641,2146,73,0,0,2,20.079092,0,4.304789,4.021262,3.446462,...,9.885252,8.156956,0,1,9.612101,0,0,0,0,0
642,2147,68,1,0,1,25.355315,0,0.842862,5.769844,0.380685,...,5.505361,9.924721,0,0,6.004801,0,0,0,1,0
643,2148,81,1,0,0,25.853357,0,3.809247,5.197996,9.809091,...,14.843621,5.021272,0,0,9.566590,0,1,0,0,0


In [152]:
x_test = df_test.drop(['PatientID'], axis = 1)

In [153]:
scaler = StandardScaler()
x_train[numerical_column]=scaler.fit_transform(df_train[numerical_column])
x_train

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
0,-0.883590,0,3,0,1.342871,0,0.380556,0.989761,0.509643,-0.169417,...,-1.632182,1.670289,1,0,0.377781,0,0,0,1,1
1,-1.107126,1,0,0,1.055932,1,1.233049,0.597720,-1.302530,0.300851,...,1.069152,0.364413,0,0,0.893577,0,0,0,0,1
2,-1.442429,0,1,1,-1.345284,0,0.608643,1.613163,-0.228602,0.688017,...,-0.942389,1.527706,0,0,1.253896,0,0,0,0,0
3,-0.883590,0,0,1,1.384349,1,1.727684,-1.022348,0.633409,0.356641,...,1.250752,-0.914262,0,0,0.448892,0,0,0,0,1
4,-1.107126,1,0,2,0.227936,1,-1.653711,1.327850,0.485946,-0.462604,...,-0.938217,0.820737,1,0,0.099121,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499,-1.554197,0,0,0,-0.593928,0,0.008433,-0.304845,1.323357,-0.878971,...,0.408622,-0.332323,0,0,-0.747542,0,0,1,0,0
1500,0.345856,1,3,2,0.934660,0,1.454058,-0.601658,-0.858187,1.331300,...,1.738848,0.852950,0,0,-0.979641,1,0,0,0,0
1501,0.792927,1,0,2,-1.675289,0,0.787722,1.008061,-1.353836,0.657377,...,1.485998,0.821186,0,0,-1.253846,0,0,1,0,0
1502,1.351767,1,2,0,-0.670885,0,0.849751,1.338119,-1.242263,1.123169,...,1.088078,-0.000734,0,0,1.312123,0,0,0,0,0


In [154]:
x_test[numerical_column] = scaler.fit_transform(df_test[numerical_column])
x_test

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
0,0.559694,0,0,0,0.381947,0,0.555809,-0.247074,0.187312,1.148669,...,-1.169606,-1.655642,0,0,-0.233511,0,1,0,0,0
1,1.550552,0,3,1,1.590395,0,-0.043136,1.406637,-1.652693,0.319249,...,0.536567,1.154151,0,0,-1.052777,0,1,0,0,1
2,-0.431164,1,1,2,-0.091297,0,-1.695841,-1.082357,-0.271185,0.282072,...,0.833764,-0.311902,0,0,1.595431,0,0,0,0,0
3,0.119313,0,0,2,-1.289970,0,-0.075878,-1.150957,0.639778,0.156936,...,-1.494337,0.188649,0,0,-0.651460,0,0,0,1,1
4,-1.091736,0,0,2,1.348662,1,-0.838496,-1.164448,-0.578963,-0.123755,...,0.790294,-1.086926,1,0,-1.704415,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,-0.981641,1,1,1,1.144319,1,1.152697,-0.981845,-1.220839,0.783012,...,-0.317384,0.684976,0,1,0.943212,0,0,0,0,0
641,-0.210973,0,0,2,-1.076833,0,-0.994435,-0.330246,-0.590992,0.416408,...,-0.584939,1.122731,0,1,1.485604,0,0,0,0,0
642,-0.761450,1,0,1,-0.350826,0,-1.592514,0.302652,-1.671490,-0.763469,...,-1.085617,1.739791,0,0,0.279690,0,0,0,1,0
643,0.669789,1,0,0,-0.282295,0,-1.080044,0.095672,1.651444,-0.177675,...,-0.018135,0.028181,0,0,1.470390,0,1,0,0,0


In [155]:
NB_classifier = GaussianNB()
NB_classifier.fit(x_train, y_train)

In [176]:
val_scores_1 = cross_val_score(NB_classifier, x_train, y_train, cv = 10)
print(f"Cross-validation scores_1 :{val_scores_1}")
print(f"Mean accuracy : {val_scores_1.mean():.2f}")

Cross-validation scores_1 :[0.78807947 0.7615894  0.73509934 0.8013245  0.8        0.81333333
 0.76666667 0.8        0.83333333 0.82      ]
Mean accuracy : 0.79


In [157]:
pred = NB_classifier.predict(x_test)
pred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,

In [158]:
df_test['Diagnosis'] = pd.Series(pred)
df_test

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,1505,80,0,0,0,30.680705,0,13.278229,4.251053,5.654801,...,0.197431,0,0,4.469644,0,1,0,0,0,0
1,1506,89,0,3,1,39.463034,0,9.811292,8.819950,0.434020,...,8.246968,0,0,2.018940,0,1,0,0,1,0
2,1507,71,1,1,2,27.241423,0,0.244764,1.943318,4.353874,...,4.046998,0,0,9.940631,0,0,0,0,0,0
3,1508,76,0,0,2,18.530132,0,9.621769,1.753789,6.938617,...,5.480984,0,0,3.219415,0,0,0,1,1,0
4,1509,65,0,0,2,37.706252,1,5.207424,1.716515,3.480592,...,1.826698,1,0,0.069671,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,2145,66,1,1,1,36.221206,1,16.733257,2.221013,1.659351,...,6.902869,0,1,7.989620,0,0,0,0,0,1
641,2146,73,0,0,2,20.079092,0,4.304789,4.021262,3.446462,...,8.156956,0,1,9.612101,0,0,0,0,0,0
642,2147,68,1,0,1,25.355315,0,0.842862,5.769844,0.380685,...,9.924721,0,0,6.004801,0,0,0,1,0,0
643,2148,81,1,0,0,25.853357,0,3.809247,5.197996,9.809091,...,5.021272,0,0,9.566590,0,1,0,0,0,0
