In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
modelX_df = pd.read_csv('Dementia Prediction Dataset.csv')
original_df = modelX_df

forbidden_medical_variables = [
    'NPSYDEV', 'NPIQINF', 'NPIQINFX', 'NPFORMVER', 'NPSEX', 'NPPMIH', 'NPFIX', 'NPFIXX', 'NPWBRWT',
    'NPWBRF', 'NPGRCCA', 'NPGRLA', 'NPGRHA', 'NPGRSNH', 'NPGRLCH', 'NPTAN', 'NPTANX', 'NPABAN',
    'NPABANX', 'NPASAN', 'NPASANX', 'NPTDPAN', 'NPTDPANX', 'NPHISMB', 'NPHISG', 'NPHISSS',
    'NPHIST', 'NPHISO', 'NPHISOX', 'NPTHAL', 'NPADNC', 'NPLINF', 'NPLAC', 'NPINF', 'NPINF1A',
    'NPINF1B', 'NPINF1D', 'NPINF1F', 'NPINF2A', 'NPINF2B', 'NPINF2D', 'NPINF2F', 'NPINF3A',
    'NPINF3B', 'NPINF3D', 'NPINF3F', 'NPINF4A', 'NPINF4B', 'NPINF4D', 'NPINF4F', 'NPHEM',
    'NPHEMO', 'NPHEMO1', 'NPHEMO2', 'NPHEMO3', 'NPMICRO', 'NPOLD', 'NPOLD1', 'NPOLD2', 'NPOLD3',
    'NPOLD4', 'NPOLDD', 'NPOLDD1', 'NPOLDD2', 'NPOLDD3', 'NPOLDD4', 'NPWMR', 'NPPATH', 'NPPATH2',
    'NPPATH3', 'NPPATH4', 'NPPATH5', 'NPPATH6', 'NPPATH7', 'NPPATH8', 'NPPATH9', 'NPPATH10',
    'NPPATH11', 'NPPATHO', 'NPPATHOX', 'NPART', 'NPOANG', 'NPLBOD', 'NPNLOSS', 'NPHIPSCL', 'NPSCL',
    'NPFTDTAU', 'NPFTDT2', 'NPFTDT5', 'NPFTDT6', 'NPFTDT7', 'NPFTDT8', 'NPFTDT9', 'NPFTDT10',
    'NPFRONT', 'NPTAU', 'NPFTD', 'NPFTDTDP', 'NPALSMND', 'NPOFTD', 'NPOFTD1', 'NPOFTD2', 'NPOFTD3',
    'NPOFTD4', 'NPOFTD5', 'NPFTDNO', 'NPFTDSPC', 'NPTDPA', 'NPTDPB', 'NPTDPC', 'NPTDPD', 'NPTDPE',
    'NPPDXA', 'NPPDXB', 'NPPDXD', 'NPPDXE', 'NPPDXF', 'NPPDXG', 'NPPDXH', 'NPPDXI', 'NPPDXJ',
    'NPPDXK', 'NPPDXL', 'NPPDXM', 'NPPDXN', 'NPPDXP', 'NPPDXQ', 'NPBNKB', 'NPBNKF', 'NPFAUT',
    'NPFAUT1', 'NPFAUT2', 'NPFAUT3', 'NPFAUT4', 'NPNIT', 'NPCERAD', 'NPADRDA', 'NPOCRIT', 'NPVOTH',
    'NPLEWYCS', 'NPGENE', 'NPFHSPEC', 'NPTAUHap', 'NPPRNP', 'NPCHROM', 'NPPNORM', 'NPCNORM',
    'NPPADP', 'NPCADP', 'NPPAD', 'NPCAD', 'NPPLEWY', 'NPCLEWY', 'NPPVASC', 'NPCVASC', 'NPPFTLD',
    'NPCFTLD', 'NPPHIPP', 'NPCHIPP', 'NPPPRION', 'NPCPRION', 'NPPOTH1', 'NPCOTH1', 'NPOTH1X',
    'NPPOTH2', 'NPCOTH2', 'NPOTH2X', 'NPPOTH3', 'NPCOTH3', 'NPOTH3X', 'NPARTAG', 'NPATGSEV',
    'NPATGAMY', 'NPATGAM1', 'NPATGAM2', 'NPATGAM3', 'NPATGAM4', 'NPATGAM5', 'NPATGFRN', 'NPATGFR1',
    'NPATGFR2', 'NPATGFR3', 'NPATGFR4',
    'WHODIDDX', 'DXMETHOD', 'NORMCOG', 'DEMENTED', 'NACCUDSD', 'AMNDEM', 'PCA', 'NACCPPA',
    'NACCPPAG', 'NACCPPME', 'NACCBVFT', 'NACCLBDS', 'NAMNDEM', 'NACCTMCI', 'NACCMCIL',
    'NACCMCIA', 'NACCMCIE', 'NACCMCIV', 'IMPNOMCI', 'AMYLPET', 'AMYLCSF', 'FDGAD',
    'HIPPATR', 'TAUPETAD', 'CSFTAU', 'FDGFTLD', 'TPETFTLD', 'MRFTLD', 'DATSCAN',
    'OTHBIOM', 'OTHBIOMX', 'IMAGLINF', 'IMAGLAC', 'IMAGMACH', 'IMAGMICH', 'IMAGMWMH',
    'IMAGEWMH', 'OTHMUT', 'OTHMUTX', 'NACCALZD', 'NACCALZP', 'PROBAD', 'PROBADIF',
    'POSSAD', 'POSSADIF', 'NACCLBDE', 'NACCLBDP', 'PARK', 'MSA', 'MSAIF', 'PSP',
    'PSPIF', 'CORT', 'CORTIF', 'FTLDMO', 'FTLDMOIF', 'FTLDNOS', 'FTLDNOIF', 'FTD',
    'FTDIF', 'PPAPH', 'PPAPHIF', 'FTLDSUBT', 'FTLDSUBX', 'CVD', 'CVDIF', 'PREVSTK',
    'STROKDEC', 'STKIMAG', 'INFNETW', 'INFWMH', 'VASC', 'VASCIF', 'VASCPS', 'VASCPSIF',
    'STROKE', 'STROKIF', 'ESSTREM', 'ESSTREIF', 'DOWNS', 'DOWNSIF', 'HUNT', 'HUNTIF',
    'PRION', 'PRIONIF', 'BRNINJ', 'BRNINJIF', 'BRNINCTE', 'HYCEPH', 'HYCEPHIF', 'EPILEP',
    'EPILEPIF', 'NEOP', 'NEOPIF', 'NEOPSTAT', 'HIV', 'HIVIF', 'OTHCOG', 'OTHCOGIF',
    'OTHCOGX', 'DEP', 'DEPIF', 'DEPTREAT', 'BIPOLDX', 'BIPOLDIF', 'SCHIZOP', 'SCHIZOIF',
    'ANXIET', 'ANXIETIF', 'DELIR', 'DELIRIF', 'PTSDDX', 'PTSDDXIF', 'OTHPSY', 'OTHPSYIF',
    'OTHPSYX', 'ALCDEM', 'ALCDEMIF', 'ALCABUSE', 'IMPSUB', 'IMPSUBIF', 'DYSILL', 'DYSILLIF',
    'MEDS', 'MEDSIF', 'DEMUN', 'DEMUNIF', 'COGOTH', 'COGOTHIF', 'COGOTHX', 'COGOTH2',
    'COGOTH2F', 'COGOTH2X', 'COGOTH3', 'COGOTH3F', 'COGOTH3X', 'NACCNORM', 'NACCIDEM',
    'NACCMCII', 'NACCADMU', 'NACCFTDM', 'NACCETPR', 'CANCER', 'CANCSITE', 'DIABET',
    'MYOINF', 'CONGHRT', 'AFIBRILL', 'HYPERT', 'ANGINA', 'HYPCHOL', 'VB12DEF', 'THYDIS',
    'ARTH', 'ARTYPE', 'ARTYPEX', 'ARTUPEX', 'ARTHLOEX', 'ARTHSPIN', 'ARTUNKN', 'URINEINC',
    'BOWLINC', 'SLEEPAP', 'REMDIS', 'HYPOSOM', 'SLEEPOTH', 'SLEEPOTX', 'ANGIOCP',
    'ANGIOPCI', 'PACEMAKE', 'HVALVE', 'ANTIENC', 'ANTIENCX', 'OTHCOND', 'OTHCONDX',
    'MMSECOMP', 'MMSELOC', 'MMSELAN', 'MMSELANX', 'MMSEVIS', 'MMSEHEAR', 'MMSEORDA',
    'MMSEORLO', 'PENTAGON', 'NACCMMSE', 'NPSYCLOC', 'NPSYLAN', 'NPSYLANX', 'LOGIMO',
    'LOGIDAY', 'LOGIYR', 'LOGIPREV', 'LOGIMEM', 'MEMUNITS', 'MEMTIME', 'UDSBENTC',
    'UDSBENTD', 'UDSBENRS', 'DIGIF', 'DIGIFLEN', 'DIGIB', 'DIGIBLEN', 'ANIMALS', 'VEG',
    'TRAILA', 'TRAILARR', 'TRAILALI', 'TRAILB', 'TRAILBRR', 'TRAILBLI', 'WAIS', 'BOSTON',
    'UDSVERFC', 'UDSVERFN', 'UDSVERNF', 'UDSVERLC', 'UDSVERLR', 'UDSVERLN', 'UDSVERTN',
    'UDSVERTE', 'UDSVERTI', 'COGSTAT', 'NACCC1', 'MOCACOMP', 'MOCAREAS', 'MOCALOC',
    'MOCALAN', 'MOCALANX', 'MOCAVIS', 'MOCAHEAR', 'MOCATOTS', 'NACCMOCA', 'MOCATRAI',
    'MOCACUBE', 'MOCACLOC', 'MOCACLON', 'MOCACLOH', 'MOCANAMI', 'MOCAREGI', 'MOCADIGI',
    'MOCALETT', 'MOCASER7', 'MOCAREPE', 'MOCAFLUE', 'MOCAABST', 'MOCARECN', 'MOCARECC',
    'MOCARECR', 'MOCAORDT', 'MOCAORMO', 'MOCAORYR', 'MOCAORDY', 'MOCAORPL', 'MOCAORCT',
    'CRAFTVRS', 'CRAFTURS', 'DIGFORCT', 'DIGFORSL', 'DIGBACCT', 'DIGBACLS', 'CRAFTDVR',
    'CRAFTDRE', 'CRAFTDTI', 'CRAFTCUE', 'MINTTOTS', 'MINTTOTW', 'MINTSCNG', 'MINTSCNC',
    'MINTPCNG', 'MINTPCNC', 'NACCC2', 'MODCOMM', 'MOCBTOTS', 'NACCMOCB', 'REY1REC',
    'REY1INT', 'REY2REC', 'REY2INT', 'REY3REC', 'REY3INT', 'REY4REC', 'REY4INT',
    'REY5REC', 'REY5INT', 'REY6REC', 'REY6INT', 'OTRAILA', 'OTRLARR', 'OTRLALI',
    'OTRAILB', 'OTRLBRR', 'OTRLBLI', 'REYDREC', 'REYDINT', 'REYTCOR', 'REYFPOS',
    'VNTTOTW', 'VNTPCNC', 'RESPVAL', 'RESPHEAR', 'RESPDIST', 'RESPINTR', 'RESPDISN',
    'RESPFATG', 'RESPEMOT', 'RESPASST', 'RESPOTH', 'RESPOTHX',
    'B9CHG', 'DECSUB', 'DECIN', 'DECCLIN', 'DECCLCOG', 'COGMEM', 'COGORI', 'COGJUDG',
    'COGLANG', 'COGVIS', 'COGATTN', 'COGFLUC', 'COGFLAGO', 'COGOTHR', 'COGOTHRX',
    'NACCCOGF', 'NACCCGFX', 'COGMODE', 'COGMODEX', 'DECAGE', 'DECCLBE', 'BEAPATHY',
    'BEDEP', 'BEVHALL', 'BEVWELL', 'BEVHAGO', 'BEAHALL', 'BEDEL', 'BEDISIN', 'BEIRRIT',
    'BEAGIT', 'BEPERCH', 'BEREM', 'BEREMAGO', 'BEANX', 'BEOTHR', 'BEOTHRX', 'NACCBEHF',
    'NACCBEFX', 'BEMODE', 'BEMODEX', 'BEAGE', 'DECCLMOT', 'MOGAIT', 'MOFALLS', 'MOTREM',
    'MOSLOW', 'NACCMOTF', 'MOMODE', 'MOMODEX', 'MOMOPARK', 'PARKAGE', 'MOMOALS',
    'ALSAGE', 'MOAGE', 'COURSE', 'FRSTCHG', 'LBDEVAL', 'FTLDEVAL',
    'NACCNREX', 'NORMEXAM', 'FOCLDEF', 'GAITDIS', 'EYEMOVE', 'PARKSIGN', 'RESTTRL',
    'RESTTRR', 'SLOWINGL', 'SLOWINGR', 'RIGIDL', 'RIGIDR', 'BRADY', 'PARKGAIT',
    'POSTINST', 'CVDSIGNS', 'CORTDEF', 'SIVDFIND', 'CVDMOTL', 'CVDMOTR', 'CORTVISL',
    'CORTVISR', 'SOMATL', 'SOMATR', 'POSTCORT', 'PSPCBS', 'EYEPSP', 'DYSPSP', 'AXIALPSP',
    'GAITPSP', 'APRAXSP', 'APRAXL', 'APRAXR', 'CORTSENL', 'CORTSENR', 'ATAXL', 'ATAXR',
    'ALIENLML', 'ALIENLMR', 'DYSTONL', 'DYSTONR', 'MYOCLLT', 'MYOCLRT', 'ALSFIND',
    'GAITNPH', 'OTHNEUR', 'OTHNEURX',
    'TRESTFAC', 'TRESTFAX', 'TRESTRHD', 'TRESTRHX', 'TRESTLHD', 'TRESTLHX', 'TRESTRFT',
    'TRESTRFX', 'TRESTLFT', 'TRESTLFX', 'TRACTRHD', 'TRACTRHX', 'TRACTLHD', 'TRACTLHX',
    'RIGDNECK', 'RIGDNEX', 'RIGDUPRT', 'RIGDUPRX', 'RIGDUPLF', 'RIGDUPLX', 'RIGDLORT',
    'RIGDLORX', 'RIGDLOLF', 'RIGDLOLX', 'TAPSRT', 'TAPSRTX', 'TAPSLF', 'TAPSLFX',
    'HANDMOVR', 'HANDMVRX', 'HANDMOVL', 'HANDMVLX', 'HANDALTR', 'HANDATRX', 'HANDALTL',
    'HANDATLX', 'LEGRT', 'LEGRTX', 'LEGLF', 'LEGLFX', 'ARISING', 'ARISINGX', 'POSTURE',
    'POSTUREX', 'GAIT', 'GAITX', 'POSSTAB', 'POSSTABX', 'BRADYKIN', 'BRADYKIX', 'MEMORY',
    'ORIENT', 'JUDGMENT', 'COMMUN', 'HOMEHOBB', 'PERSCARE', 'CDRSUM', 'CDRGLOB',
    'COMPORT', 'CDRLANG', 'DEL', 'DELSEV', 'HALL', 'HALLSEV', 'AGIT', 'AGITSEV', 'DEPD',
    'DEPDSEV', 'ANX', 'ANXSEV', 'ELAT', 'ELATSEV', 'APA', 'APASEV', 'DISN', 'DISNSEV',
    'IRR', 'IRRSEV', 'MOT', 'MOTSEV', 'NITE', 'NITESEV', 'APP', 'APPSEV', 'NOGDS',
    'SATIS', 'DROPACT', 'EMPTY', 'BORED', 'SPIRITS', 'AFRAID', 'HAPPY', 'HELPLESS',
    'STAYHOME', 'MEMPROB', 'WONDRFUL', 'WRTHLESS', 'ENERGY', 'HOPELESS', 'BETTER',
    'NACCGDS', 'BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP', 'EVENTS',
    'PAYATTN', 'REMDATES', 'TRAVEL',
    'NACCAM', 'NACCAMX', 'NACCAMS', 'NACCAMSX',
    'NACCFM', 'NACCFMX', 'NACCFMS', 'NACCFMSX',
    'NACCOM', 'NACCOMX', 'NACCOMS', 'NACCOMSX',
    'NACCFTD', 'NACCLBDM', 'TBI', 'TBIYEAR', 'CVHATT', 'HATTMULT', 'HATTYEAR', 'CVAFIB',
    'CBSTROKE', 'HXHYPER', 'HXSTROKE', 'PD', 'B12DEF', 'THYROID', 'BPSYSL', 'BPSYSR',
    'BPDIASL', 'BPDIASR', 'BPSYS', 'BPDIAS', 'NACCMRSA', 'NACCNMRI', 'NACCAPET', 'NACCNAPA',
    'NACCAPSA', 'NACCACSF', 'NACCPCSF', 'NACCTCSF', 'NACCAUTP', 'HEIGHT', 'WEIGHT',
    'NACCBMI', 'ANYMEDS', 'NACCAHTN', 'ADMUT', 'FTLDMUT', 'NACCFADM', 'NACCFFTD',
    'NACCREAS', 'INDEPEND',
    'CVANGIO', 'CVBYPASS', 'CVPACDEF', 'CVPACE', 'CVCHF', 'CVANGINA', 'CVHVALVE', 'CVOTHR', 'CVOTHRX',
    'STROKMUL', 'NACCSTYR', 'CBTIA', 'TIAMULT', 'NACCTIYR',
    'PDYR', 'PDOTHR', 'PDOTHRYR', 'SEIZURES',
    'NACCTBI', 'TBIBRIEF', 'TRAUMBRF', 'TBIEXTEN', 'TRAUMEXT', 'TBIWOLOS', 'TRAUMCHR', 'NCOTHR', 'NCOTHRX',
    'DIABETES', 'DIABTYPE', 'HYPERTEN', 'HYPERCHO',
    'ARTHRIT', 'ARTHTYPE', 'ARTHTYPX', 'ARTHUPEX', 'ARTHLOEX', 'ARTHSPIN', 'ARTUNKN',
    'INCONTU', 'INCONTF',
    'APNEA', 'RBD', 'INSOMN', 'OTHSLEEP', 'OTHSLEEX',
    'ALCOHOL', 'ABUSOTHR', 'ABUSX',
    'PTSD', 'BIPOLAR', 'SCHIZ', 'DEP2YRS', 'DEPOTHR', 'ANXIETY', 'OCD', 'PSYCDIS', 'PSYCDISX',
    'HRATE', 'VISION', 'VISCORR', 'VISWCORR', 'HEARING', 'HEARAID', 'HEARWAID',
    'ABRUPT', 'STEPWISE', 'SOMATIC', 'EMOT', 'FOCLSYM', 'FOCLSIGN', 'HACHIN', 'CVDCOG', 'STROKCOG',
    'CVDIMAG', 'CVDIMAG1', 'CVDIMAG2', 'CVDIMAG3', 'CVDIMAG4', 'CVDIMAGX',
    'PDNORMAL', 'SPEECH', 'SPEECHX', 'FACEXP', 'FACEXPX',
    'NACCAAAS', 'NACCAANX', 'NACCAC', 'NACCACEI', 'NACCADEP', 'NACCADMD', 'NACCAMD', 'NACCANGI',
    'NACCAPSY', 'NACCBETA', 'NACCCCBS', 'NACCDBMD', 'NACCDIUR', 'NACCEMD', 'NACCEPMD', 'NACCHTNC',
    'NACCLIPL', 'NACCNSD', 'NACCPDMD', 'NACCVASD',
    'ADGCGWAS', 'ADGCEXOM', 'ADGCRND', 'ADGCEXR', 'NGDSGWAS', 'NGDSEXOM', 'NGDSWGS', 'NGDSWES',
    'NGDSGWAC', 'NGDSEXAC', 'NGDSWGAC', 'NGDSWEAC', 'NACCNCRD', 'NACCAPOE', 'NACCNE4S',
    'NACCBRNN', 'NACCAVAS', 'NACCBRAA', 'NACCNEUR', 'NACCDIFF', 'NACCVASC', 'NACCAMY', 'NACCINF',
    'NACCMICR', 'NACCHEM', 'NACCARTE', 'NACCNEC', 'NACCLEWY', 'NACCPICK', 'NACCCBD', 'NACCPROG',
    'NACCPRIO', 'NACCDOWN', 'NACCOTHP', 'NACCWRI1', 'NACCWRI2', 'NACCWRI3',
    'NACCBNKF', 'NACCFORM', 'NACCPARA', 'NACCCSFP', 'NACCDAGE', 'NACCINT',
    'DRUG1', 'DRUG2', 'DRUG3', 'DRUG4', 'DRUG5', 'DRUG6', 'DRUG7', 'DRUG8', 'DRUG9', 'DRUG10',
    'DRUG11', 'DRUG12', 'DRUG13', 'DRUG14', 'DRUG15', 'DRUG16', 'DRUG17', 'DRUG18', 'DRUG19', 'DRUG20',
    'DRUG21', 'DRUG22', 'DRUG23', 'DRUG24', 'DRUG25', 'DRUG26', 'DRUG27', 'DRUG28', 'DRUG29', 'DRUG30',
    'DRUG31', 'DRUG32', 'DRUG33', 'DRUG34', 'DRUG35', 'DRUG36', 'DRUG37', 'DRUG38', 'DRUG39', 'DRUG40',
    'LANGA1', 'LANGA2', 'LANGA3', 'LANGA4', 'LANGA5', 'LANGB1', 'LANGB4', 'LANGB5',
    'LANGB6', 'LANGB7', 'LANGB8', 'LANGB9', 'LANGC2', 'LANGD1', 'LANGD2', 'LANGB3F',
    'LANGB9F', 'LANGC1F', 'LANGC2F', 'LANGC3F', 'LANGC4F', 'LANGC5F', 'LANGC6F',
    'LANGE2F', 'LANGE3F', 'LANGCLS', 'CLSSUB'
]
forbidden_medical_variables = list(set(forbidden_medical_variables))

modelX_df_cleaned = original_df.drop(columns=forbidden_medical_variables, errors='ignore').copy()
modelX_df_final = modelX_df_cleaned.drop(columns=['NACCREFR'], errors='ignore')



  modelX_df = pd.read_csv('Dementia Prediction Dataset.csv')


In [6]:
modelX_df_final['AGE'] = modelX_df_final['VISITYR'] - modelX_df_final['BIRTHYR']

categorical_features = [
    'SEX', 'MARISTAT', 'NACCLIVS', 'RESIDENC', 'HANDED', 'HISPANIC', 'RACE',
    'PRIMLANG', 'INSEX', 'INRELTO'
]
categorical_features = [col for col in categorical_features if col in modelX_df_final.columns]

columns_to_drop = [
    'NACCID', 'NACCADC', 'PACKET', 'FORMVER', 'VISITMO', 'VISITDAY', 'NACCVNUM',
    'NACCAVST', 'NACCNVST', 'NACCDAYS', 'NACCFDYS', 'NACCCORE',
    'VISITYR', 'BIRTHYR', 'BIRTHMO',
    'HISPORX', 'RACEX', 'RACESECX', 'RACETERX', 'PRIMLANX', 'INHISPOX',
    'INRACEX', 'INRASECX', 'INRATERX', 'INRELTOX',
    'HISPOR', 'RACESEC', 'RACETER',
    'INBIRMO', 'INBIRYR', 'NEWINF', 'INHISP', 'INHISPOR', 'NACCNINR',
    'INRACE', 'INRASEC', 'INRATER',
    'NACCMOD', 'NACCYOD', 'NACCDSMO', 'NACCDSDY', 'NACCDSYR',
    'NACCNRMO', 'NACCNRDY', 'NACCNRYR'
]

modelX_df_engineered = pd.get_dummies(modelX_df_final, columns=categorical_features, dummy_na=True)
modelX_df_engineered = modelX_df_engineered.drop(columns=columns_to_drop, errors='ignore')

special_codes = [-4, 8, 9, 88, 99, 888, 999, 8888, 9999]
modelX_df_engineered.replace(special_codes, np.nan, inplace=True)

cols_to_impute_zero = [
    'INEDUC', 'INKNOWN', 'INLIVWTH', 'INVISITS', 'INCALLS', 'INRELY',
    'TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK',
    'ALCOCCAS', 'ALCFREQ'
]
cols_to_impute_zero = [col for col in cols_to_impute_zero if col in modelX_df_engineered.columns]
modelX_df_engineered[cols_to_impute_zero] = modelX_df_engineered[cols_to_impute_zero].fillna(0)

cols_to_impute_median = [
    'AGE', 'EDUC', 'NACCAGEB', 'NACCNIHR', 'NACCAGE', 'NACCDIED', 'NACCACTV',
    'NACCNOVS', 'NACCNURP', 'NACCMDSS', 'NACCPAFF', 'TELCOV', 'TELMOD',
    'NACCFAM', 'NACCMOM', 'NACCDAD'
]
cls_cols = ['NACCSPNL', 'NACCENGL', 'APREFLAN', 'AYRSPAN', 'AYRENGL',
            'APCSPAN', 'APCENGL', 'ASPKSPAN', 'AREASPAN', 'AWRISPAN',
            'AUNDSPAN', 'ASPKENGL', 'AREAENGL', 'AWRIENGL', 'AUNDENGL']
cols_to_impute_median = [col for col in cols_to_impute_median + cls_cols if col in modelX_df_engineered.columns]

if cols_to_impute_median:
    missing_counts = modelX_df_engineered[cols_to_impute_median].isnull().sum()
    if missing_counts.sum() > 0:
        imputer = SimpleImputer(strategy='median')
        modelX_df_engineered[cols_to_impute_median] = imputer.fit_transform(modelX_df_engineered[cols_to_impute_median])

modelX_df_100_PERCENT_FINAL = modelX_df_engineered

current_df = modelX_df_100_PERCENT_FINAL

final_forbidden_cols = ['ARTHUNK', 'ARTLOEX', 'ARTSPIN', 'NPTAUHAP']
df_clean_92 = current_df.drop(columns=final_forbidden_cols, errors='ignore')
modelX_df_100_PERCENT_FINAL = df_clean_92

In [7]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score, accuracy_score



X_features = modelX_df_100_PERCENT_FINAL
y_target = modelX_df['DEMENTED']
valid_indices = y_target.isin([0, 1])
X_final = X_features[valid_indices].copy()
y_final = y_target[valid_indices].copy()

df_for_viz = X_final.copy()
df_for_viz['DEMENTED'] = y_final

X = X_final
y = y_final

scale_pos_weight = (y == 0).sum() / (y == 1).sum()

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [5, 10]
}

xgb = XGBClassifier(random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight)
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 8],
    'learning_rate': [0.1, 0.2],
    'subsample': [0.8, 1.0]
}

lgbm = LGBMClassifier(random_state=42, n_jobs=-1, scale_pos_weight=scale_pos_weight, verbosity=-1)
lgbm_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 8],
    'learning_rate': [0.1, 0.2],
    'num_leaves': [20, 30]
}

models = [
    ("RandomForest", rf, rf_params),
    ("XGBoost", xgb, xgb_params),
    ("LightGBM", lgbm, lgbm_params)
]

In [10]:

best_estimators = {}
for name, model, params in models:
    print(f"\n RUNNING: {name} ")
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=10,
        cv=3,
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    search.fit(X_train, y_train)
    best_estimators[name] = search.best_estimator_
    print(f" Best F1-Score for {name} (on train data): {search.best_score_:.4f} ")


for name, model in best_estimators.items():
    print(f"\n RESULTS FOR: {name} ")
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]


    acc = accuracy_score(y_test, y_pred)
    auroc = roc_auc_score(y_test, y_pred_proba)


    print(classification_report(y_test, y_pred, target_names=['0 (No Dementia)', '1 (Dementia)']))
    print(f"Accuracy Score: {acc:.4f}")
    print(f"Area Under ROC Curve (AUROC): {auroc:.4f}")


--- RUNNING: RandomForest ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits
--- Best F1-Score for RandomForest (on train data): 0.6045 ---

--- RUNNING: XGBoost ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits
--- Best F1-Score for XGBoost (on train data): 0.6943 ---

--- RUNNING: LightGBM ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits
--- Best F1-Score for LightGBM (on train data): 0.6723 ---

--- RESULTS FOR: RandomForest ---
                 precision    recall  f1-score   support

0 (No Dementia)       0.83      0.95      0.88      5195
   1 (Dementia)       0.80      0.49      0.61      2056

       accuracy                           0.82      7251
      macro avg       0.81      0.72      0.75      7251
   weighted avg       0.82      0.82      0.81      7251

Accuracy Score: 0.8213
Area Under ROC Curve (AUROC): 0.8679

--- RESULTS FOR: XGBoost ---
                 precision    recall  f1-score   support

0 (No Dementia)       0.8