In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df2015 = pd.read_sas("LLCP2015.XPT")
print(df2015.shape)
df2015.head(5)

(441456, 330)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENUM,...,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3
0,1.0,1.0,b'01292015',b'01',b'29',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
1,1.0,1.0,b'01202015',b'01',b'20',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,2.0,2.0,3.0,3.0,4.0,2.0,2.0,,,2.0
2,1.0,1.0,b'02012015',b'02',b'01',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,
3,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,9.0
4,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0


### clean data a bit

In [3]:
# filter 5 features: HighBP, GenHlth, BMI, Age, HighChol
brfss_df = df2015[['_RFHYPE5', 'GENHLTH', '_BMI5', '_AGEG5YR', 'TOLDHI2', 'DIABETE3']]
print(brfss_df.shape)
brfss_df.head(5)

(441456, 6)


Unnamed: 0,_RFHYPE5,GENHLTH,_BMI5,_AGEG5YR,TOLDHI2,DIABETE3
0,2.0,5.0,4018.0,9.0,1.0,3.0
1,1.0,3.0,2509.0,7.0,2.0,3.0
2,1.0,4.0,2204.0,11.0,1.0,3.0
3,2.0,5.0,2819.0,9.0,1.0,3.0
4,1.0,5.0,2437.0,9.0,2.0,3.0


In [4]:
# drop missing data
brfss_df_selected = brfss_df.dropna()
brfss_df_selected.shape

(351939, 6)

In [5]:
# DIABETE3
# going to make this ordinal. 0 is for no diabetes or only during pregnancy, 1 is for pre-diabetes or borderline diabetes or yes diabetes
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss_df_selected['DIABETE3'] = brfss_df_selected['DIABETE3'].replace({2:0, 3:0, 4:1})
brfss_df_selected = brfss_df_selected[brfss_df_selected.DIABETE3 != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.DIABETE3 != 9]
brfss_df_selected.DIABETE3.unique()

array([0., 1.])

In [6]:
# _RFHYPE5
#Change 1 to 0 so it represetnts No high blood pressure and 2 to 1 so it represents high blood pressure
brfss_df_selected['_RFHYPE5'] = brfss_df_selected['_RFHYPE5'].replace({1:0, 2:1})
brfss_df_selected = brfss_df_selected[brfss_df_selected._RFHYPE5 != 9]
brfss_df_selected._RFHYPE5.unique()

array([1., 0.])

In [7]:
# TOLDHI2
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)
brfss_df_selected['TOLDHI2'] = brfss_df_selected['TOLDHI2'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.TOLDHI2 != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.TOLDHI2 != 9]
brfss_df_selected.TOLDHI2.unique()

array([1., 0.])

In [8]:
#_BMI5 (no changes, just note that these are BMI * 100. So for example a BMI of 4018 is really 40.18)
brfss_df_selected['_BMI5'] = brfss_df_selected['_BMI5'].div(100).round(0)
brfss_df_selected._BMI5.unique()

array([40., 25., 22., 28., 24., 27., 34., 30., 26., 23., 31., 33., 21.,
       38., 20., 19., 32., 46., 41., 37., 36., 29., 35., 18., 54., 45.,
       39., 47., 16., 43., 55., 49., 42., 17., 48., 44., 50., 59., 15.,
       52., 53., 57., 51., 14., 58., 63., 61., 56., 60., 74., 62., 64.,
       13., 66., 73., 65., 68., 85., 71., 84., 67., 70., 82., 79., 92.,
       72., 88., 96., 81., 12., 77., 95., 69., 75., 91., 76., 87., 89.,
       83., 98., 86., 80., 90., 78., 97.])

In [9]:
# GENHLTH
# This is an ordinal variable that I want to keep (1 is Excellent -> 5 is Poor)
# Remove 7 and 9 for don't know and refused
brfss_df_selected = brfss_df_selected[brfss_df_selected.GENHLTH != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.GENHLTH != 9]
brfss_df_selected.GENHLTH.unique()

array([5., 3., 4., 2., 1.])

In [10]:
# _AGEG5YR
# already ordinal. 1 is 18-24 all the way up to 13 wis 80 and older. 5 year increments.
# remove 14 because it is don't know or missing
brfss_df_selected = brfss_df_selected[brfss_df_selected._AGEG5YR != 14]
brfss_df_selected._AGEG5YR.unique()

array([ 9.,  7., 11., 13., 10., 12.,  8.,  4.,  6.,  2.,  5.,  1.,  3.])

In [11]:
brfss_df_selected.shape

(344940, 6)

In [12]:
# sample 20%
data = brfss_df_selected.sample(frac =.2)
print(data.shape)
data.head(5)

(68988, 6)


Unnamed: 0,_RFHYPE5,GENHLTH,_BMI5,_AGEG5YR,TOLDHI2,DIABETE3
4119,0.0,3.0,24.0,10.0,0.0,0.0
314226,1.0,4.0,30.0,12.0,0.0,0.0
372081,0.0,1.0,27.0,8.0,0.0,0.0
76649,0.0,2.0,22.0,10.0,0.0,0.0
95104,0.0,2.0,28.0,11.0,0.0,0.0


In [13]:
data.rename(columns = {'_RFHYPE5':'HighBP', '_BMI5':'BMI', '_AGEG5YR': 'Age', 'TOLDHI2':'HighChol', 'DIABETE3': 'D'}, inplace = True)
data.head(5)

Unnamed: 0,HighBP,GENHLTH,BMI,Age,HighChol,D
4119,0.0,3.0,24.0,10.0,0.0,0.0
314226,1.0,4.0,30.0,12.0,0.0,0.0
372081,0.0,1.0,27.0,8.0,0.0,0.0
76649,0.0,2.0,22.0,10.0,0.0,0.0
95104,0.0,2.0,28.0,11.0,0.0,0.0


In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split

In [15]:
x = data.drop('D', axis = 1)
y = data['D']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [16]:
print(x_train.shape)
x_train.head(3)

(48291, 5)


Unnamed: 0,HighBP,GENHLTH,BMI,Age,HighChol
367723,0.0,2.0,29.0,9.0,0.0
126571,0.0,3.0,27.0,13.0,1.0
184996,1.0,3.0,30.0,13.0,0.0


In [17]:
svm = SVC(kernel = 'rbf', C = 100.0)
svm.fit(x_train, y_train)
pred = svm.predict(x_test)

In [18]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     17312
         1.0       0.67      0.05      0.09      3385

    accuracy                           0.84     20697
   macro avg       0.75      0.52      0.50     20697
weighted avg       0.81      0.84      0.78     20697



In [2]:
df = pd.read_sas("LLCP2022.XPT")  
print(df.shape)
df.head(5)

(445132, 328)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_SMOKGRP,_LCSREC,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,1.0,b'02032022',b'02',b'03',b'2022',1100.0,b'2022000001',2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,1.0,b'02042022',b'02',b'04',b'2022',1100.0,b'2022000002',2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0
2,1.0,1.0,b'02022022',b'02',b'02',b'2022',1100.0,b'2022000003',2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,2.0
3,1.0,1.0,b'02032022',b'02',b'03',b'2022',1100.0,b'2022000004',2022000000.0,1.0,...,3.0,2.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,9.0,9.0,2.0
4,1.0,1.0,b'02022022',b'02',b'02',b'2022',1100.0,b'2022000005',2022000000.0,1.0,...,4.0,,1.0,10.0,1.0,140.0,1.0,,,2.0


In [3]:
# sample 10%
df_new = df.sample(frac =.1)

# filter 5 features: HighBP, GenHlth, BMI, Age, HighChol
df_filter = df_new[['_RFHYPE5', 'GENHLTH', '_BMI5', '_AGEG5YR', 'TOLDHI2']]
print(df_filter.shape)
df_filter.head(5)

KeyError: "['_RFHYPE5', 'TOLDHI2'] not in index"

In [4]:
df.columns.values.tolist()

['_STATE',
 'FMONTH',
 'IDATE',
 'IMONTH',
 'IDAY',
 'IYEAR',
 'DISPCODE',
 'SEQNO',
 '_PSU',
 'CTELENM1',
 'PVTRESD1',
 'COLGHOUS',
 'STATERE1',
 'CELPHON1',
 'LADULT1',
 'COLGSEX1',
 'NUMADULT',
 'LANDSEX1',
 'NUMMEN',
 'NUMWOMEN',
 'RESPSLCT',
 'SAFETIME',
 'CTELNUM1',
 'CELLFON5',
 'CADULT1',
 'CELLSEX1',
 'PVTRESD3',
 'CCLGHOUS',
 'CSTATE1',
 'LANDLINE',
 'HHADULT',
 'SEXVAR',
 'GENHLTH',
 'PHYSHLTH',
 'MENTHLTH',
 'POORHLTH',
 'PRIMINSR',
 'PERSDOC3',
 'MEDCOST1',
 'CHECKUP1',
 'EXERANY2',
 'SLEPTIM1',
 'LASTDEN4',
 'RMVTETH4',
 'CVDINFR4',
 'CVDCRHD4',
 'CVDSTRK3',
 'ASTHMA3',
 'ASTHNOW',
 'CHCSCNC1',
 'CHCOCNC1',
 'CHCCOPD3',
 'ADDEPEV3',
 'CHCKDNY2',
 'HAVARTH4',
 'DIABETE4',
 'DIABAGE4',
 'MARITAL',
 'EDUCA',
 'RENTHOM1',
 'NUMHHOL4',
 'NUMPHON4',
 'CPDEMO1C',
 'VETERAN3',
 'EMPLOY1',
 'CHILDREN',
 'INCOME3',
 'PREGNANT',
 'WEIGHT2',
 'HEIGHT3',
 'DEAF',
 'BLIND',
 'DECIDE',
 'DIFFWALK',
 'DIFFDRES',
 'DIFFALON',
 'HADMAM',
 'HOWLONG',
 'CERVSCRN',
 'CRVCLCNC',
 'CRVCLPAP',
 

In [5]:
df1 = pd.read_sas("LLCP2021.XPT")  
print(df.shape)
#df.head(5)

(445132, 328)
