In [1]:
# import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# read the processed survey export; the 'Unnamed: 0' column (presumably an
# index written out when the CSV was saved) is discarded right after loading
file_path = 'data_pad.csv' #interpolate_method_1
df_survey = pd.read_csv(file_path).drop(columns='Unnamed: 0')
df_survey.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,raceeth,q6orig,q7orig,record,q1,q2,q3,q4,q6,q7,...,q99,BMIPCT,weight,stratum,psu,white,AIN,asian,black,PI
0,7.0,504.0,121.0,1.0,5.0,2.0,2.0,1.0,1.63,54.89,...,1.0,46.88,1.66,213.0,57923.0,0,1,0,0,0
1,8.0,503.0,119.0,2.0,4.0,2.0,2.0,2.0,1.6,53.98,...,1.0,62.23,1.38,213.0,57923.0,0,1,0,0,1
2,8.0,506.0,95.0,3.0,4.0,1.0,2.0,2.0,1.68,43.09,...,1.0,0.59,1.49,213.0,57923.0,1,0,1,0,0
3,5.0,510.0,152.0,4.0,4.0,2.0,2.0,2.0,1.78,68.95,...,1.0,69.78,1.71,213.0,57923.0,1,0,0,0,0
4,6.0,510.0,130.0,5.0,5.0,2.0,2.0,1.0,1.78,58.97,...,2.0,16.73,1.66,213.0,57923.0,0,0,0,0,0


In [3]:
# removing unneeded columns in a single, non-mutating drop:
#   - q6orig / q7orig: these were previously coerced with pd.to_numeric and then
#     dropped on the very next line, so the coercion had no effect and is removed
#   - q29: dropped since it seems to rely on question 28 (the target)
#   - record: record column, not used as a feature
# A missing column still raises KeyError, exactly as the separate drops did.
df_survey = df_survey.drop(columns=['q6orig', 'q7orig', 'q29', 'record'])

In [4]:
# target standardization: every q28 value greater than 0 is collapsed to 1,
# values that are not greater than 0 are left as they are
positive_rows = df_survey['q28'] > 0
df_survey.loc[positive_rows, 'q28'] = 1
# print the largest and then the smallest remaining value as a sanity check
for bound in (df_survey['q28'].max(), df_survey['q28'].min()):
    print(bound)

1.0
0.0


In [5]:
# checking column types: show the dtype pandas holds for each remaining column,
# to confirm what will be fed to the scaler and the model
df_survey.dtypes

raceeth    float64
q1         float64
q2         float64
q3         float64
q4         float64
            ...   
white        int64
AIN          int64
asian        int64
black        int64
PI           int64
Length: 107, dtype: object

In [6]:
# preview only the columns whose dtype is `object`, i.e. anything not numeric
df_survey.select_dtypes(include='object').head()

0
1
2
3
4


In [7]:
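# temporarily drop NAs (currently disabled; the line below would only preview the rows without NaNs, it does not modify df_survey)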
# df_survey.dropna().head()

In [8]:
# defining features: a new frame holding every column except the q28 target
# (drop returns a new DataFrame, so df_survey itself is left untouched)
X = df_survey.drop(columns='q28')

In [9]:
# defining target: the q28 column as a flat NumPy array.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns the
# same 1-D array of values without triggering the deprecation warning.
y = df_survey['q28'].to_numpy()
# peek at the first few labels
y[:5]

array([1., 1., 1., 1., 1.])

In [10]:
# split the data into training and testing sets
# No test_size/train_size is given, so scikit-learn's default proportions apply;
# random_state=78 fixes the shuffle so the same split is produced on every run.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance of q28 should be preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [11]:
# scale the data: the scaler is fitted on the training split only, and that
# same fitted scaler is then applied to both the training and the test split
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [12]:
# create the random forest classifier
# n_estimators=128 sets the number of trees in the forest; random_state=78
# seeds the estimator's randomness so repeated runs build the same forest
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [13]:
# fit the model on the scaled training features and their q28 labels
# (the result of fit() is bound back to rf_model, which the prediction and
# feature-importance cells read from)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [14]:
# make predictions for the scaled test features; the result is compared
# against y_test in the evaluation cells
predictions = rf_model.predict(X_test_scaled)

In [15]:
# calculate confusion matrix and wrap it in a labelled frame:
# rows are the actual classes, columns are the predicted classes
cm = confusion_matrix(y_test, predictions)
actual_labels = ['Actual False', 'Actual True']
predicted_labels = ['Predicted False', 'Predicted True']
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted False,Predicted True
Actual False,515,299
Actual True,10,2596


In [16]:
# display results: the labelled confusion matrix followed by the per-class
# precision / recall / f1 report for the test predictions
report = classification_report(y_test, predictions)
print('CONFUSION MATRIX')
display(cm_df)
print('CLASSIFICATION REPORT')
print(report)

CONFUSION MATRIX


Unnamed: 0,Predicted False,Predicted True
Actual False,515,299
Actual True,10,2596


CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.98      0.63      0.77       814
         1.0       0.90      1.00      0.94      2606

    accuracy                           0.91      3420
   macro avg       0.94      0.81      0.86      3420
weighted avg       0.92      0.91      0.90      3420



In [17]:
# calculate feature importance
# pair each importance score with its feature column name, then order the
# (score, name) pairs from highest to lowest; equal scores fall back to
# comparing the names, also in descending order
importances = list(zip(rf_model.feature_importances_, X.columns))
importances.sort(reverse=True)
importances

[(0.0656737186805402, 'q65'),
 (0.06034285796485131, 'q8'),
 (0.05317625031046816, 'q33'),
 (0.04894958461285657, 'q19'),
 (0.04108209976742026, 'q17'),
 (0.02823228268552291, 'q51'),
 (0.02774480966616233, 'q93'),
 (0.027731250195055204, 'q12'),
 (0.024919072896693976, 'q21'),
 (0.0240721719726378, 'q39'),
 (0.02204971778840254, 'psu'),
 (0.021207138337438546, 'q30'),
 (0.018426182167009007, 'stratum'),
 (0.018234934753128505, 'q20'),
 (0.01783584992695716, 'q14'),
 (0.017393375509005642, 'q44'),
 (0.01445726835679695, 'q68'),
 (0.014368714973319393, 'q92'),
 (0.014058485341178881, 'weight'),
 (0.01268179046715655, 'q85'),
 (0.012326121349487379, 'q74'),
 (0.011627150313049698, 'BMIPCT'),
 (0.011310608494041488, 'q97'),
 (0.010438875755852325, 'q7'),
 (0.009318499907096952, 'q6'),
 (0.009185868592067454, 'q82'),
 (0.008722650600675529, 'q76'),
 (0.008231721806905947, 'q26'),
 (0.00817311277492734, 'q88'),
 (0.007617969617467362, 'q71'),
 (0.007617282035447217, 'q69'),
 (0.007566724990

In [22]:
# wrap the importances in a Series and write it to 'feature_importance.csv',
# one element per row, with no index column and no header row
# NOTE(review): if `importances` is still the list of (importance, feature)
# tuples built earlier, each row is written as the tuple's string form in a
# single field — confirm this is the intended layout (a two-column DataFrame
# may be what was wanted). This also rebinds `importances` from its earlier
# value to a Series.
importances = pd.Series(importances)
importances.to_csv('feature_importance.csv', index=False, header=False)