In [1]:
# import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# load in processed data
file_path = 'data_pad.csv'  # interpolate_method_1
# Read the CSV and discard its 'Unnamed: 0' column in one step
# (presumably a row index written out by an earlier export -- TODO confirm).
df_survey = pd.read_csv(file_path).drop(columns='Unnamed: 0')
df_survey.head()

Unnamed: 0,RaceEth,Record,Age,Sex,Grade,Hispanic,Height,Weight2,Seatbelt_Use,Driving_DUI_Other,...,BMIPCT,Weight,Stratum,PSU,Suicide_attempt_dummy,white,AIN,asian,black,PI
0,7.0,1.0,5.0,2.0,2.0,1.0,1.63,54.89,4.0,1.0,...,46.88,1.66,213.0,57923.0,1.0,0,1,0,0,0
1,8.0,2.0,4.0,2.0,2.0,2.0,1.6,53.98,5.0,2.0,...,62.23,1.38,213.0,57923.0,1.0,0,1,0,0,1
2,8.0,3.0,4.0,1.0,2.0,2.0,1.68,43.09,4.0,1.0,...,0.59,1.49,213.0,57923.0,1.0,1,0,1,0,0
3,5.0,4.0,4.0,2.0,2.0,2.0,1.78,68.95,4.0,1.0,...,69.78,1.71,213.0,57923.0,1.0,1,0,0,0,0
4,6.0,5.0,5.0,2.0,2.0,1.0,1.78,58.97,4.0,1.0,...,16.73,1.66,213.0,57923.0,1.0,0,0,0,0,0


In [5]:
# Remove two columns that are not wanted in the frame:
#   - 'Suicide_attempt_injury': question 29, which seems to rely on question 28
#   - 'Record': the record column
df_survey = df_survey.drop(columns=['Suicide_attempt_injury', 'Record'])

In [7]:
# The original target column is removed here; 'Suicide_attempt_dummy'
# is the column used as the target later on.
df_survey = df_survey.drop(columns='Suicide_attempt')

In [8]:
# target standardization check: print the largest and then the smallest
# value found in the dummy target column
target_column = df_survey['Suicide_attempt_dummy']
print(target_column.max(), target_column.min(), sep='\n')

1.0
0.0


In [9]:
# checking column types: show the dtype of every column currently in df_survey
# (pandas abbreviates the middle of the listing when there are many columns)
df_survey.dtypes

RaceEth     float64
Age         float64
Sex         float64
Grade       float64
Hispanic    float64
             ...   
white         int64
AIN           int64
asian         int64
black         int64
PI            int64
Length: 107, dtype: object

In [10]:
# Preview only the object-dtype (e.g. string) columns. A result with an index
# but no columns means no object-dtype columns remain in df_survey.
df_survey.select_dtypes('object').head()

0
1
2
3
4


In [11]:
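# disabled: preview the rows that would remain after dropping NaNs
# (the expression returns a new frame; df_survey itself is not modified)
# df_survey.dropna().head()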

In [13]:
# defining features: every column except the target.
# DataFrame.drop returns a new frame, so df_survey keeps its target column.
X = df_survey.drop(columns='Suicide_attempt_dummy')

In [14]:
# defining target: the dummy target column as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() is the supported
# way to get the underlying array and yields the same values.
y = df_survey['Suicide_attempt_dummy'].to_numpy()
# peek at the first five labels
y[:5]

array([1., 1., 1., 1., 1.])

In [15]:
# split the data into training and testing sets
# test_size=0.25 is scikit-learn's default, written out here for clarity;
# the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (814 vs 2606 rows in the test set shown below) -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [16]:
# scale the data
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows enter the fit.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() hands back the fitted scaler itself
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [17]:
# create the random forest classifier
# n_estimators=128 sets the number of trees; random_state is fixed so that
# repeated runs on the same data build the same forest
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [18]:
# fit the model on the scaled training features and the training labels
# (fit returns the estimator itself, so rf_model still names the same object)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
# make predictions for the test split, using the scaled test features
predictions = rf_model.predict(X_test_scaled)

In [20]:
# calculate confusion matrix
# Rows are the true classes and columns the predicted classes, both in sorted
# label order (0.0 first, then 1.0), which is what the labels below assume.
cm = confusion_matrix(y_test, predictions)
actual_labels = ['Actual False', 'Actual True']
predicted_labels = ['Predicted False', 'Predicted True']
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted False,Predicted True
Actual False,515,299
Actual True,10,2596


In [21]:
# display results: the labelled confusion matrix first, then the per-class
# precision / recall / f1 report for the same test predictions
print('CONFUSION MATRIX')
display(cm_df)
report_text = classification_report(y_test, predictions)
print('CLASSIFICATION REPORT', report_text, sep='\n')

CONFUSION MATRIX


Unnamed: 0,Predicted False,Predicted True
Actual False,515,299
Actual True,10,2596


CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.98      0.63      0.77       814
         1.0       0.90      1.00      0.94      2606

    accuracy                           0.91      3420
   macro avg       0.94      0.81      0.86      3420
weighted avg       0.92      0.91      0.90      3420



In [22]:
# calculate feature importance
# Pair each importance score with its column name, then order the pairs from
# most to least important (ties fall back to the column name, also descending).
importance_pairs = zip(rf_model.feature_importances_, X.columns)
importances = sorted(importance_pairs, reverse=True)
importances

[(0.0656737186805402, 'Condom_use'),
 (0.06034285796485131, 'Seatbelt_Use'),
 (0.05317625031046816, 'Tobacco_frequency_total'),
 (0.04894958461285657, 'Victim_rape'),
 (0.04108209976742026, 'Physical_fight'),
 (0.02823228268552291, 'Drug_use_frequency'),
 (0.02774480966616233, 'Sunscreen_use'),
 (0.027731250195055204, 'Carry_Weapon'),
 (0.024919072896693976, 'Victim_rape_relationship'),
 (0.0240721719726378, 'Tobacco_quit'),
 (0.02204971778840254, 'PSU'),
 (0.021207138337438546, 'Smoke_cigarette'),
 (0.018426182167009007, 'Stratum'),
 (0.018234934753128505, 'Victim_rape_12mos'),
 (0.01783584992695716, 'Carry_Gun'),
 (0.017393375509005642, 'Alcohol_acquire'),
 (0.01445726835679695, 'Sexual_orientation'),
 (0.014368714973319393, 'Tanning_Device_frequency'),
 (0.014058485341178881, 'Weight'),
 (0.01268179046715655, 'TV_frequency'),
 (0.012326121349487379, 'Nutrition_potatoes'),
 (0.011627150313049698, 'BMIPCT'),
 (0.011310608494041488, 'Grades_past_year'),
 (0.010438875755852325, 'Weight2

In [24]:
# Keep `importances` bound to a Series of (importance, feature) tuples, exactly
# as this cell did before, so anything that uses the name afterwards still works.
importances = pd.Series(importances)

# Export as two real CSV columns: importance, feature.
# Writing the Series of tuples directly produced one quoted string per row
# (the textual form of the tuple), which cannot be read back as numeric data.
# The file name and the no-index / no-header layout are unchanged.
pd.DataFrame(importances.tolist(), columns=['importance', 'feature']).to_csv(
    'feature_importance.csv', index=False, header=False
)