In [1]:
# import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# load in processed data
file_path = 'data_pad.csv' #interpolate_method_1
df_survey = pd.read_csv(file_path)
df_survey.drop(columns='Unnamed: 0', inplace=True)
df_survey.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,raceeth,q6orig,q7orig,record,q1,q2,q3,q4,q6,q7,...,q99,BMIPCT,weight,stratum,psu,white,AIN,asian,black,PI
0,7.0,504.0,121.0,1.0,5.0,2.0,2.0,1.0,1.63,54.89,...,1.0,46.88,1.66,213.0,57923.0,0,1,0,0,0
1,8.0,503.0,119.0,2.0,4.0,2.0,2.0,2.0,1.6,53.98,...,1.0,62.23,1.38,213.0,57923.0,0,1,0,0,1
2,8.0,506.0,95.0,3.0,4.0,1.0,2.0,2.0,1.68,43.09,...,1.0,0.59,1.49,213.0,57923.0,1,0,1,0,0
3,5.0,510.0,152.0,4.0,4.0,2.0,2.0,2.0,1.78,68.95,...,1.0,69.78,1.71,213.0,57923.0,1,0,0,0,0
4,6.0,510.0,130.0,5.0,5.0,2.0,2.0,1.0,1.78,58.97,...,2.0,16.73,1.66,213.0,57923.0,0,0,0,0,0


In [3]:
# target cleanup
df_survey.loc[df_survey['q28'] > 0, 'q28'] = 1
print(df_survey['q28'].max())
print(df_survey['q28'].min())

1.0
0.0


In [4]:
df_survey.dtypes

raceeth    float64
q6orig      object
q7orig      object
record     float64
q1         float64
            ...   
white        int64
AIN          int64
asian        int64
black        int64
PI           int64
Length: 111, dtype: object

In [5]:
df_survey.select_dtypes('object').head()

Unnamed: 0,q6orig,q7orig
0,504.0,121.0
1,503.0,119.0
2,506.0,95.0
3,510.0,152.0
4,510.0,130.0


In [6]:
df_survey['q6orig'] = pd.to_numeric(df_survey['q6orig'],errors='coerce')
df_survey['q7orig'] = pd.to_numeric(df_survey['q7orig'],errors='coerce')
df_survey.drop(columns=['q6orig', 'q7orig'], inplace=True)

In [7]:
df_survey.select_dtypes('object').head()

0
1
2
3
4


In [8]:
# temporarily drop nas
df_survey.dropna().head()

Unnamed: 0,raceeth,record,q1,q2,q3,q4,q6,q7,q8,q9,...,q99,BMIPCT,weight,stratum,psu,white,AIN,asian,black,PI
0,7.0,1.0,5.0,2.0,2.0,1.0,1.63,54.89,4.0,1.0,...,1.0,46.88,1.66,213.0,57923.0,0,1,0,0,0
1,8.0,2.0,4.0,2.0,2.0,2.0,1.6,53.98,5.0,2.0,...,1.0,62.23,1.38,213.0,57923.0,0,1,0,0,1
2,8.0,3.0,4.0,1.0,2.0,2.0,1.68,43.09,4.0,1.0,...,1.0,0.59,1.49,213.0,57923.0,1,0,1,0,0
3,5.0,4.0,4.0,2.0,2.0,2.0,1.78,68.95,4.0,1.0,...,1.0,69.78,1.71,213.0,57923.0,1,0,0,0,0
4,6.0,5.0,5.0,2.0,2.0,1.0,1.78,58.97,4.0,1.0,...,2.0,16.73,1.66,213.0,57923.0,0,0,0,0,0


In [9]:
# dropping question 29 since it seems to rely on question 28
df_survey.drop(columns=['q29'], inplace=True)
# Dropping record column
df_survey.drop(columns=['record'], inplace=True)

In [10]:
# defining features
X = df_survey.copy()
X = X.drop('q28', axis=1)

In [11]:
# defining target
y = df_survey['q28'].ravel()
y[:5]

array([1., 1., 1., 1., 1.])

In [12]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [13]:
# scale the data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# create the random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [15]:
# fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [16]:
# make predictions
predictions = rf_model.predict(X_test_scaled)

In [17]:
# calculate confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=['Actual False', 'Actual True'], columns=['Predicted False', 'Predicted True']
    )
cm_df

Unnamed: 0,Predicted False,Predicted True
Actual False,515,299
Actual True,10,2596


In [18]:
# display results
print('CONFUSION MATRIX')
display(cm_df)
print('CLASSIFICATION REPORT')
print(classification_report(y_test, predictions))

CONFUSION MATRIX


Unnamed: 0,Predicted False,Predicted True
Actual False,515,299
Actual True,10,2596


CLASSIFICATION REPORT
              precision    recall  f1-score   support

         0.0       0.98      0.63      0.77       814
         1.0       0.90      1.00      0.94      2606

    accuracy                           0.91      3420
   macro avg       0.94      0.81      0.86      3420
weighted avg       0.92      0.91      0.90      3420



In [19]:
# store accuracy, recall, and f1 score variables
basic_1_model_accuracy = accuracy
#basic_1_model_recall = recall
#basic_1_model_f1 = f1 score

NameError: name 'accuracy' is not defined

In [None]:
# 1st pause and evaluate
recall of “1” value
Is the recall score greater than .75?
If yes, initial pass succeeds.
If no, initial pass fails. 

If it succeeds….    move on to feature importance and begin identifying and removing values that do not influence the output very much. determine if that improves performance

If it fails…
If this is your first or second interpolate method pass through, return to top and interpolate through another unused method.
If this is your third time, move on to feature importance and begin identifying and removing values that do not influence the output very much

In [None]:
# calculate feature importance 
importances = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances

In [None]:
# drop less important columns
X = X.drop(“less important columns”)

In [None]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# scale the data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# create the random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [None]:
# fit the model
rf_model = rf_model.fit(X_scaled, y_train)

In [None]:
# make predictions
predictions = rf_model.predict(X_test_scaled)

In [None]:
# calculate confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
	cm, index=[“Actual False”, “Actual True”], columns=[“Predicted False”, “Predicted True”]
	)
cm_df

In [None]:
# display results
print(“CONFUSION MATRIX”)
display(cm_df)
print(“CLASSIFICATION REPORT”)
print(classification_report(y_test, predictions))

In [None]:
# store accuracy, recall, and f1 score variables
feature_model_accuracy = accuracy
feature_model_recall = recall
feature_model_f1 = f1 score

In [None]:
# store accuracy, recall, and f1 score variables
basic_1_model_accuracy = accuracy
basic_1_model_recall = recall
basic_1_model_f1 = f1 score

In [None]:
# 
data = [ ,index=[‘Accuracy’, ‘Recall’, ‘F1 Score’]

In [None]:
df = pd.DataFrame([1, 2, 3])

from pathlib import Path  

filepath = Path('data/out.csv')  

df.to_csv(filepath)  