# Final modeling notebook

In [12]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 150, 'display.max_rows', 150)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve, auc, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics

**KNN**

In [13]:
# Load final_df.csv from the working directory into `df`; later cells take the
# feature columns and the 'Condom use' target from this frame.
df = pd.read_csv(filepath_or_buffer='final_df.csv')

**Defining our features and target variable**

In [14]:
# Names of the df columns used as model inputs, listed one per line so that
# additions and removals are easy to spot. Order is significant only in that it
# fixes the column order of any frame selected with this list.
features_0 = [
    'YEAR',
    'raceeth',
    'How old are you',
    'What is your sex',
    'In what grade are you',
    'Seat belt use',
    'Riding with a drinking driver',
    'Drinking and driving',
    'Weapon carrying',
    'Weapon carrying at school',
    'Safety concerns at school',
    'Threatened at school',
    'Physical fighting',
    'Physical fighting at school',
    'Forced sexual intercourse',
    'Bullying at school',
    'Electronic bullying',
    'Sad or hopeless',
    'Considered suicide',
    'Made a suicide plan',
    'Attempted suicide',
    'Injurious suicide attempt',
    'Ever cigarette use',
    'Initiation of cigarette smoking',
    'Current cigarette use',
    'Current smokeless tobacco use',
    'Current cigar use',
    'Initiation of alcohol use',
    'Current alcohol use',
    'Source of alcohol',
    'Ever marijuana use',
    'Initiation of marijuana use',
    'Current marijuana use',
    'Ever steroid use',
    'Illegal injected drug use',
    'Illegal drugs at school',
    'Ever sexual intercourse',
    'Sex before 13 years',
    'Multiple sex partners',
    'Current sexual activity',
    'Alcohol/drugs and sex',
    'Perception of weight',
    'Weight loss',
    'Television watching',
    'Computer use',
    'HIV testing',
    'Asthma',
    'Sleep',
    'Ever used LSD',
    'Has used hard drugs',
    'healthy_eating',
    'regular_activity',
]

# Target series: the 'Condom use' column of df.
condom_use = df['Condom use']

**Dummy vars**

Created dummy variables in order to run KNN.

In [15]:
# Echo the feature-name list so the selected columns can be checked by eye.
features_0

['YEAR',
 'raceeth',
 'How old are you',
 'What is your sex',
 'In what grade are you',
 'Seat belt use',
 'Riding with a drinking driver',
 'Drinking and driving',
 'Weapon carrying',
 'Weapon carrying at school',
 'Safety concerns at school',
 'Threatened at school',
 'Physical fighting',
 'Physical fighting at school',
 'Forced sexual intercourse',
 'Bullying at school',
 'Electronic bullying',
 'Sad or hopeless',
 'Considered suicide',
 'Made a suicide plan',
 'Attempted suicide',
 'Injurious suicide attempt',
 'Ever cigarette use',
 'Initiation of cigarette smoking',
 'Current cigarette use',
 'Current smokeless tobacco use',
 'Current cigar use',
 'Initiation of alcohol use',
 'Current alcohol use',
 'Source of alcohol',
 'Ever marijuana use',
 'Initiation of marijuana use',
 'Current marijuana use',
 'Ever steroid use',
 'Illegal injected drug use',
 'Illegal drugs at school',
 'Ever sexual intercourse',
 'Sex before 13 years',
 'Multiple sex partners',
 'Current sexual activity

In [16]:
# Turn all selected feature columns into object dtype (presumably so that
# pd.get_dummies one-hot encodes the numerically coded columns as well).
#
# This cell rebinds features_0 from a list of column names to a DataFrame.
# list(...) keeps the cell re-runnable: applied to the original list it is a
# plain copy, and applied to the DataFrame left behind by a previous run it
# yields that frame's column names, so both cases select the same columns of df.
features_0 = df[list(features_0)].astype('object')

In [None]:
# One-hot encode the feature frame. drop_first=False keeps an indicator column
# for every level of every feature.
dummies_0 = pd.get_dummies(data=features_0, drop_first=False)

**Let's check whether there is a class imbalance in our target variable, `Condom use`**

In [9]:
# Report how the target classes are distributed, first as absolute counts and
# then as proportions, separated by a rule.
raw_counts = condom_use.value_counts()
class_shares = condom_use.value_counts(normalize=True)
print(
    'Raw counts: \n',
    raw_counts,
    '-----------------------------------',
    'Normalized counts: \n',
    class_shares,
    sep='\n',
)

Raw counts: 

1.0    30252
2.0    14095
3.0     9201
Name: Condom use, dtype: int64
-----------------------------------
Normalized counts: 

1.0    0.564951
2.0    0.263222
3.0    0.171827
Name: Condom use, dtype: float64


**Using SMOTE for class imbalance**

In [None]:
# Hold out 25% of the rows for testing; random_state pins the split.
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    dummies_0, condom_use, test_size=0.25, random_state=1
)

# Oversample on the training split only, so the test split stays untouched.
# - fit_resample is the current imbalanced-learn API; the older fit_sample
#   alias has been removed and raises AttributeError on recent releases.
# - random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_0, y_train_0)

In [None]:
# Show the per-class counts of the resampled training target.
resampled_counts = pd.Series(y_train_resampled).value_counts()
print('Synthetic sample class distribution: \n', resampled_counts, sep='\n')

**Running KNN model**

In [None]:
# 3-nearest-neighbour classifier; p=2 is passed explicitly as the power
# parameter of the distance metric.
knn = KNeighborsClassifier(p=2, n_neighbors=3)

In [None]:
# Fit on the training split only. Fitting on the full dummies_0 / condom_use
# would also train on the rows held out in X_test_0, so any later evaluation of
# `knn` on the test split would be optimistically biased.
knn.fit(X_train_0, y_train_0)

In [None]:
# Standardise the features with statistics learned from the training split
# only, then overwrite X_train_0 / X_test_0 with the scaled arrays. Any later
# cell that reads X_train_0 / X_test_0 therefore sees the scaled values.
scaler = StandardScaler()
X_train_0 = scaler.fit_transform(X_train_0)
X_test_0 = scaler.transform(X_test_0)

# 1-nearest-neighbour model fitted on the scaled training split.
# NOTE(review): this trains on X_train_0 / y_train_0, not on the SMOTE output
# (X_train_resampled / y_train_resampled) — confirm that is intended.
# NOTE(review): with n_neighbors=1 each training row is its own nearest
# neighbour, so near-perfect training scores are expected by construction.
model = KNeighborsClassifier(n_neighbors=1).fit(X_train_0, y_train_0)

# Predictions on both splits, used by the metric cells that follow.
y_predict_train = model.predict(X_train_0)
y_predict_test = model.predict(X_test_0)

In [None]:
# Weighted F1 for the KNN predictions on the test split, then the train split.
f1_inputs = [
    ('F1 score test: ', y_test_0, y_predict_test),
    ('F1 score train: ', y_train_0, y_predict_train),
]
for label, y_true, y_pred in f1_inputs:
    print(label, f1_score(y_true, y_pred, average='weighted'))

In [None]:
# Accuracy of the KNN predictions on the test split, then the train split.
knn_accuracy_test = metrics.accuracy_score(y_test_0, y_predict_test)
knn_accuracy_train = metrics.accuracy_score(y_train_0, y_predict_train)
print('Accuracy test:', knn_accuracy_test)
print('Accuracy train:', knn_accuracy_train)

In [None]:
# Confusion matrix for the KNN test-set predictions.
# The matrix rows/columns are ordered by model.classes_, and the same labels
# are passed to the display; without display_labels the axes would be numbered
# 0..n_classes-1 instead of showing the actual class codes of the target.
class_labels = model.classes_
cm = confusion_matrix(y_test_0, y_predict_test, labels=class_labels)
print(cm)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
# Set the title on the display's own axes rather than on pyplot's current axes.
disp.ax_.set_title('Confusion Matrix: KNN')
plt.show()

**Discussing Results**

Because the training-set F1 and accuracy scores were .9-.10 points higher than the testing-set scores, we concluded that the KNN model was overfit to the data. (Note that with `n_neighbors=1` every training point is its own nearest neighbor, so a near-perfect training score is expected.) For this reason, we would not rely on the KNN model to predict the likelihood of safe sex practices.

**Decision Tree**

In [None]:
# Re-create the list of feature column names for the decision tree section
# (the name features_0 was rebound to a DataFrame earlier in the notebook).
# Rows are grouped loosely by what the column names suggest; the order of the
# entries is unchanged.
features_0 = [
    # survey year and respondent details
    'YEAR', 'raceeth', 'How old are you', 'What is your sex', 'In what grade are you',
    # vehicle safety, weapons and violence
    'Seat belt use', 'Riding with a drinking driver', 'Drinking and driving',
    'Weapon carrying', 'Weapon carrying at school', 'Safety concerns at school',
    'Threatened at school', 'Physical fighting', 'Physical fighting at school',
    'Forced sexual intercourse', 'Bullying at school', 'Electronic bullying',
    # mood and suicide
    'Sad or hopeless', 'Considered suicide', 'Made a suicide plan',
    'Attempted suicide', 'Injurious suicide attempt',
    # tobacco, alcohol and drugs
    'Ever cigarette use', 'Initiation of cigarette smoking', 'Current cigarette use',
    'Current smokeless tobacco use', 'Current cigar use',
    'Initiation of alcohol use', 'Current alcohol use', 'Source of alcohol',
    'Ever marijuana use', 'Initiation of marijuana use', 'Current marijuana use',
    'Ever steroid use', 'Illegal injected drug use', 'Illegal drugs at school',
    # sexual behaviour
    'Ever sexual intercourse', 'Sex before 13 years', 'Multiple sex partners',
    'Current sexual activity', 'Alcohol/drugs and sex',
    # weight, screen time and health
    'Perception of weight', 'Weight loss', 'Television watching', 'Computer use',
    'HIV testing', 'Asthma', 'Sleep',
    # remaining columns
    'Ever used LSD', 'Has used hard drugs', 'healthy_eating', 'regular_activity',
]

# Target series: the 'Condom use' column of df.
condom_use = df['Condom use']

In [None]:
# Split the raw (not one-hot encoded) feature columns 75/25 for the tree model.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    df[features_0], condom_use, test_size=0.25, random_state=1
)

# Oversample the training split and overwrite X_train_2 / y_train_2 with the
# resampled data; the test split is left as sampled.
# - fit_resample is the current imbalanced-learn API; the older fit_sample
#   alias has been removed and raises AttributeError on recent releases.
# - random_state makes the synthetic samples reproducible across runs.
# NOTE(review): SMOTE interpolates between rows numerically; if these columns
# are categorical codes the synthetic values may fall between categories —
# confirm this is acceptable for the tree model.
smote = SMOTE(random_state=1)
X_train_2, y_train_2 = smote.fit_resample(X_train_2, y_train_2)

In [None]:
# Depth-limited decision tree with balanced class weights.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ between runs when splits tie.
dtc = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=1)
dtc.fit(X_train_2, y_train_2)

# Predictions on both splits. X_train_2 / y_train_2 are the SMOTE-resampled
# training data here, so the train scores are measured on resampled rows while
# the test scores are measured on the original held-out rows.
dtc_preds_test = dtc.predict(X_test_2)
dtc_preds_train = dtc.predict(X_train_2)

# Weighted F1 on each split.
dtc_f1_test = metrics.f1_score(y_test_2, dtc_preds_test, average='weighted')
dtc_f1_train = metrics.f1_score(y_train_2, dtc_preds_train, average='weighted')
print('F1 score test: ', dtc_f1_test)
print('F1 score train: ', dtc_f1_train)

In [None]:
# Accuracy of the decision tree on the test split, then the train split.
dtc_accuracy_test = metrics.accuracy_score(y_test_2, dtc_preds_test)
dtc_accuracy_train = metrics.accuracy_score(y_train_2, dtc_preds_train)
print('Accuracy test:', dtc_accuracy_test)
print('Accuracy train:', dtc_accuracy_train)

In [None]:
# Confusion matrix for the decision tree's test-set predictions.
# The matrix rows/columns are ordered by dtc.classes_, and the same labels are
# passed to the display; without display_labels the axes would be numbered
# 0..n_classes-1 instead of showing the actual class codes of the target.
class_labels = dtc.classes_
cm = confusion_matrix(y_test_2, dtc_preds_test, labels=class_labels)
print(cm)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
# Set the title on the display's own axes rather than on pyplot's current axes.
disp.ax_.set_title('Confusion Matrix: Decision Tree Classifier')
plt.show()

**Discussing Results**

Because the testing-set F1 and accuracy scores were .8-.9 points higher than the training-set scores, we concluded that the decision tree was not overfit to the data. This model was also better at predicting our target, so we felt it could be used to estimate the likelihood of someone using safe sex practices.

**Grid search**

In [None]:
# Result recorded from an earlier run of the random-forest grid search.
# The original cell held the pasted text output, which is not valid Python and
# raised a SyntaxError when the notebook was run top to bottom.
recorded_best_params = {
    'criterion': 'entropy',
    'max_depth': 8,
    'max_features': None,
    'n_estimators': 100,
}
# Best Model: RandomForestClassifier(criterion='entropy', max_depth=8, max_features=None,
#                        random_state=42)

In [None]:
# Imports are kept in this cell so that it still runs on its own.
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest: 2 * 5 * 4 * 2 = 80 candidates.
rf_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 6, 8, 10],
    "max_features": [None, 4, 6, 10],
    # Not part of the search:
    # "min_samples_split": [2, 5, 10],
    # "min_samples_leaf": [1, 2, 3, 5, 6],
    "n_estimators": [100, 250],
}
rf_grid = RandomForestClassifier(random_state=42)

# 2-fold cross-validated search over the grid using all available cores.
# verbose must be a non-negative integer; the previous value of -1 is rejected
# by scikit-learn's parameter validation, so 0 (silent) is used instead.
gridsearch = GridSearchCV(
    rf_grid,
    rf_param_grid,
    cv=2,
    return_train_score=True,
    n_jobs=-1,
    verbose=0,
)
# X_train_0 / y_train_0 are whatever earlier cells left under those names.
gridsearch.fit(X_train_0, y_train_0)

# best_score_ is the mean cross-validated score of the best candidate, not a
# score on the training data, so it is labelled accordingly.
print("Mean CV Accuracy: {:.4}%".format(gridsearch.best_score_ * 100))
print("")
print("Optimal Parameters: {}".format(gridsearch.best_params_))
# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print("Best Model: {}".format(gridsearch.best_estimator_))

# Score of the refitted best estimator on the held-out test split.
gridsearch.score(X_test_0, y_test_0)

**Discussing Results**

Our grid search model provided the best results. The F1 and accuracy scores for the training and testing sets were very close, suggesting that the model is not overfit to the data. Because of this, we believe this model has the highest predictive power, and it will be our final model.