In [1]:
# Import our libraries 

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np
# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score


# Helper function for hyper-parameter tuning.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import plot_tree, export_text 
# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Use inline so our visualizations display in notebook
%matplotlib inline


## Main Steps when building a Machine Learning Model. 
1. Inspect and explore data.
2. Select and engineer features.
3. Build and train model.
4. Evaluate model.

# #1 Inspect and explore data.
* Load titanic data
* Visualize all the data using sns.pairplot
* Check for null values

In [None]:
# Read the Titanic passenger CSV into a DataFrame and preview its first rows.
titanic_csv = 'data/titanic.csv'
df = pd.read_csv(titanic_csv)
df.head(n=5)

In [None]:
# Plot pairwise relationships between the columns of df,
# coloring the points by the value of the 'survived' column.
sns.pairplot(data=df, hue='survived')

In [None]:
# Count the missing entries in every column.
df.isna().sum()

# #2 Select and engineer features.
1. Fill age null values with -999
1. Convert to numerical values if need be by using `pd.get_dummies()`
1. Create a list of the features you are going to use.  In this case use as many or as few as you would like.
1. Define our `X` and `y`
1. Split our data into training and testing sets.

In [None]:
# Replace missing ages with the sentinel value -999 so the column has no nulls.
age_filled = df['age'].fillna(value=-999)
df['age'] = age_filled

In [None]:
# 1. Convert to numerical values if need be by using `pd.get_dummies()`
# The listed columns are replaced by indicator columns; drop_first=True
# drops the first level of each so the indicators are not redundant.
categorical_cols = ['sex', 'pclass', 'embarked']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

In [None]:
# 2. Create a list of the features we are going to use.
# Each column is listed exactly once: a repeated name would put the same
# column into X twice and split (or misattribute) its feature importance.
feature = ['fare', 'age', 'sex_male', 'pclass_2', 'pclass_3']


In [None]:
# Define our `X` and `y`
# X holds the selected feature columns; y is the 'survived' target column.
X, y = df[feature], df['survived']


In [None]:
# Split our data into training and testing sets.
# 20% of the rows are held out for testing; stratify=y keeps the class
# proportions of y in both splits, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=45
)
n_train, n_test = len(X_train), len(X_test)
print('Lenght of our Training data:', n_train, '\nLength of our Testing data:', n_test)

# #3 Build and train model.
1. For our first pass, initialize our model with `max_depth=2`.
2. Fit our model with our training data. 
3. Make predictions of our testing data. 
4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores. 
    * To calculate auc score you have to get the predicted probabilities for the Survived class using `model.predict_proba(X_test)[:,1]`
5. Visualize our Decision Tree using provided code. 


In [None]:
# For our first pass, initialize our model with `max_depth=2`.
# random_state pins how the tree breaks ties between equally good splits,
# so the fitted tree is the same on every Restart & Run All.
model = DecisionTreeClassifier(max_depth=2, random_state=45)

In [None]:
# Train the tree on the training split.
model.fit(X=X_train, y=y_train)


In [None]:
# Predict a class label for every row of the held-out test split.
y_pred = model.predict(X=X_test)


In [None]:
# 4. Evaluate and print our model scores using accuracy, precision, recall, f1 scores, and auc scores.
# The first four metrics compare the predicted class labels with the true labels.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test, y_pred)
print('F1 Score: %f' % f1)

# AUC is computed from scores rather than labels: take column 1 of
# predict_proba, i.e. the predicted probability of the positive class.
y_pred_proba = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score: %f' % auc)

# Visualize your tree

In [None]:
# Draw the fitted tree; class_names labels class 0 as 'died' and class 1 as
# 'survived', and filled=True colors the nodes.
class_names = ['died', 'survived']
plot_tree(decision_tree=model, feature_names=feature, class_names=class_names, filled=True)

# Picking the right parameters...

# Parameter tuning of your Decision Tree using GridSearch or RandomizedSearch

### For assistance on this, look at Steve's TA Tips code in `TA-Tips/random_forest_tuning.ipynb`


1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initialize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Visualize your best tree.
1. Which feature was your most important feature?

```python
tree.DecisionTreeClassifier(
    *,
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,
)
```


[Tips on how to customize / set the parameters in the decision tree.](https://scikit-learn.org/stable/modules/tree.html#tips-on-practical-use)

In [None]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search.
# Each key is a DecisionTreeClassifier parameter name; each list holds the
# candidate values the grid search will try for it.
params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20],
    splitter=['best'],
    max_features=['sqrt', 'log2', None],
)

In [None]:
# 1. Initialize your GridSearchCV with a DecisionTreeClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies; accuracy, precision, recall, f1, or roc_auc.
# Search over a fresh DecisionTreeClassifier instead of the notebook-level
# `model`: that name is rebound further down (to the tuned tree, then to a
# random forest), so re-running this cell would otherwise tune whatever
# `model` currently points at.
# random_state fixes the random feature subsets used when max_features is
# 'sqrt' or 'log2', making the search repeatable.
grid_search_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=45),
    param_grid=params,
    scoring='accuracy',
)


In [None]:
# 1. Fit your GridSearchCV with your training data.
# Runs the cross-validated search over the parameter grid on the training split.
grid_search_cv.fit(X=X_train, y=y_train)

In [None]:
# 1. Print the parameters of your best model.
# best_params_ is the winning combination from the search grid. Printing the
# estimator object instead can omit parameters that equal their defaults
# (scikit-learn's default display setting), hiding what was selected.
print(grid_search_cv.best_params_)

In [None]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores.

# Use the best estimator found by the grid search as the working model.
model = grid_search_cv.best_estimator_

# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

f1 = f1_score(y_test, y_pred)
print('F1 Score: %f' % f1)

# AUC works on scores: keep only the predicted probability of class 1.
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score: %f' % auc)

In [None]:
# 1. Which feature was your most important feature?
# Pair each feature name with the model's importance for it and list the
# most important features first.
importance_table = {
    'feature_importance': model.feature_importances_,
    'feature': feature,
}
feature_imp_titanic = pd.DataFrame(importance_table).sort_values(
    by='feature_importance', ascending=False
)
feature_imp_titanic

# Now onto Random Forests...
We're going to do the same thing, but this time with a random forest. Remember... Repetition is the father of learning.

1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search. 
1. Initialize your GridSearchCV with a RandomForestClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies: accuracy, precision, recall, f1, or roc_auc.
1. Fit your GridSearchCV with your training data. 
1. Print the parameters of your best model. 
1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores. 
1. Which feature was your most important feature?


# Parameters of the Random Forest Classifier

```python
RandomForestClassifier(
    n_estimators=100,
    *,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)
```

In [None]:
# 1. Make a dictionary of at least 3 parameters and a list of 3 values for each for your grid search.
# Each key is a RandomForestClassifier parameter name; each list holds the
# candidate values the grid search will try for it.
params_rf = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 10, 20],
    max_features=['sqrt', 'log2', None],
)

In [None]:
# 1. Initialize your GridSearchCV or RandomizedSearchCV with a RandomForestClassifier, your param_grid, and what you are optimizing for.  Choose any of the five optimization strategies; accuracy, precision, recall, f1, or roc_auc.
# A random forest is stochastic (bootstrap samples and random feature
# subsets), so random_state is set to make the search, and therefore the
# chosen parameters and reported scores, repeatable between runs.
random_forest = GridSearchCV(
    RandomForestClassifier(random_state=45),
    param_grid=params_rf,
    scoring='accuracy',
    cv=5,
)




In [None]:
# 1. Fit your GridSearchCV with your training data.
# Runs the cross-validated search over params_rf on the training split.
random_forest.fit(X=X_train, y=y_train)


In [None]:
# 1. Print the parameters of your best model.
# best_params_ is the winning combination from the search grid. Printing the
# estimator object instead can omit parameters that equal their defaults
# (scikit-learn's default display setting), hiding what was selected.
print(random_forest.best_params_)





In [None]:
# 1. Evaluate your best model using accuracy, precision, recall, f1 scores, and auc scores.

# best_estimator_ is the forest that scored best in the grid search
# (the search object was created with scoring='accuracy').
model = random_forest.best_estimator_

# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy_rf)

precision_rf = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision_rf)

recall_rf = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall_rf)

f1_rf = f1_score(y_test, y_pred)
print('F1 Score: %f' % f1_rf)

# AUC works on scores: keep only the predicted probability of class 1.
class_probabilities_rf = model.predict_proba(X_test)
y_pred_proba_rf = class_probabilities_rf[:, 1]

auc_rf = roc_auc_score(y_test, y_pred_proba_rf)
print('AUC Score: %f' % auc_rf)

In [None]:
# 1. Which feature was your most important feature?
# Label each importance with its feature name and list the largest first.
importances = pd.Series(data=model.feature_importances_, index=feature)
feature_imp = importances.sort_values(ascending=False)
feature_imp

# Build a random forest using the ny-vs-sf-houses.csv data. 
* Your target variable, aka the column you are trying to predict, aka your `y` variable is `in_sf`. 
* Can you get an accuracy above 88.8889%?
* What was your most important feature?


In [2]:
# Read the NY-vs-SF housing CSV into a DataFrame and preview its first rows.
houses_csv = 'data/ny-vs-sf-houses.csv'
df = pd.read_csv(houses_csv)
df.head(n=5)

Unnamed: 0,in_sf,beds,bath,price,year_built,sqft,price_per_sqft,elevation
0,0,2.0,1.0,999000,1960,1000,999,10
1,0,2.0,2.0,2750000,2006,1418,1939,0
2,0,2.0,2.0,1350000,1900,2150,628,9
3,0,1.0,1.0,629000,1903,500,1258,9
4,0,0.0,1.0,439000,1930,500,878,10


In [3]:
# BUILD, TRAIN, AND EVALUATE A RANDOM FOREST MODEL BELOW.
# Select the predictor columns and the `in_sf` target, then hold out 20% of
# the rows for testing (stratified on the target, fixed shuffle seed).
house_features = ['year_built', 'sqft', 'price_per_sqft', 'elevation']
X, y = df[house_features], df['in_sf']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=45
)
n_train, n_test = len(X_train), len(X_test)
print('Lenght of our Training data:', n_train, '\nLength of our Testing data:', n_test)

Lenght of our Training data: 393 
Length of our Testing data: 99


In [4]:

# Candidate RandomForestClassifier settings for the housing grid search;
# every combination of the listed values is tried.
params_rf_house = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)


In [None]:
# Grid-search a random forest on the housing training split.
# random_state makes the stochastic forest, and so the selected parameters
# and the scores reported afterwards, repeatable between runs.
# scoring and cv are spelled out to match the Titanic forest search; they are
# the values GridSearchCV uses for a classifier when left unset.
random_forest = GridSearchCV(
    RandomForestClassifier(random_state=45),
    param_grid=params_rf_house,
    scoring='accuracy',
    cv=5,
)
random_forest.fit(X_train, y_train)

In [None]:
# Show the winning parameter combination from the search grid (the estimator
# repr can omit parameters that equal their defaults), then keep the best
# forest as the working model for evaluation.
print(random_forest.best_params_)
model = random_forest.best_estimator_


In [None]:
# Evaluate the selected forest on the held-out housing test split.
y_pred = model.predict(X_test)

# Label-based metrics (positional arguments are y_true, then y_pred).
accuracy_rf_house = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy_rf_house)

precision_rf_house = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision Score: %f" % precision_rf_house)

recall_rf_house = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall Score: %f" % recall_rf_house)

f1_rf_house = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score: %f' % f1_rf_house)

# AUC works on scores: keep only the predicted probability of class 1.
class_probabilities_house = model.predict_proba(X_test)
y_pred_proba_rf_house = class_probabilities_house[:, 1]

auc_rf_house = roc_auc_score(y_test, y_pred_proba_rf_house)
print('AUC Score: %f' % auc_rf_house)

# Awesome difficult extra credit below:
Build a classifier using the adult_income.csv data.  
* The target variable is 'class'
* Start with just using these features `selected_features = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']`
* You have to include the pos_label in your precision, recall, and f1 scores. It just tells the classifier which one is the positive label.  I provided the proper way below.

* See if you can get above 50% f1 score.  
* See some [super tricks and tips here](https://www.kaggle.com/code/jieyima/income-classification-model)

In [None]:
# Read the adult income CSV into a DataFrame and preview its first rows.
adult_income_csv = 'data/adult_income.csv'
df = pd.read_csv(adult_income_csv)
df.head(n=5)