# Chapter 1

### Hyperparameter Tuning

- Finding optimal combination of parameters for a model
- parameters: 
    - the ones that are set by the model itself after learning from the dataset
    - e.g. coefficients of a linear regression, node decisions (split feature and threshold) of a decision tree
    - accessible by attribute (in the attribute section in the documentation)
- hyperparameters : 
    - the ones that we have the option to set before creating the model
    - print the estimator to see what it contains
    - accessible by parameter (in the parameter section in the documentation)
- Silly things to do (some examples):
    - Creating a random forest with just 2 or 3 trees
    - 1,2 neighbors in knn algorithm
    - increasing a hyperparameter by only a very small amount (the change in the model is negligible)
    - Be aware of conflicting hyperparameter choices (The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties.)
- Visualize if the hyperparameter has any effect:
    - Graph of learning curve : hyperparameter on X-axis and accuracy on Y-axis
- Problem: Many different models can be built. Among these, we need to find the one that yields the best result.
- Solution: Train with a set of adjustable parameters and compare the results to find the optimal model
- Rule of thumb : Cross validation is used to estimate the generalization performance.
- Curse of dimensionality : an exhaustive search gets expensive quickly, because the number of combinations to evaluate grows exponentially with the number of hyperparameters (dimensions) in the grid.
- Best practice : Do this only when you really need the optimal solution, since tuning does not turn a bad model into a good one.
- optimal hyperparameters = set of hyperparameters corresponding to the best CV score.
- Some algorithms:
    - Grid Search
    - Random Search
    - Bayesian Optimization
    - Genetic Algorithms

```
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score, mean_absolute_error, make_scorer

# Hold out 30% of the samples for testing; the remaining 70% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Base classifiers that are combined in the ensemble below
lr = LogisticRegression(random_state=42)
knn = KNN()
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=42)
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

# Wrap the (name, estimator) pairs in a VotingClassifier
from sklearn.ensemble import VotingClassifier
ensemble_model = VotingClassifier(estimators=classifiers)

# Instantiate an ensemble VotingRegressor
# NOTE: `regressors` is not defined anywhere in these notes; presumably it is a
# list of (name, regressor) tuples built the same way as `classifiers` — define
# it before running this line.
from sklearn.ensemble import VotingRegressor
ensemble_model = VotingRegressor(estimators=regressors)

# Instantiate an ensemble BaggingClassifier
# NOTE: scikit-learn >= 1.2 renames the `base_estimator` argument to `estimator`.
from sklearn.ensemble import BaggingClassifier
ensemble_model = BaggingClassifier(base_estimator=dt, n_estimators=300, oob_score=True, n_jobs=-1)
# The out-of-bag score only exists on a fitted model, so train before reading it
ensemble_model.fit(X_train, y_train)
oob_accuracy = ensemble_model.oob_score_

# Instantiate an ensemble BaggingRegressor
# NOTE: scikit-learn >= 1.2 renames the `base_estimator` argument to `estimator`.
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
base_regressor = DecisionTreeRegressor(max_depth=8, min_samples_leaf=0.13, random_state=3)
ensemble_model = BaggingRegressor(base_estimator=base_regressor, n_estimators=300, oob_score=True, n_jobs=-1)
# The out-of-bag score only exists on a fitted model, so train before reading it
ensemble_model.fit(X_train, y_train)
oob_score = ensemble_model.oob_score_

# Random forest for regression tasks
from sklearn.ensemble import RandomForestRegressor
ensemble_model = RandomForestRegressor(
    n_estimators=400,
    min_samples_leaf=0.12,
    random_state=42,
)

# Random forest for classification tasks
from sklearn.ensemble import RandomForestClassifier
ensemble_model = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
)

# Instantiate an ensemble AdaBoostClassifier
# AdaBoost is meant to boost a weak learner such as a stump (max_depth=1).
# The `dt` defined earlier in these notes uses max_depth=4, so pass a shallower
# tree here if a true stump is wanted.
# NOTE: scikit-learn >= 1.2 renames the `base_estimator` argument to `estimator`.
from sklearn.ensemble import AdaBoostClassifier
ensemble_model = AdaBoostClassifier(base_estimator=dt, n_estimators=100)
# The model has to be fitted before it can produce probabilities
ensemble_model.fit(X_train, y_train)
# Column 1 holds the probability of the positive class (assumes a binary target)
y_pred_proba = ensemble_model.predict_proba(X_test)[:, 1]
# Evaluate testing roc_auc_score
from sklearn.metrics import roc_auc_score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

# Gradient boosting for regression.
# Using subsample=0.8 together with max_features=0.2 is what turns this into
# stochastic gradient boosting.
from sklearn.ensemble import GradientBoostingRegressor
ensemble_model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=42,
)

# Fit whichever ensemble was instantiated on the training set
ensemble_model.fit(X_train, y_train)
# Predict on the held-out test set
y_pred = ensemble_model.predict(X_test)
# Classification: report the accuracy
print(accuracy_score(y_test, y_pred))
# Regression: RMSE is the square root of the mean squared error
rmse = MSE(y_test, y_pred) ** 0.5
# Feature importances, labelled with the column names of X, as a horizontal bar chart
importances = pd.Series(data=ensemble_model.feature_importances_, index=X.columns)
sorted_importances = importances.sort_values()
sorted_importances.plot(kind='barh', color='lightgreen')
plt.show()

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
# See what parameters can be tuned
# (`your_dt_model` is a placeholder for the estimator you want to tune)
your_dt_model.get_params()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Candidate values; the grid search tries every combination of them
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8],
}
# MAE is a loss (lower is better). greater_is_better=False makes the scorer
# return the negated MAE, so the search, which maximizes the score, picks the
# lowest error instead of the highest.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
model_cv = GridSearchCV(estimator=your_dt_model,
    param_grid=params_dt,
    cv=kf,
    scoring='neg_mean_squared_error',  # alternative: scoring=mae_scorer
    verbose=1,
    n_jobs=-1)
model_cv.fit(X_train, y_train)
best_hyperparams = model_cv.best_params_  # Get the parameters with best result
best_model = model_cv.best_estimator_  # Get the best model
best_model.get_params()  # See all parameters
y_pred = best_model.predict(X_test)  # predict with best model
# best_score_ lives on the search object, not on the fitted estimator
best_score = model_cv.best_score_
model_cv.cv_results_  # See all information from dictionary
# sklearn.externals.joblib was removed from scikit-learn; use joblib directly
import joblib
joblib.dump(best_model, 'my_best_model.pkl')  # Save the model in pkl file
```

### Decision Tree

```
# Split into train and test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size=0.3, random_state=3)

# Make sure to take into account the class imbalance
# The weights must come from the same labels the model is trained on (y_Train)
from sklearn.utils.class_weight import compute_sample_weight
w_train = compute_sample_weight('balanced', y_Train)

# Train the classifier
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(criterion="entropy", max_depth=4)
tree_clf.fit(X_Train, y_Train, sample_weight=w_train)

# Alternative approach : Train the classifier with snapml (offers multi-threaded CPU/GPU training)
# Imported under an alias so it does not shadow sklearn's DecisionTreeClassifier
from snapml import DecisionTreeClassifier as SnapDecisionTreeClassifier
snapml_dt_gpu = SnapDecisionTreeClassifier(max_depth=4, random_state=45, use_gpu=True)
snapml_dt_cpu = SnapDecisionTreeClassifier(max_depth=4, random_state=45, n_jobs=4)
snapml_dt = snapml_dt_cpu  # switch to snapml_dt_gpu to train on the GPU
snapml_dt.fit(X_Train, y_Train, sample_weight=w_train)
# Predict
y_pred = tree_clf.predict(X_Test)

### Inspecting a random forest
# Pull out one tree from the forest (If decision tree is a random forest)
# (`randomforest_model` is a placeholder for an already fitted random forest)
chosen_tree = randomforest_model.estimators_[7]  # You can visualize it with (graphviz & pydotplus)
# Extract node decisions
# Index 0 is the root node; read feature and threshold from the SAME node so
# that they describe one and the same split.
split_column = chosen_tree.tree_.feature[0]  # Get the first column it split on
split_column_name = X_Train.columns[split_column]  # Name of the column
split_value = chosen_tree.tree_.threshold[0]  # Get the threshold value it split on

# Compute predicted probabilities
# Column 1 holds the probability of the positive class (assumes a binary target)
y_pred_prob = tree_clf.predict_proba(X_Test)[:, 1]

# Evaluate tree
from sklearn.metrics import roc_auc_score, accuracy_score
# Accuracy compares the hard class predictions with the true test labels
accuracy_score(y_Test, y_pred)
# ROC AUC ranks by score, so feed it the probabilities rather than hard labels
roc_auc_score(y_Test, y_pred_prob)

# Visualize the graph using plot_tree
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
# Pass the column names as a plain list: some scikit-learn releases reject a
# pandas Index for feature_names.
plot_tree(chosen_tree, feature_names=list(X_Train.columns), filled=True, rounded=True, fontsize=10)
plt.show()
```