# DSO106 MachineLearn L3 - decisionTree randomForest

## Decision Trees and Random Forests

## Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Load in data

In [None]:
# Load the 'iris' dataset through seaborn's load_dataset helper and keep it as `iris`.
iris = sns.load_dataset('iris')

# Decision Trees In Python

## Data Wrangling

In [None]:
# Target: the 'species' column. Features: every other column of `iris`.
y = iris['species']
x = iris.drop(columns='species')

## Train Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=76,
)

## Create the Initial Decision Tree

In [None]:
# Create a decision tree classifier with a fixed random_state, then fit it
# on the training split. The fit call is the cell's last expression, so the
# notebook displays the fitted estimator.
decisionTree = DecisionTreeClassifier(random_state=76)
decisionTree.fit(x_train, y_train)

### Assess the Model

In [None]:
# Predict a label for every row of the held-out test features.
treePredictions = decisionTree.predict(x_test)

In [None]:
# Tabulate the true test labels against the decision tree's predictions.
tree_confusion = confusion_matrix(y_test, treePredictions)
print(tree_confusion)

### Seeing how well the model fits the data

In [None]:
# Text report for the decision tree's predictions on the test split.
tree_report = classification_report(y_test, treePredictions)
print(tree_report)

# Random Forests in Python

In [None]:
# RandomForestClassifier has not been imported at this point in the notebook,
# so this cell demonstrates the resulting NameError. The error is caught and
# printed so that "Restart Kernel -> Run All" does not stop here.
try:
    forest = RandomForestClassifier(n_estimators=500, random_state=76)
    forest.fit(x_train, y_train)
except NameError as err:
    print(f"NameError: {err}")

# Code for the Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Now it will work

In [None]:
# Build a 500-tree random forest with a fixed random_state and fit it on the
# training split. The fit call is the last expression, so the fitted
# estimator is displayed.
forest = RandomForestClassifier(n_estimators=500, random_state=76)
forest.fit(x_train, y_train)

# Next is to create predictions and run a report on the model

In [None]:
# Predict on the test features, then print the confusion matrix followed by
# the classification report for those predictions.
forestPredictions = forest.predict(x_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, forestPredictions))

# Model is 96% accurate

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Sweep over forest sizes: fit one forest per size and record its accuracy.
# NOTE(review): accuracy is scored on the test split, so choosing
# n_estimators from these numbers tunes against the test data.
n_estimators_array = [1, 4, 5, 8, 10, 20, 50, 75, 100, 250, 500]
results = []
for n in n_estimators_array:
    # fit() returns the estimator itself, so construction and fitting chain.
    forest = RandomForestClassifier(n_estimators=n, random_state=76).fit(x_train, y_train)
    result = accuracy_score(y_test, forest.predict(x_test))
    results += [result]
    print(n, ':', result)

## The highest accuracy occurs with a 10-tree random forest

In [None]:
# Plot accuracy against forest size on an explicit figure/axes pair, with
# labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(n_estimators_array, results)
# The trailing semicolon suppresses the repr of ax.set's return value.
ax.set(xlabel='n_estimators', ylabel='Accuracy', title='Random forest accuracy by number of trees');

# Tuning the remaining three hyperparameters

In [None]:
# Number of features to consider at every split.
# 'sqrt' replaces the former 'auto' alias: for classifiers 'auto' meant
# sqrt(n_features), and scikit-learn 1.3 removed the alias.
max_features = ['sqrt', None, 'log2']
# Maximum number of levels in tree
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, None]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Search space for the hyperparameter search: 3 * 10 * 3 = 90 combinations
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

In [None]:
# Base estimator for the search: a 10-tree forest. The seed (the same value
# used for the other models in this notebook) makes the cross-validation
# scores, and therefore the selected parameters, repeatable between runs.
rf = RandomForestClassifier(n_estimators=10, random_state=76)
# Randomized search over random_grid with 3-fold cross-validation, sampling
# up to 90 parameter settings.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=90, cv=3, random_state=42)

In [None]:
# Run the randomized search on the training split.
rf_random.fit(x_train, y_train)

In [None]:
# Display the fitted search's best_params_ attribute.
rf_random.best_params_

In [None]:
# Refit a 10-tree forest with hand-entered hyperparameter values.
# max_features="sqrt" is the classifier equivalent of the former "auto"
# alias, which scikit-learn 1.3 removed; random_state makes the fitted
# forest (and the feature importances read from it) repeatable.
# NOTE(review): these values are hard-coded rather than read from the search
# result -- confirm they still match rf_random.best_params_.
forest = RandomForestClassifier(n_estimators=10, min_samples_leaf=4, max_features="sqrt", max_depth=30, random_state=76)
forest.fit(x_train, y_train)

In [None]:
# Score the refitted forest on the test split: confusion matrix first, then
# the classification report.
forestPredictions = forest.predict(x_test)
forest_confusion = confusion_matrix(y_test, forestPredictions)
forest_report = classification_report(y_test, forestPredictions)
print(forest_confusion)
print(forest_report)

# Feature Importance in Python

In [None]:
# Pair each importance value from the fitted forest with the matching column
# name of x, then display the resulting Series.
feature_importances = pd.Series(forest.feature_importances_, index=x.columns)
feature_importances

# Print in descending order

In [None]:
# Order the importances from largest to smallest, then print them.
feature_importances = feature_importances.sort_values(ascending=False)
print(feature_importances)

In [None]:
# Horizontal bar chart of the importances, drawn with figsize (7, 6).
feature_importances.plot.barh(figsize=(7, 6))

# Accessing an API

In [None]:
pip install quandl

In [None]:
import quandl

In [None]:
# Request the dataset with code "FMAC/HPI_AK" through quandl.get and show
# its first rows.
# NOTE(review): no API key is configured anywhere in the visible code --
# confirm that unauthenticated access is sufficient for this request.
Alaska = quandl.get("FMAC/HPI_AK")
Alaska.head()

## An API allows Python to access data from a website