In [2]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd 
import graphviz 

# [Random Forest](https://www.datascience.com/resources/notebooks/random-forest-intro)

### What is it?
It is a non-parametric classification / regression model, which combines a set of other models - desicion trees.

### When do I use it?
I want to predict / classify a response variable based on arbitrary data. Be it countinous (regression problem) or categorical (classification problem) response. I care more about prediction performance then model explicability / model simplicity.

### Why should I use it?
Its simple to setup. Its reasonably good.

### Why should I NOT use it?
Its not a state of the art ML model. 

# How does it work?

In previous lecture we have shown how Decision Trees [DS] work. The downside of DS is that they tend to overfit. We can fix this problem by trying different leaf node sizes and optimize this parameter. Other (and better) way is to train multiple DTs each on just a sample of training data and make the decisions based on averaging all decisions from the group of DTs. This group of DTs is called Random Forest [RF].

*Note: The word Random in RF means, that the data for each DT in the forest is chosen randomly (columns and rows).*

----------------------------------------------
### Problem 1 - Math students' score regression
----------------------------------------------

## Dataset - FJFI Mathematical Analysis students' results table

We have taken data for 2 years in http://www.krbalek.cz/for_students/Files_to_load/Zapocty_MAB3_1718.pdf table. Our goal will be to predict the total points score (total_points variable) for a student based on his performance during the year before the second test.

## Load the data
At first we load the data into a Pandas DataFrame.

In [3]:
filepath = 'Data/Zapocty_MAB3.csv'
data_students = pd.read_csv(filepath, sep = ';')
data_students.head(10)

Unnamed: 0,name,departement,tutor,activity_class,activity_training,tests_training,test1,test2,total_points,creddited,Unnamed: 10
0,Babjak Daniel,EJCF,Strachota,,1.0,11.0,7.5,3.0,22.5,ne,
1,Bár Filip,AMSM,Hobza,,1.0,9.0,29.5,23.5,63.0,ano,2.0
2,Barborková Lucie,DAIZ,Hobza,,1.0,14.0,9.0,26.0,50.0,ano,
3,Bezányi Emanuel,EJCF,Strachota,,1.0,4.5,7.0,0.0,12.5,ne,
4,Bílý Vít,FTTF,Strachota,,1.0,11.5,17.5,27.5,57.5,ano,1.0
5,Bobek Josef,EJCF,Strachota,2.0,2.0,14.0,18.0,14.0,50.0,ano,
6,Bradshaw Petr,FE,Strachota,4.0,1.0,19.0,16.0,34.0,74.0,ano,4.0
7,Brisučiak Tomáš,EJCF,Strachota,,1.0,12.0,24.5,26.0,63.5,ano,2.0
8,Čada Jan,JIB,Kostková,,1.0,12.5,19.0,27.5,60.0,ano,2.0
9,Ďatková Michaela,EJCF,Strachota,,0.0,5.0,5.5,0.0,10.5,ne,


## Clean the data
We need to replace the missing (NaN) values with resonable data. In our case it is 0 since we are tolking about scored points of a student. Also we will get rid of last column because be don't know what data is carries and columns test2, creddited because we want to predict total score before this happens.

In [4]:
data_students.columns
data_students.fillna(0, inplace = True)
#data_students.drop(['Unnamed: 10', 'test2', 'creddited', 'name', 'departement', 'tutor'], axis = 1, inplace = True)
data_students.drop(['Unnamed: 10', 'test2', 'creddited', 'name'], axis = 1, inplace = True)
data_students.head(10)

Unnamed: 0,departement,tutor,activity_class,activity_training,tests_training,test1,total_points
0,EJCF,Strachota,0.0,1.0,11.0,7.5,22.5
1,AMSM,Hobza,0.0,1.0,9.0,29.5,63.0
2,DAIZ,Hobza,0.0,1.0,14.0,9.0,50.0
3,EJCF,Strachota,0.0,1.0,4.5,7.0,12.5
4,FTTF,Strachota,0.0,1.0,11.5,17.5,57.5
5,EJCF,Strachota,2.0,2.0,14.0,18.0,50.0
6,FE,Strachota,4.0,1.0,19.0,16.0,74.0
7,EJCF,Strachota,0.0,1.0,12.0,24.5,63.5
8,JIB,Kostková,0.0,1.0,12.5,19.0,60.0
9,EJCF,Strachota,0.0,0.0,5.0,5.5,10.5


In [5]:
# Include dummies - you can add the dummies for categorical data to the model, but the dataset is quite small, so doing so can lead to overfitting.
data_students = pd.get_dummies(data_students)
data_students.head(10)

Unnamed: 0,activity_class,activity_training,tests_training,test1,total_points,departement_AMSM,departement_ASI,departement_DAIZ,departement_DM,departement_EJCF,...,departement_FTTF,departement_IPL,departement_JIB,departement_MIAMSM,departement_MM,tutor_0,tutor_Hobza,tutor_Kollert,tutor_Kostková,tutor_Strachota
0,0.0,1.0,11.0,7.5,22.5,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,0.0,1.0,9.0,29.5,63.0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.0,1.0,14.0,9.0,50.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.0,1.0,4.5,7.0,12.5,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0.0,1.0,11.5,17.5,57.5,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
5,2.0,2.0,14.0,18.0,50.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
6,4.0,1.0,19.0,16.0,74.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0.0,1.0,12.0,24.5,63.5,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
8,0.0,1.0,12.5,19.0,60.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
9,0.0,0.0,5.0,5.5,10.5,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


## Train - Test Dataset split
In order to test how our model performes, we need to split the data into trainin and testing part. Usualy this means just to randomly pick 1/4 of data and label it as a test dataset.

In [6]:
X = data_students.drop('total_points', axis = 1)
y = data_students['total_points']

train_X, val_X, train_y, val_y = train_test_split(X, y)

## Model build
Now we set the model.

In [7]:
# create DT model with minimum of 4 smples in each leaf
model_tree = DecisionTreeRegressor(min_samples_leaf = 4)
model_tree.fit(train_X, train_y)

# lets look at the graph of the decision tree
dot_data = export_graphviz(model_tree, out_file=None, feature_names=list(X.columns)) 
graph = graphviz.Source(dot_data)
graph.render("Zapocty_MAB3_tree")

# VALIDATE THE MODEL
val_predictions = model_tree.predict(val_X)
MAE_tree = mean_squared_error(val_y, val_predictions)
print("Decision Decision Tree model error is {}".format(MAE_tree))

Decision Decision Tree model error is 124.61661993964253


## Model validation
In order to state how our model performes, we just use Mean Square Error of predictions on our test set and predicted values. 

In [8]:
n_runs = 100
MAE_tree_list = []
for i in range(n_runs):
    X = data_students.drop('total_points', axis = 1)
    y = data_students['total_points']

    train_X, val_X, train_y, val_y = train_test_split(X, y)
    
    # create DT model with minimum of 4 smples in each leaf
    model_tree = DecisionTreeRegressor(min_samples_leaf = 4)
    model_tree.fit(train_X, train_y)

    # VALIDATE THE MODEL
    val_predictions = model_tree.predict(val_X)
    MAE_tree = mean_squared_error(val_y, val_predictions)
    #print("Decision Decision Tree model error is {}".format(MAE_tree))
    MAE_tree_list.append(MAE_tree)
MAE_tree_mean = np.mean(MAE_tree_list)
print("Decision Decision Tree model squared error from {} runs is {}".format(n_runs, MAE_tree_mean))

Decision Decision Tree model squared error from 100 runs is 158.91065195963802


## Lets try the same Random Forest model

In [9]:
# create RF model with minimum of 4 smples in each leaf of a underliing trees
model_forest = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 4)
model_forest.fit(train_X, train_y)

# VALIDATE THE MODEL
val_predictions = model_forest.predict(val_X)
MAE_forest = mean_squared_error(val_y, val_predictions)
print("Decision Random Forest model error is {}".format(MAE_forest))

Decision Random Forest model error is 84.89135113821109


In [10]:
n_runs = 100
MAE_forest_list = []
for i in range(n_runs):
    X = data_students.drop('total_points', axis = 1)
    y = data_students['total_points']

    train_X, val_X, train_y, val_y = train_test_split(X, y)
    
    # create DT model with minimum of 4 smples in each leaf
    model_forest = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 4)
    model_forest.fit(train_X, train_y)

    # VALIDATE THE MODEL
    val_predictions = model_forest.predict(val_X)
    MAE_forest = mean_squared_error(val_y, val_predictions)
    MAE_forest_list.append(MAE_forest)
MAE_forest_mean = np.mean(MAE_forest_list)    
print("Random Forest model squared error from {} runs is {}".format(n_runs, MAE_forest_mean))

Random Forest model squared error from 100 runs is 112.97665415228944


In [15]:
# last model feature importance
feature_importances = pd.DataFrame(model_forest.feature_importances_, index = train_X.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
tests_training,0.523931
test1,0.454057
tutor_Strachota,0.011504
activity_training,0.006321
departement_EJCF,0.001539
activity_class,0.001528
tutor_Kostková,0.000568
departement_FE,0.000252
departement_FTTF,0.000163
tutor_Kollert,7.9e-05


----------------------------------------------
### Problem 2 - Analytical function regression
----------------------------------------------

In [446]:
# ANALYTICAL FUNCTION REGRESSION PROBLEM
# create X data for the regression
X = np.arange(0, 10, 0.1)
# add some salt to simulate noise
salt0 = [6*(np.random.rand() > .4)*(-1)**(np.random.rand() > .5) for x in X]
salt1 = [30*(np.random.rand() > .9)*(-1)**(np.random.rand() > .5) for x in X]
salt2 = [75*(np.random.rand() > .95)*(-1)**(np.random.rand() > .5) for x in X]
salt_outlier = [200*(np.random.rand() > .98)*(-1)**(np.random.rand() > .5) for x in X]
Y_pure = [50*np.sin(x)+x*15 for x in X]
# create Y data for the regression
Y = np.sum([Y_pure, salt0, salt1, salt2, salt_outlier], axis = 0)

# plot the [X, Y] data
iplot([go.Scatter(x=X, y=Y, mode = 'markers')])

In [447]:
# set the parameter for minimal number of data points in leaf nodes
n_leafs = 4

# initialize the model object instance
clf = DecisionTreeRegressor(min_samples_leaf = n_leafs)
# fit the model
clf = clf.fit(X.reshape(-1,1), Y)
# make the prediction for the data on which the model was fitted
Y_hat = clf.predict(X.reshape(-1,1))

# FORESTS MODEL
# initialize the model object instance
clf_forest = RandomForestRegressor(n_estimators = 1000, min_samples_leaf = n_leafs)
# fit the model
clf_forest = clf_forest.fit(X.reshape(-1,1), Y)
# make the prediction for the data on which the model was fitted
Y_hat_forest = clf_forest.predict(X.reshape(-1,1))

# plot the data & fitted values with plotly
trace0 = go.Scatter(x=X, y=Y, mode = 'markers', name = 'Original data')
trace1 = go.Scatter(x=X, y=Y_hat, name = 'Prediction_Tree')
trace2 = go.Scatter(x=X, y=Y_hat_forest, name = 'Prediction_Forest')
data = [trace0, trace1, trace2]
iplot(data)

## Sources
[Scikit documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)

[Wiki](https://en.wikipedia.org/wiki/Random_forest)

[Article1](https://towardsdatascience.com/the-random-forest-algorithm-d457d499ffcd)

[Article2](https://towardsdatascience.com/random-forest-in-python-24d0893d51c0)

[Article3](https://www.datascience.com/resources/notebooks/random-forest-intro)
