# Validation
### Checking the relevance of feature engineering and correlation analysis for removing unnecessary variables and improving accuracy

The aim is to create a model to predict the next headway using:
* all variables
* selected variables

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# City whose data is analysed in this notebook.
# Alternatives: "CG", "Curitiba" (swap the value below to switch).
city = "Recife"

In [None]:
# Per-city output directory, located relative to the current working directory.
path = "".join([os.getcwd(), "/../data/output/", city, "/"])

# File names: one dataset with all variables, one with the selected variables.
all_data = "new_feats_data.csv"
selected_data = "selected_data.csv"

# Full paths to both datasets.
integrated_data_path = path + all_data
selected_data_path = path + selected_data

In [None]:
# Load the dataset that contains all variables.
# NOTE: loading the selected-variables dataset is currently disabled:
# df_selected_data = pd.read_csv(selected_data_path)
df_all_data = pd.read_csv(filepath_or_buffer=integrated_data_path)

### Converting categorical variables

In [None]:
# One-hot encode the categorical columns (pandas picks the columns to encode).
df_all_data = pd.get_dummies(data=df_all_data)

### Training prediction model

In [None]:
# Target: the 'headway' column.
# (The previous code called drop(['headway']) without an axis, which looks for
# a *row* labelled 'headway', and assigned the result to the target.)
y_train_all = df_all_data['headway']

# Features: every column except the target, so the model never sees the value
# it is supposed to predict.
X_train_all = df_all_data.drop(['headway'], axis=1)

#### Modelo RF (Random Forest)
Random Forest é um algoritmo que ajusta várias árvores de decisão de regressão em várias subamostras do conjunto de dados e usa a média para melhorar a precisão preditiva e controlar o ajuste excessivo. A quantidade de árvores (n_estimators) é um parâmetro cujo valor deve ser encontrado para que o modelo melhor represente os dados.

In [None]:
def rmse_cv(model, X_train, y_train):
    """Return the cross-validated RMSE values of *model*.

    The model is evaluated with ``cross_val_score`` using 10 folds and the
    ``neg_mean_squared_error`` scorer; the scores are negated and the square
    root is taken, so the result holds RMSE values instead of negative MSE.
    """
    neg_mse_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    return np.sqrt(-neg_mse_scores)

def print_coefficients(model):
    """Print the fitted coefficients of *model* as a polynomial.

    Reads ``model.coef_`` and ``model.intercept_``; the intercept is added to
    the polynomial before printing. Returns nothing.
    """
    # np.poly1d takes coefficients from the highest power down, so the
    # coefficient list is reversed first (assumes coef_ is ordered from the
    # lowest power up -- confirm against how the features were built).
    descending_coefs = list(model.coef_)[::-1]
    polynomial = np.poly1d(descending_coefs) + model.intercept_
    print(polynomial)

def train_model(model, X_train, y_train, print_coef, X_val=None, y_val=None):
    """Fit *model* on the training data and return its score.

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train, y_train: data the model is fitted on.
        print_coef: when truthy, the fitted coefficients are printed with
            ``print_coefficients``.
        X_val, y_val: optional data the score is computed on. They must be
            given together.

    Returns:
        Whatever ``model.score`` returns for the evaluation data.

    Raises:
        ValueError: if only one of ``X_val`` / ``y_val`` is provided.

    When no evaluation data is passed, the function keeps its historical
    behaviour of using the notebook-level ``X_validation`` / ``y_validation``
    variables if both exist. If they do not exist it scores on the training
    data instead of failing with a NameError; in that case the returned value
    is a training score, not a validation score.
    """
    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together")

    model.fit(X_train, y_train)
    if print_coef:
        print_coefficients(model)

    if X_val is None:
        scope = globals()
        if "X_validation" in scope and "y_validation" in scope:
            # Backward compatibility with the original global-based lookup.
            X_val, y_val = scope["X_validation"], scope["y_validation"]
        else:
            X_val, y_val = X_train, y_train

    return model.score(X_val, y_val)

def plot_rmse_param(series, param_name):
    """Plot *series* and label the chart for the given parameter.

    The chart title is "Validation Error vs <param_name>", the x axis is
    labelled with ``param_name`` and the y axis with "RMSE".
    """
    chart_title = "Validation Error vs " + param_name
    series.plot(title=chart_title)
    plt.ylabel("RMSE")
    plt.xlabel(param_name)
    
def best_rmse_param(series):
    """Return ``(lowest value, index label of the lowest value)`` of *series*.

    With a series of RMSE values indexed by parameter value, this is the best
    RMSE and the parameter value that produced it.
    """
    return series.min(), series.idxmin()

In [None]:
# Candidate forest sizes to compare.
n_estimators = [1, 10, 50, 100, 1000]

# Mean cross-validated RMSE for each candidate size.
cv_rf_rmse = []
for n in n_estimators:
    forest = RandomForestRegressor(n_estimators=n)
    fold_rmse = rmse_cv(forest, X_train_all, y_train_all)
    cv_rf_rmse.append(fold_rmse.mean())

# Index the errors by forest size so the plot and the best-parameter lookup
# can refer to the parameter value directly.
series = pd.Series(cv_rf_rmse, index=n_estimators)
plot_rmse_param(series, "n_estimators")

In [None]:
# Lowest mean RMSE and the number of trees that produced it.
best_rmse_rf, best_param_rf = best_rmse_param(series)

# Fit a forest with the best number of trees and get its score.
model_rf = RandomForestRegressor(n_estimators = best_param_rf)
score_rf = train_model(model_rf, X_train_all, y_train_all, False)

# Parenthesised single-argument print works as a Python 2 statement and as a
# Python 3 function call; the message text is unchanged.
print(u"Modelo RF, com os dados de 2006 e 2010: \nEstimators = {0} \nRMSE = {1} \nScore = {2}".format(best_param_rf, best_rmse_rf, score_rf))