# Random forest

random forest regressor: multiple inputs - multiple outputs
- input
  - user: 'Orig', 'Dest', 'depDay', 'arrDay'
  - other attributes: 'con1', 'con2', 'depDay2', 'depDay3', 'elaptime', 'detour', 'paxe', 'cluster', 'TOT_pax', 'is_direct_flight', 'stops_0.0', 'stops_1.0', 'stops_2.0', 'segn_1.0', 'segn_2.0', 'segn_3.0', 'real_dist'
- output: 'market_share', 'dep_hour', 'dep_min', 'arr_hour', 'arr_min', 'con_time'

Don't think about whether it's direct or not at the moment! Go simple!

In [1]:
import pandas as pd
import numpy as np

### 1. load data

In [2]:
# read the preprocessed data
df = pd.read_csv('./dataset/dataprep_v2.csv').drop(columns=['Unnamed: 0'])
display(df.head())

Unnamed: 0,Orig,con1,con2,Dest,op_flight1,op_flight2,op_flight3,depDay,elaptime,detour,...,stops,paxe,cluster,TOT_pax,market_share,real_dist,total_time,connection_time,dep_hour,arr_hour
0,26,64,239,181,732.0,2624,1101,2.0,535.0,1.02676,...,1.0,0.0,111.0,0.0,0.0,3899.12,535.0,0.0,13,22
1,26,64,239,181,732.0,2624,1101,4.0,535.0,1.02676,...,1.0,0.0,111.0,0.0,0.0,3899.12,535.0,0.0,13,22
2,26,64,239,181,732.0,1748,1101,4.0,775.0,1.02676,...,1.0,0.0,111.0,0.0,0.0,3899.12,775.0,0.0,13,2
3,26,64,239,181,732.0,2624,1101,5.0,535.0,1.02676,...,1.0,0.0,111.0,0.0,0.0,3899.12,535.0,0.0,13,22
4,26,64,239,181,732.0,2624,1101,6.0,535.0,1.02676,...,1.0,0.0,111.0,0.0,0.0,3899.12,535.0,0.0,13,22


`A simpler model`

In [3]:
df.columns

Index(['Orig', 'con1', 'con2', 'Dest', 'op_flight1', 'op_flight2',
       'op_flight3', 'depDay', 'elaptime', 'detour', 'arrDay', 'stops', 'paxe',
       'cluster', 'TOT_pax', 'market_share', 'real_dist', 'total_time',
       'connection_time', 'dep_hour', 'arr_hour'],
      dtype='object')

In [4]:
X = df.loc[:, ['Orig', 'con1', 'con2', 'Dest', 'op_flight1', 'op_flight2', 'op_flight3',
               'depDay', 'elaptime', 'detour', 'arrDay', 'stops', 'cluster', 'real_dist',
               'total_time', 'connection_time', 'dep_hour', 'arr_hour']]
X.head()

Unnamed: 0,Orig,con1,con2,Dest,op_flight1,op_flight2,op_flight3,depDay,elaptime,detour,arrDay,stops,cluster,real_dist,total_time,connection_time,dep_hour,arr_hour
0,26,64,239,181,732.0,2624,1101,2.0,535.0,1.02676,3.0,1.0,111.0,3899.12,535.0,0.0,13,22
1,26,64,239,181,732.0,2624,1101,4.0,535.0,1.02676,5.0,1.0,111.0,3899.12,535.0,0.0,13,22
2,26,64,239,181,732.0,1748,1101,4.0,775.0,1.02676,5.0,1.0,111.0,3899.12,775.0,0.0,13,2
3,26,64,239,181,732.0,2624,1101,5.0,535.0,1.02676,6.0,1.0,111.0,3899.12,535.0,0.0,13,22
4,26,64,239,181,732.0,2624,1101,6.0,535.0,1.02676,7.0,1.0,111.0,3899.12,535.0,0.0,13,22


In [5]:
# Didn't include in the simpler model: 'paxe', 'TOT_pax' 
y = df.loc[:, 'market_share']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: market_share, dtype: float64

1.2 Splitting the dataset into training/test set
https://builtin.com/data-science/random-forest-python<br/>
https://machinelearningmastery.com/random-forest-ensemble-in-python/

In [7]:
from sklearn.model_selection import train_test_split
# random_state: controls the shuffling applied to the data before applying the split.
# Pass an int for reproducible output across multiple function calls.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# check the size of the splitted dataset
print("Shape of X_train:", X_train.shape) # expected output (418244, 18)
print("Shape of y_train:", y_train.shape) # expected output (418244,)
print("Shape of X_test:", X_test.shape) # expected output (104562, 18)
print("Shape of y_test:", y_test.shape) # expected output (104562,)

Shape of X_train: (418244, 18)
Shape of y_train: (418244,)
Shape of X_val: (104562, 18)
Shape of y_val: (104562,)


In [8]:
# Standardisation
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

### 3. Creating a random forest regression model and fitting it to the training data

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Create a random forest regressor
forest = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model to the training data
forest.fit(X_train, y_train)

In [11]:
# Predict the target variable on the training data
y_train_pred = forest.predict(X_train)
# Predict the target variable on the validation data
y_test_pred = forest.predict(X_test)


# Evaluate Model
from sklearn.metrics import mean_squared_error

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# Report the result
print('Random Forest')
print('--------------------------------------------------------------------------------\n')
print('MSE (Training) = %.4f' % mse_train)
print('MSE (Testing)  = %.4f' % mse_test)

Random Forest
--------------------------------------------------------------------------------

MSE (Training) = 0.0005
MSE (Testing)  = 0.0051


#### Hyperparameter tuning

In [12]:
from pprint import pprint
# Parameters in use for basic model above
print('Parameters currently in use:\n')
pprint(forest.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}


#### Random hyperparameter grid

In [13]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=20, stop=1000, num=10)] # didn't necessarily return the best result w/ high n_estimators
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
               #'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               #'bootstrap': bootstrap}

pprint(random_grid)

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [20, 128, 237, 346, 455, 564, 673, 782, 891, 1000]}


#### Random search training

In [14]:
# activate GPU
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  print('GPU device not found')

GPU device not found


In [15]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 3 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=3, cv=3, verbose=2, random_state=0, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


  warn(


In [16]:
# View the best parameters from fitting the random search:
rf_random.best_params_

{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 60}

### 4. Validate the model

#### evaluate the random search

In [19]:
def evaluate(model, X_test, y_test):
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  return mse

# Base Model Performance
base_model = RandomForestRegressor(random_state=0, n_estimators=20)
base_model.fit(X_train, y_train)
base_mse = evaluate(base_model, X_test, y_test)

print('Model Performance')
print('--------------------------------------------------------------------------------\n')
print('MSE (Base model)  = %.4f' % base_mse)

Model Performance
--------------------------------------------------------------------------------

MSE (Base model)  = 0.0054


In [23]:
# Random Search Model performance
best_random = rf_random.best_estimator_
random_mse = evaluate(best_random, X_test, y_test)
print('Improvement of {:0.2f}%.'.format(100 * (base_mse - random_mse) / base_mse))

Improvement of 8.10%.


In [22]:
print(base_model.score(X_test, y_test))
print(best_random.score(X_test, y_test))

-0.07928579831927829
0.008127420721369627


### 5. Save model

### 6. Predict market share for the kaggle competition dataset

Load data

In [24]:
kaggle_data = pd.read_csv('./dataset/testset_kaggle_prep.csv').drop(columns='Unnamed: 0')
kaggle_data

Unnamed: 0,Orig,con1,con2,Dest,op_flight1,op_flight2,op_flight3,depDay,elaptime,detour,arrDay,stops,cluster,id,real_dist,total_time,connection_time,dep_hour,arr_hour
0,26,64,239,181,732,2315,827,2,535,1.02676,3,1,111,1,3899.12,535.0,0.0,13,22
1,26,64,239,181,732,2315,827,4,535,1.02676,5,1,111,2,3899.12,535.0,0.0,13,22
2,26,64,239,181,732,2315,827,5,535,1.02676,6,1,111,3,3899.12,535.0,0.0,13,22
3,26,64,239,181,732,1601,827,5,775,1.02676,6,1,111,4,3899.12,775.0,0.0,13,2
4,26,64,239,181,732,2315,827,6,535,1.02676,7,1,111,5,3899.12,535.0,0.0,13,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241520,41,41,239,26,1703,1950,827,2,1290,1.77380,3,1,1601,241521,8143.00,1290.0,0.0,8,6
241521,41,26,239,26,874,1724,827,7,686,1.06290,1,1,1601,241522,6034.81,686.0,0.0,23,10
241522,1,1,239,26,1516,1317,827,2,1541,1.13030,4,1,1601,241523,10336.84,1541.0,0.0,6,8
241523,1,1,63,26,2691,1,131,6,1430,1.23530,7,2,1601,241524,11651.70,1430.0,2146090.0,16,15


In [25]:
kaggle_data.columns

Index(['Orig', 'con1', 'con2', 'Dest', 'op_flight1', 'op_flight2',
       'op_flight3', 'depDay', 'elaptime', 'detour', 'arrDay', 'stops',
       'cluster', 'id', 'real_dist', 'total_time', 'connection_time',
       'dep_hour', 'arr_hour'],
      dtype='object')

standardisation

In [26]:
#  'id', / paxe, TOT_pax, market_share
X_k = kaggle_data.loc[:, ['Orig', 'con1', 'con2', 'Dest', 'op_flight1', 'op_flight2', 'op_flight3',
                          'depDay', 'elaptime', 'detour', 'arrDay', 'stops', 'cluster', 'real_dist',
                          'total_time', 'connection_time', 'dep_hour', 'arr_hour']]

sc_X = StandardScaler()
X_k = sc_X.fit_transform(X_k)

prediction on new data

In [28]:
# Create a Random Forest Regressor
forest_k = RandomForestRegressor(n_estimators=1000, random_state=0, max_depth=60)

# Fit the Random Forest Regressor to the training data
forest_k.fit(X_train, y_train)

In [30]:
# get the predicted y values for the validation set
y_k_pred = forest.predict(X_k)
print("Random Forest predicted values for the kaggle dataset:", y_k_pred)

Random Forest predicted values for the kaggle dataset: [0.0002 0.0004 0.0005 ... 0.0011 0.0987 0.0072]


In [33]:
# get the id column from the original dataframe
id_column = kaggle_data['id'].reset_index(drop=True)

# create dataframes for predicted values with id column
val_rf = pd.concat([id_column, pd.DataFrame(y_k_pred, columns=["Predicted"])], axis=1)

# save dataframes as CSV files
val_rf.to_csv('./dataset/kaggle_pred_v2.csv', index=False)

## Try out multioutput regressor
https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html#sklearn.multioutput.MultiOutputRegressor
https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html

In [34]:
X_multirf = df.loc[:, ['Orig', 'con1', 'con2', 'Dest', 'op_flight1', 'op_flight2', 'op_flight3',
                       'depDay', 'elaptime', 'detour', 'arrDay', 'stops', 'cluster', 'real_dist',
                       'total_time', 'connection_time']]
y_multirf = df.loc[:, ['market_share', 'paxe', 'TOT_pax', 'dep_hour', 'arr_hour']]

X_multirf_train, X_multirf_test, y_multirf_train, y_multirf_test = train_test_split(X_multirf, y_multirf, train_size=0.8, random_state=0)

In [35]:
sc_X = StandardScaler()
X_multirf_train = sc_X.fit_transform(X_multirf_train)
X_multirf_test = sc_X.fit_transform(X_multirf_test)

In [36]:
from sklearn.multioutput import MultiOutputRegressor

# Create a model & Fit into the model
regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=1000, max_depth=60, random_state=0)
)
regr_multirf.fit(X_multirf_train, y_multirf_train)

# Predict on new data
y_m_pred_train = regr_multirf.predict(X_multirf_train)
y_m_pred_test = regr_multirf.predict(X_multirf_test)

In [37]:
print('Multioutput Regressor')
print('--------------------------------------------------------------------------------\n')
multirf_train_mse = mean_squared_error(y_multirf_train, y_m_pred_train)
multirf_test_mse = mean_squared_error(y_multirf_test, y_m_pred_test)
print("MSE (Training) = %.4f" % multirf_train_mse)
print("MSE (Testing) = %.4f" % multirf_test_mse)

Multioutput Regressor
--------------------------------------------------------------------------------

MSE (Training) = 1757.2342
MSE (Testing) = 37024.5580


In [41]:
X_k_multirf = kaggle_data.loc[:, ['Orig', 'con1', 'con2', 'Dest', 'op_flight1', 'op_flight2', 'op_flight3',
                          'depDay', 'elaptime', 'detour', 'arrDay', 'stops', 'cluster', 'real_dist',
                          'total_time', 'connection_time']]
X_k_multirf = sc_X.fit_transform(X_k_multirf)

In [42]:
y_m_pred_kaggle = regr_multirf.predict(X_k_multirf)
print("Random Forest predicted values for the kaggle dataset:", y_m_pred_kaggle)

Random Forest predicted values for the kaggle dataset: [[1.90500000e-03 2.98993333e-01 2.24685010e+03 1.27420000e+01
  2.00570000e+01]
 [2.24000000e-03 2.78640000e-01 2.24688588e+03 1.27380000e+01
  2.00560000e+01]
 [2.74000000e-03 2.69753333e-01 2.24688588e+03 1.27380000e+01
  1.99950000e+01]
 ...
 [1.87000000e-03 5.86806667e-01 2.51715387e+03 6.29500000e+00
  6.27300000e+00]
 [1.27080000e-01 1.42416000e+00 2.92813001e+02 1.46250000e+01
  1.07930000e+01]
 [1.17950000e-02 2.75026567e+00 6.48273656e+02 2.03410000e+01
  1.34410000e+01]]


In [47]:
df_kaggle = pd.DataFrame(y_m_pred_kaggle, columns=['market_share', 'paxe', 'TOT_pax', 'dep_hour', 'arr_hour'])
df_kaggle = pd.concat([id_column, df_kaggle], axis=1)
df_kaggle.to_csv('./dataset/kaggle_pred_multirf.csv')
display(df_kaggle)

Unnamed: 0,id,market_share,paxe,TOT_pax,dep_hour,arr_hour
0,1,0.001905,0.298993,2246.850104,12.742,20.057
1,2,0.002240,0.278640,2246.885880,12.738,20.056
2,3,0.002740,0.269753,2246.885880,12.738,19.995
3,4,0.001080,0.301000,2246.885880,12.920,3.755
4,5,0.002220,0.289820,2246.885880,12.738,19.932
...,...,...,...,...,...,...
241520,241521,0.505925,0.619200,5.656426,10.930,6.005
241521,241522,0.080160,0.910824,256.729315,8.268,9.533
241522,241523,0.001870,0.586807,2517.153866,6.295,6.273
241523,241524,0.127080,1.424160,292.813001,14.625,10.793


In [54]:
df_kaggle_submit = pd.concat([id_column, df_kaggle['market_share']], axis=1)
df_kaggle_submit = df_kaggle_submit.rename(columns={'market_share': 'Predicted'})
df_kaggle_submit

Unnamed: 0,id,Predicted
0,1,0.001905
1,2,0.002240
2,3,0.002740
3,4,0.001080
4,5,0.002220
...,...,...
241520,241521,0.505925
241521,241522,0.080160
241522,241523,0.001870
241523,241524,0.127080


In [56]:
df_kaggle_submit.to_csv('./dataset/kaggle_pred_multirf_submission.csv', index=False)