# Random forest

random forest regressor: multiple inputs - multiple outputs
- input
  - user: 'Orig', 'Dest', 'depDay', 'arrDay'
  - other attributes: 'con1', 'con2', 'depDay2', 'depDay3', 'elaptime', 'detour', 'paxe', 'cluster', 'TOT_pax', 'is_direct_flight', 'stops_0.0', 'stops_1.0', 'stops_2.0', 'segn_1.0', 'segn_2.0', 'segn_3.0', 'real_dist'
- output: 'market_share', 'dep_hour', 'dep_min', 'arr_hour', 'arr_min', 'con_time'

Don't think about whether it's direct or not at the moment! Go simple!

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### 1. load data

In [4]:
# read the preprocessed data
df = pd.read_csv('/content/drive/MyDrive/pdsp/dataprep_v3.csv').drop(columns=['Unnamed: 0'])
display(df.head())

Unnamed: 0,Orig,con1,con2,Dest,depDay,elaptime,detour,arrDay,paxe,cluster,...,segn_1.0,segn_2.0,segn_3.0,real_dist,dep_hour,dep_min,arr_hour,arr_min,con_time,cross_day
0,26,64,103,181,2.0,535.0,1.02676,3.0,0.0,111.0,...,0,1,0,3899.12,15,45,1,40,0,1
1,26,64,103,181,4.0,535.0,1.02676,5.0,0.0,111.0,...,0,1,0,3899.12,15,45,1,40,0,1
2,26,64,103,181,4.0,775.0,1.02676,5.0,0.0,111.0,...,0,1,0,3899.12,15,45,5,40,0,1
3,26,64,103,181,5.0,535.0,1.02676,6.0,0.0,111.0,...,0,1,0,3899.12,15,45,1,40,0,1
4,26,64,103,181,6.0,535.0,1.02676,7.0,0.0,111.0,...,0,1,0,3899.12,15,45,1,40,0,1


`A simpler model`

In [5]:
df.columns

Index(['Orig', 'con1', 'con2', 'Dest', 'depDay', 'elaptime', 'detour',
       'arrDay', 'paxe', 'cluster', 'TOT_pax', 'market_share',
       'is_direct_flight', 'stops_0.0', 'stops_1.0', 'stops_2.0', 'segn_1.0',
       'segn_2.0', 'segn_3.0', 'real_dist', 'dep_hour', 'dep_min', 'arr_hour',
       'arr_min', 'con_time', 'cross_day'],
      dtype='object')

In [6]:
# # of connections & how much time we should wait for connecting flights
# Didn't include -- 'con_time' as couldn't calculte in the kaggle dataset
X = df.loc[:, ['Orig', 'con1', 'con2', 'Dest', 'depDay', 'elaptime', 'detour',
       'arrDay', 'cluster', 'is_direct_flight', 'stops_0.0', 'stops_1.0', 'stops_2.0', 'segn_1.0',
       'segn_2.0', 'segn_3.0', 'real_dist', 'dep_hour', 'dep_min', 'arr_hour',
       'arr_min', 'cross_day']]
X.head()

Unnamed: 0,Orig,con1,con2,Dest,depDay,elaptime,detour,arrDay,cluster,is_direct_flight,...,stops_2.0,segn_1.0,segn_2.0,segn_3.0,real_dist,dep_hour,dep_min,arr_hour,arr_min,cross_day
0,26,64,103,181,2.0,535.0,1.02676,3.0,111.0,0,...,0,0,1,0,3899.12,15,45,1,40,1
1,26,64,103,181,4.0,535.0,1.02676,5.0,111.0,0,...,0,0,1,0,3899.12,15,45,1,40,1
2,26,64,103,181,4.0,775.0,1.02676,5.0,111.0,0,...,0,0,1,0,3899.12,15,45,5,40,1
3,26,64,103,181,5.0,535.0,1.02676,6.0,111.0,0,...,0,0,1,0,3899.12,15,45,1,40,1
4,26,64,103,181,6.0,535.0,1.02676,7.0,111.0,0,...,0,0,1,0,3899.12,15,45,1,40,1


In [7]:
# Didn't include -- 'paxe', 'TOT_pax'
y = df.loc[:, 'market_share']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: market_share, dtype: float64

1.2 Splitting the dataset into training/test set
https://builtin.com/data-science/random-forest-python<br/>
https://machinelearningmastery.com/random-forest-ensemble-in-python/

In [8]:
from sklearn.model_selection import train_test_split
# random_state: controls the shuffling applied to the data before applying the split.
# Pass an int for reproducible output across multiple function calls.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# check the size of the splitted dataset
print("Shape of X_train:", X_train.shape) # expected output (418244, 22)
print("Shape of y_train:", y_train.shape) # expected output (418244,)
print("Shape of X_test:", X_test.shape) # expected output (104562, 22)
print("Shape of y_test:", y_test.shape) # expected output (104562,)

Shape of X_train: (418244, 22)
Shape of y_train: (418244,)
Shape of X_test: (104562, 22)
Shape of y_test: (104562,)


In [9]:
# Standardisation
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

### 3. Creating a random forest regression model and fitting it to the training data

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Create a random forest regressor
forest = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model to the training data
forest.fit(X_train, y_train)

In [12]:
# Predict the target variable on the training data
y_train_pred = forest.predict(X_train)

# Predict the target variable on the test data
y_test_pred = forest.predict(X_test)

if (y_train_pred.all() < 2.5):
  y_train_pred = np.round(y_train_pred - 0.5)
else:
  y_train_pred = np.round(y_train_pred + 0.5)


if (y_test_pred.all() < 2.5):
  y_test_pred = np.round(y_test_pred - 0.5)
else:
  y_test_pred = np.round(y_test_pred + 0.5)

print('Random Forest')
print('--------------------------------------------------------------------------------\n')
train_error = (y_train_pred - y_train) / y_train
train_error = round(train_error.mean()*100, 2)
print("Error (Training) = ", train_error, "%")
train_acc = 100 - train_error
print("Accuracy (Training) = ", train_acc, "%")


test_error = (y_test_pred - y_test)/y_test
test_error = round(test_error.mean()*100, 2)
print("Error (Testing) = ", test_error, "%")
test_acc = 100 - test_error
print("Accuracy (Testing) = ", test_acc, "%")

Random Forest
--------------------------------------------------------------------------------

Error (Training) =  -100.0 %
Accuracy (Training) =  200.0 %
Error (Testing) =  -100.0 %
Accuracy (Testing) =  200.0 %


Try out with SVR

In [13]:
# Evaluate Model
from sklearn.metrics import mean_squared_error

mse_rf_train = mean_squared_error(y_train, y_train_pred)
mse_rf_test = mean_squared_error(y_test, y_test_pred)

# Report the result
print('Random Forest')
print('--------------------------------------------------------------------------------\n')
print('MSE (Training) = %.4f' % mse_rf_train)
print('MSE (Testing)  = %.4f' % mse_rf_test)

Random Forest
--------------------------------------------------------------------------------

MSE (Training) = 0.0054
MSE (Testing)  = 0.0051


#### Hyperparameter tuning

In [14]:
from pprint import pprint
# Parameters in use for basic model above
print('Parameters currently in use:\n')
pprint(forest.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}


#### Random hyperparameter grid

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1500, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features}
               #'max_depth': max_depth,
               #'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               #'bootstrap': bootstrap}

pprint(random_grid)

{'max_features': ['auto', 'sqrt'],
 'n_estimators': [100, 255, 411, 566, 722, 877, 1033, 1188, 1344, 1500]}


#### Random search training

In [19]:
# activate GPU
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  print('GPU device not found')

Found GPU at: /device:GPU:0


In [22]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 3 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=3, cv=3, verbose=2, random_state=0, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


  warn(


In [23]:
# View the best parameters from fitting the random search:
rf_random.best_params_

{'n_estimators': 255, 'max_features': 'auto'}

### 4. Validate the model

#### evaluate the random search

In [24]:
# from sklearn.metrics import average_precision_score, accuracy_score

def evaluate(model, X_test, y_test):
  y_pred = model.predict(X_test)
  error = np.mean((y_pred-y_test) / y_test) * 100
  accuracy = 100 - error
  return error, accuracy

# Base Model Performance
# base_model = RandomForestRegressor(random_state=0, n_estimators=255)
base_model = RandomForestRegressor(random_state=0, n_estimators=240)
base_model.fit(X_train, y_train)
base_error, base_acc = evaluate(base_model, X_test, y_test)

print('Model Performance')
print('Error: {:0.4f} degrees.'.format(base_error))
print('Accuracy = {:0.2f}%.'.format(base_acc))

Model Performance
Error: inf degrees.
Accuracy = -inf%.


In [25]:
# Random Search Model performance
best_random = rf_random.best_estimator_

RandomForestRegressor(max_features='auto', n_estimators=255)


In [27]:
_, random_accuracy = evaluate(best_random, X_test, y_test)
print('Improvement of {:0.2f}%.'.format(100 * (random_accuracy - base_acc) / base_acc))

Improvement of nan%.


  print('Improvement of {:0.2f}%.'.format(100 * (random_accuracy - base_acc) / base_acc))


In [58]:
print(forest.score(X_test, y_test))
print(base_model.score(X_test, y_test))

0.038521734355866366
0.03996835532543297


In [59]:
test_error, test_acc = evaluate(forest, X_test, y_test)
print('Random Forest with hyperparameters')
print('--------------------------------------------------------------------------------\n')
print("Error (Testing) = ", test_error, "%")
print("Accuracy (Testing) = ", test_acc, "%")

Random Forest with hyperparameters
--------------------------------------------------------------------------------

Error (Testing) =  inf %
Accuracy (Testing) =  -inf %


### 5. Save model

### 6. Predict market share for test data -- validation

Load data

In [37]:
validation_set = pd.read_csv('/content/drive/MyDrive/pdsp/testset_kaggle_prep.csv').drop(columns='Unnamed: 0')
validation_set

  validation_set = pd.read_csv('/content/drive/MyDrive/pdsp/testset_kaggle_prep.csv').drop(columns='Unnamed: 0')


Unnamed: 0,Orig,con1,con2,Dest,depDay2,depDay3,depDay,elaptime,detour,arrDay,...,stops_1,stops_2,segn_1,segn_2,segn_3,real_dist,dep_hour,dep_min,arr_hour,arr_min
0,26,64,103,181,2.0,,2,535,1.02676,3,...,1,0,0,1,0,3899.12,15,45,1,40
1,26,64,103,181,4.0,,4,535,1.02676,5,...,1,0,0,1,0,3899.12,15,45,1,40
2,26,64,103,181,5.0,,5,535,1.02676,6,...,1,0,0,1,0,3899.12,15,45,1,40
3,26,64,103,181,6.0,,5,775,1.02676,6,...,1,0,0,1,0,3899.12,15,45,5,40
4,26,64,103,181,6.0,,6,535,1.02676,7,...,1,0,0,1,0,3899.12,15,45,1,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241520,41,41,103,26,2.0,,2,1290,1.77380,3,...,1,0,0,1,0,8143.00,6,0,8,0
241521,41,26,103,26,1.0,,7,686,1.06290,1,...,1,0,0,1,0,6034.81,19,5,12,31
241522,1,1,103,26,3.0,,2,1541,1.13030,4,...,1,0,0,1,0,10336.84,23,59,10,40
241523,1,1,63,26,6.0,7.0,6,1430,1.23530,7,...,0,1,0,0,1,11651.70,9,0,17,50


In [40]:
validation_set.columns

Index(['Orig', 'con1', 'con2', 'Dest', 'depDay2', 'depDay3', 'depDay',
       'elaptime', 'detour', 'arrDay', 'cluster', 'id', 'is_direct_flight',
       'stops_0', 'stops_1', 'stops_2', 'segn_1', 'segn_2', 'segn_3',
       'real_dist', 'dep_hour', 'dep_min', 'arr_hour', 'arr_min'],
      dtype='object')

In [41]:
# cross_day flight
validation_set['cross_day'] = (validation_set['depDay'] != validation_set['arrDay']).astype(int)
validation_set.drop(columns=['depDay2', 'depDay3'], inplace=True)
print(len(validation_set.columns))

23


In [42]:
validation_set.columns

Index(['Orig', 'con1', 'con2', 'Dest', 'depDay', 'elaptime', 'detour',
       'arrDay', 'cluster', 'id', 'is_direct_flight', 'stops_0', 'stops_1',
       'stops_2', 'segn_1', 'segn_2', 'segn_3', 'real_dist', 'dep_hour',
       'dep_min', 'arr_hour', 'arr_min', 'cross_day'],
      dtype='object')

In [45]:
indexes = []
for idx, row in validation_set.iterrows():
  try:
    int(row['Orig'])
  except ValueError:
    indexes.append(idx)
validation_set.loc[indexes]

Unnamed: 0,Orig,con1,con2,Dest,depDay,elaptime,detour,arrDay,cluster,id,...,stops_2,segn_1,segn_2,segn_3,real_dist,dep_hour,dep_min,arr_hour,arr_min,cross_day
113027,XDS,41,70,26,1,730,1.01759,2,1601,113028,...,1,0,0,1,6251.889,15,55,10,5,1
113028,XDS,41,70,26,2,730,1.01759,3,1601,113029,...,1,0,0,1,6251.889,15,55,10,5,1
113029,XDS,41,70,26,4,730,1.01759,5,1601,113030,...,1,0,0,1,6251.889,15,55,10,5,1
113030,XDS,41,70,26,5,730,1.01759,6,1601,113031,...,1,0,0,1,6251.889,15,55,10,5,1
113031,XDS,41,70,26,6,730,1.01759,7,1601,113032,...,1,0,0,1,6251.889,15,55,10,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227535,XDS,41,98,26,2,990,1.02042,3,1601,227536,...,1,0,0,1,6314.492,15,55,14,25,1
227536,XDS,41,98,26,3,990,1.02042,4,1601,227537,...,1,0,0,1,6314.492,15,55,14,25,1
227537,XDS,41,98,26,4,990,1.02042,5,1601,227538,...,1,0,0,1,6314.492,15,55,14,25,1
227538,XDS,41,98,26,5,990,1.02042,6,1601,227539,...,1,0,0,1,6314.492,15,55,14,25,1


In [46]:
# validation_set.loc[indexes]['Orig'].unique() -> all 'XDS'
for idx in indexes:
  validation_set.at[idx, 'Orig'] = 41
validation_set.loc[indexes]

Unnamed: 0,Orig,con1,con2,Dest,depDay,elaptime,detour,arrDay,cluster,id,...,stops_2,segn_1,segn_2,segn_3,real_dist,dep_hour,dep_min,arr_hour,arr_min,cross_day
113027,41,41,70,26,1,730,1.01759,2,1601,113028,...,1,0,0,1,6251.889,15,55,10,5,1
113028,41,41,70,26,2,730,1.01759,3,1601,113029,...,1,0,0,1,6251.889,15,55,10,5,1
113029,41,41,70,26,4,730,1.01759,5,1601,113030,...,1,0,0,1,6251.889,15,55,10,5,1
113030,41,41,70,26,5,730,1.01759,6,1601,113031,...,1,0,0,1,6251.889,15,55,10,5,1
113031,41,41,70,26,6,730,1.01759,7,1601,113032,...,1,0,0,1,6251.889,15,55,10,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227535,41,41,98,26,2,990,1.02042,3,1601,227536,...,1,0,0,1,6314.492,15,55,14,25,1
227536,41,41,98,26,3,990,1.02042,4,1601,227537,...,1,0,0,1,6314.492,15,55,14,25,1
227537,41,41,98,26,4,990,1.02042,5,1601,227538,...,1,0,0,1,6314.492,15,55,14,25,1
227538,41,41,98,26,5,990,1.02042,6,1601,227539,...,1,0,0,1,6314.492,15,55,14,25,1


standardisation

In [47]:
#  'id',
X = validation_set.loc[:, ['Orig', 'con1', 'con2', 'Dest', 'depDay', 'elaptime', 'detour',
                           'arrDay', 'cluster', 'is_direct_flight', 'stops_0', 'stops_1', 'stops_2', 'segn_1',
                           'segn_2', 'segn_3', 'real_dist', 'dep_hour', 'dep_min', 'arr_hour',
                           'arr_min', 'cross_day']]


from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_val = sc_X.fit_transform(X)

prediction on new data

In [51]:
# Create a Random Forest Regressor
forest = RandomForestRegressor(n_estimators=240, random_state=0)
# forest = RandomForestRegressor(n_estimators=255, random_state=0)

# Fit the Random Forest Regressor to the training data
forest.fit(X_train, y_train)

In [52]:
# get the predicted y values for the validation set
y_pred_val = forest.predict(X_val)
print("Random Forest predicted values for the kaggle dataset:", y_pred_val)

Random Forest predicted values for the kaggle dataset: [1.66666667e-04 1.66666667e-04 1.66666667e-04 ... 2.32083333e-02
 2.73250000e-01 1.95416667e-02]


In [55]:
# get the id column from the original dataframe
id_column = validation_set['id'].reset_index(drop=True)

# create dataframes for predicted values with id column
val_rf = pd.concat([id_column, pd.DataFrame(y_pred_val, columns=["Predicted"])], axis=1)

# save dataframes as CSV files
val_rf.to_csv('/content/drive/MyDrive/pdsp/kaggle_pred.csv', index=False)

## Try out multioutput regressor
https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html#sklearn.multioutput.MultiOutputRegressor
https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html

In [None]:
# ###
# # input: dep_date, arr_date
# # output: dep_hour, dep_min, arr_hour, arr_min, pax, elapsetime, real_dist

# X_multirf = df
# y_multirf = df.loc[:, ['real_dist', 'dep_date', 'dep_hour', 'dep_min',
#                    'arr_date', 'arr_hour', 'arr_min']]
# Xtrain, Xtest, ytrain, ytest = train_test_split(X_multirf, y_multirf,
#                                                 train_size=418245, test_size=104561, random_state=0)

In [None]:
# from sklearn.multioutput import MultiOutputRegressor

# # Create a random dataset
# max_depth = 30
# regr_multirf = MultiOutputRegressor(
#     RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
# )
# regr_multirf.fit(Xtrain, ytrain)

# # Predict on new data
# y_multirf = regr_multirf.predict(Xtest)
# y_multirf