# Random forest

Random forest regressor with multiple inputs and multiple outputs:
- input
  - user: 'Orig', 'Dest', 'depDay', 'arrDay' 
  - other attributes: 'con1', 'con2', 'op_flight1', 'op_flight2', 'op_flight3',
               'elaptime', 'detour', 'stops', 'cluster', 'real_dist',
               'total_time', 'connection_time', 'dep_hour', 'arr_hour'
- output: 'market_share', 'paxe', 'TOT_pax'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

### 1. Loading the data

In [None]:
# Load the preprocessed dataset from disk.
df = pd.read_csv('./dataset/dataprep_v2.csv')
# 'Unnamed: 0' is not used as a feature or target below, so discard it up front
# (presumably an index column saved by an earlier export -- TODO confirm).
df = df.drop(columns=['Unnamed: 0'])
display(df.head())

### 2. Splitting the dataset into training/test sets

In [None]:
# Input columns fed to the model.
feature_cols = [
    'Orig', 'con1', 'con2', 'Dest',
    'op_flight1', 'op_flight2', 'op_flight3',
    'depDay', 'elaptime', 'detour', 'arrDay',
    'stops', 'cluster', 'real_dist',
    'total_time', 'connection_time', 'dep_hour', 'arr_hour',
]
# Columns the model learns to predict.
target_cols = ['market_share', 'paxe', 'TOT_pax']

X = df[feature_cols]
y = df[target_cols]

# 80 % of the rows go to training, the rest to testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0
)

In [None]:
# Preview the first rows of the feature and target tables.
display(X.head(), y.head())

In [None]:
# Standardize the features.
# The scaler is fitted on the training split ONLY; the test split is then
# transformed with those same training statistics. Re-fitting on the test split
# (fit_transform) would scale the two splits differently and overwrite the
# statistics stored in sc_X with test-set values.
# Note: by default both calls return NumPy arrays, so X_train / X_test are no
# longer DataFrames after this cell.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

### 3. Creating a random forest regression model and fitting it to the training data

In [None]:
# Base learner: a random forest with a fixed seed for reproducibility.
base_forest = RandomForestRegressor(n_estimators=1000, max_depth=60, random_state=0)

# MultiOutputRegressor fits one independent copy of the base learner per target column.
regr_multirf = MultiOutputRegressor(base_forest)

# Train on the training split.
regr_multirf.fit(X_train, y_train)

In [None]:
# Predict on both splits so training and testing error can be compared.
y_pred_train = regr_multirf.predict(X_train)
y_pred_test = regr_multirf.predict(X_test)

# Report header.
print(
    'Multioutput Regressor',
    '--------------------------------------------------------------------------------\n',
    sep='\n',
)

# With its default settings mean_squared_error averages the per-target errors
# into a single number, so targets on a larger scale dominate the value.
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
print(
    "MSE (Training) = %.4f" % train_mse,
    "MSE (Testing) = %.4f" % test_mse,
    sep='\n',
)

### 4. Concatenating dataframes

In [None]:
# Build one table per split holding features, true targets and predictions,
# then stack the two splits (training rows first, test rows second).
pred_cols = ['market_share_pred', 'paxe_pred', 'TOT_pax_pred']


def _with_predictions(features, targets, predictions):
    """Return features, true targets and predicted targets side by side.

    Parameters
    ----------
    features : array-like or DataFrame
        Feature matrix of one split, columns in the same order as ``X``.
    targets : DataFrame
        True target values for the same rows, in the same row order.
    predictions : array-like
        Model output for the same rows, one column per entry of ``pred_cols``.

    Returns
    -------
    DataFrame
        One row per sample with a fresh 0..n-1 index.
    """
    parts = [
        # The features may be a bare NumPy array (scaler output) or a DataFrame;
        # wrapping them restores the original column names either way.
        pd.DataFrame(features, columns=X.columns),
        targets,
        pd.DataFrame(predictions, columns=pred_cols),
    ]
    # Drop every index BEFORE the column-wise concat: the targets still carry the
    # shuffled row labels from train_test_split, and pd.concat(axis=1) aligns on
    # labels, which would otherwise pair rows incorrectly and introduce NaNs.
    return pd.concat([part.reset_index(drop=True) for part in parts], axis=1)


# NOTE: X_train / X_test hold whatever the preceding cells left in them; if they
# were standardized, the feature columns written here are the standardized values.
# train
y_pred_train_df = pd.DataFrame(y_pred_train, columns=pred_cols)
train_df = _with_predictions(X_train, y_train, y_pred_train)
# test
y_pred_test_df = pd.DataFrame(y_pred_test, columns=pred_cols)
test_df = _with_predictions(X_test, y_test, y_pred_test)
# whole
df_new = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [None]:
# Persist the combined table; index=False keeps the row index out of the CSV.
df_new.to_csv(path_or_buf='./dataset/kaggle_pred_multirf.csv', index=False)