In [8]:
import pandas as pd
# Let pandas print up to 2000 items of a long sequence (e.g. a wide column
# index) before truncating the repr with an ellipsis.
pd.set_option('display.max_seq_items', 2000)

# Random Forest Modelling

### Data Preparation

In [238]:
# Both processed journey files are read the same way: the first CSV column
# becomes the index and the two timestamp columns are parsed as datetimes.
# The 2018_01 file is used for training and the 2019_01 file for testing.
csv_options = dict(parse_dates=['end_date', 'start_date'], index_col=0)
journey_train = pd.read_csv('../data/processed/journey_data_2018_01.csv', **csv_options)
journey_test = pd.read_csv('../data/processed/journey_data_2019_01.csv', **csv_options)

In [219]:
# Preview the first five rows of the training journeys.
journey_train.head(n=5)

Unnamed: 0,rental_id,end_date,end_station_name,start_date,start_station_name,day_of_week,hour,is_weekend,part_of_day,month,...,election_seats_percentage_lab,election_seats_percentage_ld,election_seats_percentage_gre,election_seats_percentage_ind,occupation_high_level_ratio_ratio,occupation_small_intermediate_ratio_ratio,occupation_lower_level_ratio_ratio,occupation_unemployed_ratio_ratio,occupation_student_ratio_ratio,earnings_workplace
0,72337672.0,2018-01-01 00:07:00,"Millbank Tower, Pimlico",2018-01-01 00:00:00,"Grosvenor Road, Pimlico",0,0,0.0,5,1.0,...,0.32,0.0,0.0,0.0,0.443089,0.161098,0.148262,0.118178,0.129374,60046.0
1,72337673.0,2018-01-01 01:16:00,"South Parade, Chelsea",2018-01-01 00:02:00,"South Parade, Chelsea",0,0,0.0,5,1.0,...,0.26,0.02,0.0,0.0,0.460353,0.177567,0.144639,0.10478,0.112661,41723.0
2,72337675.0,2018-01-01 00:58:00,"Portman Square, Marylebone",2018-01-01 00:02:00,"Portman Square, Marylebone",0,0,0.0,5,1.0,...,0.32,0.0,0.0,0.0,0.443089,0.161098,0.148262,0.118178,0.129374,60046.0
3,72337676.0,2018-01-01 00:59:00,"Portman Square, Marylebone",2018-01-01 00:02:00,"Portman Square, Marylebone",0,0,0.0,5,1.0,...,0.32,0.0,0.0,0.0,0.443089,0.161098,0.148262,0.118178,0.129374,60046.0
4,72337679.0,2018-01-01 14:11:00,"Stockwell Roundabout, Stockwell",2018-01-01 00:02:00,"New Fetter Lane, Holborn",0,0,0.0,5,1.0,...,0.733636,0.027273,0.010909,0.003636,0.635932,0.125062,0.084708,0.054598,0.0997,88145.0


In [246]:
# Every journey counts as one unit of demand; summing this column per
# (hour, borough) group yields the hourly demand of each borough.
journey_train['demand'] = 1
journey_test['demand'] = 1

# all column names of the training frame (this includes 'demand' and 'start_date')
column_names = list(journey_train.columns)

# keep the first value of every feature within a group ...
aggregate_functions = {col: 'first' for col in column_names}

# ... except 'demand', which is summed to count the journeys in the group
aggregate_functions['demand'] = 'sum'


def _aggregate_hourly_demand(journeys, aggregations):
    """Collapse individual journeys into one row per (start hour, start borough).

    Parameters
    ----------
    journeys : pandas.DataFrame
        Journey-level frame with a datetime 'start_date' column, a
        'start_borough' column and every column named in ``aggregations``.
    aggregations : dict
        Mapping of column name -> aggregation function passed to ``.agg``.

    Returns
    -------
    pandas.DataFrame
        One row per group with a fresh integer index and 'start_borough'
        expanded into one-hot 'start_borough_*' columns.
    """
    # NOTE: the 'H' alias is deprecated since pandas 2.2; switch to 'h' when upgrading.
    hour_bucket = journeys['start_date'].dt.floor('H')
    hourly = (
        journeys
        .groupby([hour_bucket, 'start_borough'])
        .agg(aggregations)
        # the group keys are not needed as an index: 'start_borough' is also
        # carried as a regular column through its 'first' aggregation
        .reset_index(drop=True)
    )
    return pd.get_dummies(hourly, columns=['start_borough'])


# aggregate and one-hot encode both data sets with the same specification
journey_train_hourly = _aggregate_hourly_demand(journey_train, aggregate_functions)
journey_test_hourly = _aggregate_hourly_demand(journey_test, aggregate_functions)

# create the target variables
y_train = journey_train_hourly['demand']
y_test = journey_test_hourly['demand']

# create the predictor variables
cols_to_remove = ['rental_id', 'end_date', 'end_borough', 'start_date', 'end_station_name', 'start_station_name', 'demand', 'borough', 'borough_code', 'year']
x_train = journey_train_hourly.drop(columns=cols_to_remove)
x_test = journey_test_hourly.drop(columns=cols_to_remove)

# The two frames were one-hot encoded independently, so a borough that occurs
# in only one of them produces a column the other frame lacks. Give the test
# matrix exactly the training columns (same set, same order): boroughs unseen
# in the test data become all-zero columns, boroughs unseen in training are
# dropped because the model has no feature for them.
missing_in_test = x_train.columns.difference(x_test.columns)
unexpected_missing = [col for col in missing_in_test if not str(col).startswith('start_borough_')]
if unexpected_missing:
    # A missing non-dummy feature must not be silently filled with zeros.
    raise KeyError(f"Test data lacks feature column(s): {unexpected_missing}")
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)


In [247]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree forest with a fixed seed and train it on the training
# features/target in one step (fit() returns the fitted estimator itself).
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(x_train, y_train)

# Demand predictions for the test features
y_pred = rf.predict(x_test)

In [248]:
# Importance score the fitted forest assigns to each predictor column
importances = rf.feature_importances_

# Pair every feature name with its score and rank from most to least important
feature_importances = (
    pd.DataFrame({'Feature': x_train.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Show the ranked table
print(feature_importances)

                                       Feature  Importance
1                                         hour    0.305499
3                                  part_of_day    0.154383
15                                  cloudcover    0.062021
33              ethnic_african_caribbean_ratio    0.035869
0                                  day_of_week    0.034731
36                     ethnic_arab_other_ratio    0.024127
12                                      precip    0.023951
58                         crime_offences_rate    0.022634
22                           bike_docks_counts    0.021858
21                         bike_station_counts    0.021059
76               residence_lengh_uk_born_ratio    0.020925
20                                 day_of_year    0.019938
16                                  visibility    0.018323
105                  start_borough_Westminster    0.018137
9                                         temp    0.018010
18                              daylight_hours    0.0167

In [249]:
# numpy is imported here because no earlier cell of the notebook imports it;
# without it np.sqrt raises NameError on a fresh kernel.
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, explained_variance_score

# Root Mean Squared Error: square root of the mean squared error, so the
# value is in the same units as the demand counts
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# R-squared (R2) score
r2 = r2_score(y_test, y_pred)

# Explained variance score
evs = explained_variance_score(y_test, y_pred)

print('RMSE:', rmse)
print('MAE:', mae)
print('R2 Score:', r2)
print('Explained Variance Score:', evs)

RMSE: 64.25754352148373
MAE: 30.362835428237357
R2 Score: 0.7241482978557934
Explained Variance Score: 0.7360997690958823
