In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Load data/processed.csv, using the first CSV column as the DataFrame index.
# NOTE(review): this path is relative to the current working directory, while
# the scoring workbook further down is read from "../data/..." — confirm both
# paths resolve from the same directory.
df = pd.read_csv("data/processed.csv",index_col=0)

In [3]:
# Feature matrix: every column of df except the regression target.
X = df.drop(columns=['Vehicle Population'])

In [4]:
# Regression target, kept on its original (untransformed) scale.
y = df['Vehicle Population']

In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Random forest with 100 trees; random_state is fixed so that fitting is
# reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [7]:
# Train on log(1 + y): the target is transformed with np.log1p here, so model
# outputs must be mapped back with np.expm1 before they are compared with
# values on the original scale.
# NOTE: this rebinds y_train in place, so the cell is not idempotent —
# re-running it without re-running the train/test split applies the log twice.
y_train = np.log1p(y_train)

In [8]:
# Fit the forest; y_train is on the log1p scale at this point.
model.fit(X_train,y_train)

In [9]:
# Predict for the held-out rows. The model was fit on a log1p-transformed
# target, so these predictions are on the log1p scale.
y_pred = model.predict(X_test)

In [10]:
# Invert the log1p transform so predictions are on the original target scale.
# NOTE: rebinds y_pred, so re-running this cell on its own applies expm1 twice;
# re-run the predict cell first.
y_pred = np.expm1(y_pred)

In [11]:
# Held-out evaluation: y_pred was produced from X_test and is compared with
# y_test, so these are test-set metrics (not training metrics). y_pred has
# already been mapped back with expm1 and y_test was never transformed, so
# both are on the original target scale.
print(f"Test RMSE: {root_mean_squared_error(y_test, y_pred)}")
print(f"Test R²: {r2_score(y_test, y_pred)}")

Training RMSE: 3960.016965043891
Training R²: 0.9556244471184836


In [12]:
# Load the scoring workbook that is evaluated separately from the train/test
# split above.
# NOTE(review): this path starts with "../" whereas the training CSV is read
# from "data/processed.csv" — confirm both resolve from the same working
# directory.
scoring_df = pd.read_excel("../data/Chevron Challenge Materials/scoring.xlsx")

In [13]:
# Helpers from the project-local `convert` module, used in the following cells
# to preprocess the scoring dataset before prediction.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from convert import clean_encode_columns, impute_df, fill_missing_columns

In [14]:
# Presumably cleans and encodes the scoring columns (inferred from the helper's
# name only) — verify against convert.py. Rebinds scoring_df, so re-running
# this cell feeds already-processed data back into the helper.
scoring_df = clean_encode_columns(scoring_df)

In [15]:
# Presumably fills missing values in the scoring frame (inferred from the
# helper's name only) — verify the imputation strategy in convert.py.
scoring_df = impute_df(scoring_df)

In [16]:
# Scoring features: every column of the processed scoring frame except the target.
scoring_X = scoring_df.drop(columns=['Vehicle Population'])

In [17]:
# Scoring target on its original scale; it is compared with the
# back-transformed predictions below.
scoring_y = scoring_df['Vehicle Population']

In [18]:
# Presumably adds to scoring_X any feature columns that exist in the training
# frame X but are absent from the scoring data (inferred from the helper's name
# and arguments) — TODO confirm in convert.py, including that the resulting
# column order matches X.
scoring_X = fill_missing_columns(X,scoring_X)

In [19]:
# Predict for the scoring rows. The model was fit on a log1p-transformed
# target, so these predictions are on the log1p scale.
scoring_predictions = model.predict(scoring_X)

In [20]:
# Invert the log1p transform so predictions are on the original target scale.
# NOTE: rebinds scoring_predictions, so re-running this cell on its own applies
# expm1 twice; re-run the predict cell first.
scoring_predictions = np.expm1(scoring_predictions)

In [21]:
# Evaluate on the scoring set; targets and predictions are both on the
# original scale at this point.
scoring_rmse = root_mean_squared_error(scoring_y, scoring_predictions)
scoring_r2 = r2_score(scoring_y, scoring_predictions)

print(f"Scoring RMSE: {scoring_rmse}")
print(f"Scoring R²: {scoring_r2}")

Scoring RMSE: 9458.022475574733
Scoring R²: 0.7639368364378896


In [None]:
scor