In [17]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
import matplotlib.pyplot as plt

In [18]:
# Seed handed to the estimators' random_state so repeated runs are comparable.
RANDOM_SEED = 1
# The parquet splits are read from a "train-test" folder that sits beside the
# current working directory (i.e. under its parent).
DATA_DIR = Path.cwd().parent.joinpath("train-test")

In [19]:
# Read both splits from DATA_DIR (train first, then test).
df_train, df_test = (
    pd.read_parquet(DATA_DIR / f"{split_name}.parquet")
    for split_name in ("train", "test")
)

print(f"Shape of the training data : {df_train.shape}")
print(f"Shape of the test data : {df_test.shape}")

# Features: every column except the first and the last.
# Target: natural log of the last column.
X_train, y_train = df_train.iloc[:, 1:-1], np.log(df_train.iloc[:, -1])
X_test, y_test = df_test.iloc[:, 1:-1], np.log(df_test.iloc[:, -1])

# Sanity check on the split ratio between the two files.
print(f"Training proportion: {len(y_train) / (len(y_train) + len(y_test))}")

Shape of the training data : (2060626, 13)
Shape of the test data : (515157, 13)
Training proportion: 0.7999998447074151


In [20]:
def get_scores(estimator, is_train):
    """Score a fitted estimator on the notebook's train or test split.

    Parameters
    ----------
    estimator : object with a ``predict`` method
        Fitted model (or fitted search object) to evaluate.
    is_train : bool
        If truthy, evaluate on the module-level ``X_train`` / ``y_train``;
        otherwise on ``X_test`` / ``y_test``.

    Returns
    -------
    dict
        ``{"RMSE": ..., "MAE": ...}`` computed against the selected targets.
    """
    X = X_train if is_train else X_test
    y_true = y_train if is_train else y_test
    pred = estimator.predict(X)
    # The ``squared=False`` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and later removed; taking the square root of the MSE
    # gives the same RMSE for a single target and works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, pred))
    mae = mean_absolute_error(y_true, pred)
    return {"RMSE": rmse, "MAE": mae}

In [21]:
def run_experiment(model, hps, n_splits=5, tf_type=0, verbose=3):
    """Grid-search ``model`` behind a preprocessing step and print its scores.

    The search is fitted on the module-level ``X_train`` / ``y_train``; the
    best hyper-parameters, the best mean CV scores, and the refitted model's
    train and test scores (via ``get_scores``) are printed. Nothing is
    returned.

    Parameters
    ----------
    model : estimator
        Final regressor appended to the preprocessing pipeline.
    hps : dict
        Parameter grid forwarded to ``GridSearchCV``; keys address pipeline
        steps (e.g. ``"sgdregressor__alpha"``).
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    tf_type : {0, 1, 2}, default 0
        Preprocessing variant:
        0 - ``StandardScaler`` on all columns;
        1 - ``PowerTransformer`` on columns 2, 3, 4, 5, 7, 10 and
            ``StandardScaler`` on the remaining columns;
        2 - ``PowerTransformer`` on all columns.
    verbose : int, default 3
        Verbosity forwarded to ``GridSearchCV``.

    Raises
    ------
    ValueError
        If ``tf_type`` is not one of 0, 1, 2.
    """
    if tf_type == 0:
        pipe = make_pipeline(StandardScaler(), model)
    elif tf_type == 1:
        # note that this will reorder the columns. First few would be those from ptf.
        # r_length, u_friends_count, u_review_count, u_month_age, b_review_count, r_rea
        # NOTE(review): the last column name above is cut off in the original
        # note; confirm the names against the training frame's columns.
        pipe = make_pipeline(
            ColumnTransformer(
                [("ptf", PowerTransformer(), [2, 3, 4, 5, 7, 10])],
                remainder=StandardScaler()),
            model)
    elif tf_type == 2:
        pipe = make_pipeline(PowerTransformer(), model)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # ``pipe`` further down.
        raise ValueError(f"tf_type must be 0, 1 or 2, got {tf_type!r}")

    # Both metrics are tracked; the model is refitted on the best RMSE.
    cv_search = GridSearchCV(pipe, hps,
        scoring=["neg_root_mean_squared_error", "neg_mean_absolute_error"],
        refit="neg_root_mean_squared_error",
        n_jobs=-1, cv=KFold(n_splits), verbose=verbose)

    cv_search.fit(X_train, y_train)

    print("------ Best HP set ------")
    print(cv_search.cv_results_["params"][cv_search.best_index_])
    print("------ Best Mean CV test RMSE, MAE ------")
    # Scorers are negated errors, so flip the sign back for display.
    print((-1 * cv_search.cv_results_["mean_test_neg_root_mean_squared_error"][cv_search.best_index_],
        -1 * cv_search.cv_results_["mean_test_neg_mean_absolute_error"][cv_search.best_index_]))
    print("------ Best model train RMSE, MAE ------")
    print(get_scores(cv_search, is_train=True))
    print("------ Best model test RMSE, MAE ------")
    print(get_scores(cv_search, is_train=False))

In [22]:
# Grid for the SGD regressor: alpha and eta0 each sweep 1e-1 ... 1e-6,
# power_t takes three values (6 * 6 * 3 = 108 candidates).
hp = dict(
    sgdregressor__alpha=10.0 ** -np.arange(1, 7),
    sgdregressor__eta0=10.0 ** -np.arange(1, 7),
    sgdregressor__power_t=[0.125, 0.25, 0.5],
)
# Run the same search under each preprocessing variant (0, 1, 2), with a
# freshly constructed estimator every time.
for preproc_variant in range(3):
    run_experiment(SGDRegressor(random_state=RANDOM_SEED), hp,
                   tf_type=preproc_variant, verbose=1)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
------ Best HP set ------
{'sgdregressor__alpha': 1e-06, 'sgdregressor__eta0': 0.001, 'sgdregressor__power_t': 0.25}
------ Best Mean CV test RMSE, MAE ------
(0.967352879566654, 0.7364592041338988)
------ Best model train RMSE, MAE ------
{'RMSE': 0.9673533004568888, 'MAE': 0.7362305469477312}
------ Best model test RMSE, MAE ------
{'RMSE': 0.964774732004302, 'MAE': 0.7359761155664327}
Fitting 5 folds for each of 108 candidates, totalling 540 fits
------ Best HP set ------
{'sgdregressor__alpha': 1e-06, 'sgdregressor__eta0': 0.001, 'sgdregressor__power_t': 0.25}
------ Best Mean CV test RMSE, MAE ------
(0.9884599347916602, 0.7507881449702583)
------ Best model train RMSE, MAE ------
{'RMSE': 0.9884316130032185, 'MAE': 0.750375735541275}
------ Best model test RMSE, MAE ------
{'RMSE': 0.9861773563392486, 'MAE': 0.7502866172411401}
Fitting 5 folds for each of 108 candidates, totalling 540 fits
------ Best HP set ------
{'

In [24]:
# Grid for the decision tree: even depths 2..16 plus 20, and cost-complexity
# pruning strengths 0.0, 0.1, ..., 0.7 (9 * 8 = 72 candidates).
hp = dict(
    decisiontreeregressor__max_depth=[*range(2, 17, 2), 20],
    decisiontreeregressor__ccp_alpha=np.arange(8) * 0.1,
)
run_experiment(
    DecisionTreeRegressor(random_state=RANDOM_SEED),
    hp,
    tf_type=0,
    verbose=2,
)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=2; total time=   5.8s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=2; total time=   6.5s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=2; total time=   6.6s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=2; total time=   7.7s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=2; total time=   8.5s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=4; total time=  10.3s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=4; total time=  10.5s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=4; total time=  11.0s
[CV] END decisiontreeregressor__ccp_alpha=0.0, decisiontreeregressor__max_depth=4; total time=  11.6s
[CV] END decisiontre

In [27]:
# Baseline: predict the training-set mean of the (log) target for every test row.
y_train_mean = np.mean(y_train)
baseline_residual = y_train_mean - y_test
print(np.sqrt(np.mean(baseline_residual**2))) # RMSE
print(np.mean(np.abs(baseline_residual))) # MAE

1.1099838176001227
0.8525313957684546
