In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer

In [2]:
# Locate the feature directory: a sibling of the current working directory.
CWD = Path.cwd()
DATA_DIR = CWD.parent.joinpath("18features_data")
print(DATA_DIR)

/Users/antonyjiao/Desktop/UOA_Master_of_DS/COMPSCI 760/18features_data


In [3]:
# Read the train and test feature tables from DATA_DIR.
train_path = DATA_DIR / "18_train_main.parquet.snappy"
test_path = DATA_DIR / "18_test_main.parquet.snappy"
df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)
# The companion text tables (train_text / test_text parquet files) are not loaded here.

In [4]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,u_avg_stars,u_give_useful,b_stars,b_review_count,b_days_open_wk,b_hours_open_wk,r_sen,r_sub,r_rea,r_useful
0,1984442,5,25,80,173,129,108.203988,0.178295,3,6,4.5,118,5.0,10,6.0,60.0,0.279954,0.433241,66.44,2
1,2807253,4,16,171,128,330,58.652416,0.909091,7,21,3.84,891,4.0,711,6.0,48.0,0.272727,0.474242,72.66,3
2,5432293,3,9,114,277,1064,64.378573,1.753759,2,57,4.21,9948,3.5,154,6.0,33.0,0.146667,0.294583,68.13,18
3,3967382,5,25,34,35,59,102.077493,0.067797,0,0,4.3,48,4.0,109,7.0,71.0,0.383333,0.541667,76.93,1
4,656666,1,1,261,137,5,44.994772,0.0,0,0,3.4,5,4.0,18,5.0,42.5,0.039118,0.300063,88.97,3


In [5]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,u_comp_avg,u_n_elite_yrs,u_fans,u_avg_stars,u_give_useful,b_stars,b_review_count,b_days_open_wk,b_hours_open_wk,r_sen,r_sub,r_rea,r_useful
0,2392151,4,16,235,91,108,3.528048,0.527778,3,4,3.69,127,3.5,293,7.0,87.5,0.035417,0.161285,87.55,1
1,1503740,3,9,157,107,39,0.336006,0.512821,0,3,3.16,106,2.5,1249,7.0,168.0,0.14789,0.419108,93.95,1
2,5868535,4,16,355,60,70,26.598261,0.071429,0,0,4.29,66,2.5,43,,,0.128257,0.418071,78.28,1
3,5873696,5,25,36,1,20,84.82925,0.1,0,0,4.1,8,4.0,11,6.0,68.0,0.256818,0.538159,78.59,1
4,3475838,4,16,64,180,258,87.693135,0.27907,6,35,4.16,543,5.0,5,7.0,56.0,0.270139,0.456944,94.45,1


In [6]:
# Report (rows, columns) for both splits.
train_shape = df_train.shape
test_shape = df_test.shape
print(f"Shape of the training data : {train_shape}")
print(f"Shape of the test data : {test_shape}")

Shape of the training data : (400000, 20)
Shape of the test data : (100000, 20)


In [7]:
# Split each frame into a feature matrix and a target vector.
# 'r_useful' is the target; 'r_id' is dropped alongside it so it is not fed to the model.
X_train = df_train.drop(columns=['r_useful', 'r_id']).values
y_train = df_train['r_useful'].values
X_test = df_test.drop(columns=['r_useful', 'r_id']).values
y_test = df_test['r_useful'].values

In [8]:
# Candidate neighbourhood sizes: every odd k from 1 through 99.
n_list = [k for k in range(1, 100) if k % 2 == 1]
# The 'knn__' prefix addresses the pipeline step named 'knn'.
param_grid = dict(knn__n_neighbors=n_list)
print(param_grid)

{'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99]}


In [9]:
# Modelling pipeline: impute missing values (SimpleImputer defaults),
# standardise the features, then fit a k-nearest-neighbours regressor.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('ss', StandardScaler()),
    ('knn', KNeighborsRegressor(n_jobs=-1)),
])

# Search the k grid with 3-fold cross-validation, scored by negative RMSE
# (higher, i.e. closer to zero, is better).
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=KFold(3),
    verbose=3,
)

print("start fitting")
grid_search.fit(X_train, y_train)
print('best_score:', grid_search.best_score_)
print('best_params:', grid_search.best_params_)

start fitting
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3] END ...............knn__n_neighbors=1;, score=-5.271 total time= 1.2min
[CV 2/3] END ...............knn__n_neighbors=1;, score=-4.962 total time=  47.1s
[CV 3/3] END ...............knn__n_neighbors=1;, score=-5.027 total time=  38.2s
[CV 1/3] END ...............knn__n_neighbors=3;, score=-4.678 total time=  36.1s
[CV 2/3] END ...............knn__n_neighbors=3;, score=-3.985 total time=  36.1s
[CV 3/3] END ...............knn__n_neighbors=3;, score=-4.406 total time=  34.9s
[CV 1/3] END ...............knn__n_neighbors=5;, score=-4.426 total time=  33.9s
[CV 2/3] END ...............knn__n_neighbors=5;, score=-3.838 total time=  32.2s
[CV 3/3] END ...............knn__n_neighbors=5;, score=-4.372 total time=  31.9s
[CV 1/3] END ...............knn__n_neighbors=7;, score=-4.345 total time=  31.9s
[CV 2/3] END ...............knn__n_neighbors=7;, score=-3.760 total time=  31.9s
[CV 3/3] END ...............knn__

In [10]:
# Score the fitted grid search on both splits.
train_pred = grid_search.predict(X_train)
test_pred = grid_search.predict(X_test)

# RMSE is taken as sqrt(MSE) instead of passing `squared=False`:
# that keyword was deprecated and then removed in newer scikit-learn releases,
# whereas sqrt(MSE) yields the same value on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
train_mae = mean_absolute_error(y_train, train_pred)
test_mae = mean_absolute_error(y_test, test_pred)

print(f"train results - RMSE: {train_rmse}, MAE: {train_mae}")
print(f"test results - RMSE: {test_rmse}, MAE: {test_mae}")

train results - RMSE: 3.878515439047021, MAE: 1.486085564516129
test results - RMSE: 3.4444486199572633, MAE: 1.5309616129032257


In [11]:
# Naive baseline: predict the training-set mean of the target for every test row.
mean_train = np.mean(y_train)
baseline_rmse = np.sqrt(np.mean((y_test - mean_train)**2))
baseline_mae = np.mean(np.abs(y_test - mean_train))
print(f"Baseline mean model - test RMSE: {baseline_rmse}")
print(f"Baseline mean model - test MAE: {baseline_mae}")

Baseline mean model - test RMSE: 4.1185891526597125
Baseline mean model - test MAE: 1.8970029502499997
