In [14]:
from sklearn.neighbors import KNeighborsRegressor
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVR
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
import optuna as opt
from sklearn.preprocessing import StandardScaler
from optuna.samplers import RandomSampler
from optuna.samplers import TPESampler
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Locate the data folder relative to the notebook's working directory:
# it is the "ready data" directory that sits next to the current folder.
CWD = Path.cwd()
DATA_DIR = CWD.parent.joinpath("ready data")
print(DATA_DIR)

/Users/antonyjiao/Desktop/UOA Master of DS/COMPSCI 760/ready data


In [4]:
# Read the pre-built train / validation / test splits from DATA_DIR.
train = pd.read_parquet(DATA_DIR.joinpath("train_main.parquet.snappy"))
val = pd.read_parquet(DATA_DIR.joinpath("val_main.parquet.snappy"))
test = pd.read_parquet(DATA_DIR.joinpath("test_main.parquet.snappy"))

In [5]:
# For each split: features are every column except the first and the last,
# and the target is the last column.
# NOTE(review): the first column is dropped positionally -- presumably an
# identifier; confirm against the parquet schema.
X_train, y_train = train.iloc[:, 1:-1], train.iloc[:, -1]
X_val, y_val = val.iloc[:, 1:-1], val.iloc[:, -1]
X_test, y_test = test.iloc[:, 1:-1], test.iloc[:, -1]

In [6]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea
0,5,25,54,1,2,23.093032,4.0,7400,0.197348,0.293939,72.22
1,1,1,224,47,43,57.182693,3.0,27,0.088636,0.320328,89.28
2,4,16,18,35,112,44.484607,4.0,103,0.36875,0.4125,87.72
3,5,25,86,156,21,28.229579,5.0,5,0.15,0.485,49.86
4,5,25,79,6660,2492,40.995708,4.0,709,0.345833,0.541667,57.78


In [7]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to all three splits.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_val = ss.transform(X_val)
X_test = ss.transform(X_test)


In [8]:
# Replace each target Series by its underlying array of values.
y_train, y_val, y_test = y_train.values, y_val.values, y_test.values

# Use Optuna to tune parameters

In [9]:
def objective(trial):
    """Optuna objective: validation RMSE of one KNN-regressor configuration.

    Samples ``n_neighbors``, ``weights`` and ``metric`` from ``trial``, fits a
    ``KNeighborsRegressor`` on the notebook-level ``X_train`` / ``y_train`` and
    scores its predictions on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (the study minimises it).
    """
    params = {
        "n_neighbors": trial.suggest_int('n_neighbors', 2, 100),
        "weights": trial.suggest_categorical("weights", ['uniform', 'distance']),
        # NOTE(review): with the estimator's default p=2, 'minkowski' should be
        # the same distance as 'euclidean', so one of the two categories looks
        # redundant -- kept so existing study results stay comparable.
        "metric": trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski']),
    }

    # n_jobs=-1 only parallelises the neighbour search; predictions are unchanged.
    model = KNeighborsRegressor(n_jobs=-1, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Take the square root explicitly instead of passing squared=False, which
    # is deprecated in recent scikit-learn releases; this matches the
    # np.sqrt(mse) form used later in this notebook.
    return float(np.sqrt(mean_squared_error(y_val, y_pred)))

In [9]:
# Seeded TPE search: 50 trials minimising the value returned by `objective`.
RANDOM_SEED = 2
study = opt.create_study(
    sampler=TPESampler(seed=RANDOM_SEED),
    direction="minimize",
)
study.optimize(objective, n_trials=50)
# Last expression of the cell: show the best hyper-parameters found.
study.best_params

[32m[I 2022-09-23 19:42:04,426][0m A new study created in memory with name: no-name-dd804a4f-6d29-4856-92bb-2f6d2a400277[0m
[32m[I 2022-09-23 19:53:00,295][0m Trial 0 finished with value: 3.620883718995509 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 3.620883718995509.[0m
[32m[I 2022-09-23 20:10:46,259][0m Trial 1 finished with value: 3.609878495125395 and parameters: {'n_neighbors': 22, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 1 with value: 3.609878495125395.[0m
[32m[I 2022-09-23 20:26:05,522][0m Trial 2 finished with value: 3.6291339735255135 and parameters: {'n_neighbors': 15, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 1 with value: 3.609878495125395.[0m
[32m[I 2022-09-23 20:56:32,327][0m Trial 3 finished with value: 3.588847833144679 and parameters: {'n_neighbors': 85, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 3 with value: 3.588847833144679.[0m
[

[32m[I 2022-09-24 08:48:59,878][0m Trial 37 finished with value: 3.5806089589792314 and parameters: {'n_neighbors': 44, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 31 with value: 3.579160505360945.[0m
[32m[I 2022-09-24 09:14:32,367][0m Trial 38 finished with value: 3.5785219808313804 and parameters: {'n_neighbors': 51, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 38 with value: 3.5785219808313804.[0m
[32m[I 2022-09-24 09:28:20,595][0m Trial 39 finished with value: 3.6273855217256035 and parameters: {'n_neighbors': 72, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 38 with value: 3.5785219808313804.[0m
[32m[I 2022-09-24 09:59:19,875][0m Trial 40 finished with value: 3.5882148389817408 and parameters: {'n_neighbors': 83, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 38 with value: 3.5785219808313804.[0m
[32m[I 2022-09-24 10:24:35,356][0m Trial 41 finished with value: 3.5788850029602086 and parameters: {'n_neighbor

{'n_neighbors': 53, 'weights': 'distance', 'metric': 'manhattan'}

## Fit the best model

In [8]:
# Refit on the training split using the best parameters reported by the study
# (n_neighbors=53, distance weighting, Manhattan metric).
model = KNeighborsRegressor(
    metric="manhattan",
    weights="distance",
    n_neighbors=53,
)
model.fit(X_train, y_train)

In [11]:
def predict_scores(model, X, y_true):
    """Predict with ``model`` on ``X`` and score the result against ``y_true``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X
        Feature matrix passed to ``model.predict``.
    y_true
        Ground-truth targets aligned with the rows of ``X``.

    Returns
    -------
    dict
        ``{"RMSE": <root mean squared error>, "MAE": <mean absolute error>}``.
    """
    y_pred = model.predict(X)
    # Explicit square root instead of squared=False, which is deprecated in
    # recent scikit-learn releases; matches the np.sqrt(mse) form used later
    # in this notebook.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))
    return {"RMSE": rmse, "MAE": mae}

In [11]:
# Scores on the data the model was fitted on.
# NOTE(review): the model uses weights='distance', so each training point is
# presumably its own zero-distance neighbour -- a near-zero training error is
# expected and says little about generalisation.
print("Train")
predict_scores(model=model, X=X_train, y_true=y_train)

Train


{'RMSE': 0.0011014642774033146, 'MAE': 1.4558682652747272e-06}

In [12]:
# Scores on the held-out test split.
print("Test")
predict_scores(model=model, X=X_test, y_true=y_test)

Test


{'RMSE': 3.8658359875544495, 'MAE': 1.5875182972464363}

In [12]:
# Try a smaller n_neighbors (20 instead of the tuned 53).
# NOTE(review): this compares hyper-parameter choices on the test split;
# consider using X_val / y_val for such comparisons instead.
model = KNeighborsRegressor(n_neighbors=20, weights='distance', metric='manhattan')
model.fit(X_train, y_train)
# Only the last expression of a cell is displayed automatically, so the
# training scores were previously computed and silently discarded.
# Print both results explicitly.
print("Train", predict_scores(model, X_train, y_train))
print("Test", predict_scores(model, X_test, y_test))

Train
Test


{'RMSE': 3.906448855073718, 'MAE': 1.6141914358684413}

# MinMaxScaler Dataset

In [15]:
# Re-read the three splits so the MinMax scaling below is fitted on the
# columns as stored on disk.
train = pd.read_parquet(DATA_DIR.joinpath("train_main.parquet.snappy"))
val = pd.read_parquet(DATA_DIR.joinpath("val_main.parquet.snappy"))
test = pd.read_parquet(DATA_DIR.joinpath("test_main.parquet.snappy"))

# Features: every column except the first and the last.
# Target: the last column, taken as its underlying array of values.
X_train, y_train = train.iloc[:, 1:-1], train.iloc[:, -1].values
X_val, y_val = val.iloc[:, 1:-1], val.iloc[:, -1].values
X_test, y_test = test.iloc[:, 1:-1], test.iloc[:, -1].values

# Min-max scaling: fitted on the training split only, then applied unchanged
# to the validation and test splits.
ss = MinMaxScaler()
X_train = ss.fit_transform(X_train)
X_val, X_test = ss.transform(X_val), ss.transform(X_test)

In [17]:
# Same KNN configuration as before, now fitted on the MinMax-scaled features.
model = KNeighborsRegressor(
    metric="manhattan",
    weights="distance",
    n_neighbors=53,
)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Report RMSE and MAE for the training split first, then the test split.
for label, y_true, y_hat in (
    ("On training set: RMSE:", y_train, y_pred_train),
    ("On test set: RMSE:", y_test, y_pred_test),
):
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    print(label, rmse, "MAE", mae)

On training set: RMSE: 0.0011014642774033146 MAE 1.4558682652747272e-06
On test set: RMSE: 3.993717421325375 MAE 1.6110233225428847
