In [6]:
from sklearn.neighbors import KNeighborsRegressor
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVR
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
import optuna as opt

In [7]:
# The prepared data lives in a "ready data" folder next to the notebook's
# working directory (i.e. inside the parent of the current directory).
CWD = Path.cwd()
DATA_DIR = CWD.parent.joinpath("ready data")
print(DATA_DIR)

/Users/antonyjiao/Desktop/UOA Master of DS/COMPSCI 760/ready data


In [8]:
# Load the three pre-split tables (train / validation / test) from parquet.
train, val, test = map(
    pd.read_parquet,
    (
        DATA_DIR / "train_main.parquet.snappy",
        DATA_DIR / "val_main.parquet.snappy",
        DATA_DIR / "test_main.parquet.snappy",
    ),
)

In [9]:
# Preview the first rows of the training table to check its columns.
train.head()

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea,r_useful
0,5508740,5,25,54,1,2,23.093032,4.0,7400,0.197348,0.293939,72.22,1
1,1633913,1,1,224,47,43,57.182693,3.0,27,0.088636,0.320328,89.28,5
2,1427664,4,16,18,35,112,44.484607,4.0,103,0.36875,0.4125,87.72,1
3,5536866,5,25,86,156,21,28.229579,5.0,5,0.15,0.485,49.86,2
4,3380896,5,25,79,6660,2492,40.995708,4.0,709,0.345833,0.541667,57.78,14


## Data split

In [10]:
# Features are every column except the first and the last one; the last column
# is the regression target. In the `train` preview these are `r_id` (dropped)
# and `r_useful` (target).
feature_cols = slice(1, -1)
target_col = -1

X_train, y_train = train.iloc[:, feature_cols], train.iloc[:, target_col]
X_val, y_val = val.iloc[:, feature_cols], val.iloc[:, target_col]
X_test, y_test = test.iloc[:, feature_cols], test.iloc[:, target_col]

In [8]:
# Sanity-check the target column extracted by the split.
y_train.head(5)

0     1
1     5
2     1
3     2
4    14
Name: r_useful, dtype: int64

## Fit a KNN regression model on a subset of the dataset

In [9]:
# Keep only the leading rows of each split so the first experiment runs quickly.
first_rows = slice(5000)

X_train_subset, y_train_subset = X_train[first_rows], y_train[first_rows]
X_test_subset, y_test_subset = X_test[first_rows], y_test[first_rows]
X_val_subset, y_val_subset = X_val[first_rows], y_val[first_rows]

In [11]:
def predict_scores(model, X, y_true):
    """Score a fitted regressor on one dataset.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Features, passed unchanged to ``model.predict``.
    y_true : array-like
        Ground-truth targets aligned with ``X``.

    Returns
    -------
    dict
        ``{"RMSE": float, "MAE": float}``.
    """
    y_pred = model.predict(X)
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed afterwards,
    # whereas this form works on every release.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))
    return {"RMSE": rmse, "MAE": mae}

In [12]:
# Baseline: a 5-nearest-neighbour regressor fitted on the training subset and
# scored on the test subset.
reg = KNeighborsRegressor(n_neighbors=5).fit(
    X_train_subset.values, y_train_subset.values
)
predict_scores(reg, X_test_subset.values, y_test_subset.values)

{'RMSE': 3.6981552157798894, 'MAE': 1.74968}

## Train on the full dataset and compare training vs. validation scores for different n_neighbors

In [13]:
def experiment(HPdict, model_args):
    """Fit one KNN regressor per hyper-parameter combination and score it.

    Every combination produced by ``ParameterGrid(HPdict)`` is fitted on the
    notebook-level ``X_train`` / ``y_train`` and then scored on both the
    training data and the notebook-level ``X_val`` / ``y_val``.

    Parameters
    ----------
    HPdict : grid specification
        Anything ``ParameterGrid`` accepts, e.g. ``{"n_neighbors": [1, 2]}``.
    model_args : dict
        Keyword arguments shared by every ``KNeighborsRegressor``; the grid
        values are applied on top of them via ``set_params``.

    Returns
    -------
    pandas.DataFrame
        Two rows per combination (``is_val`` False = training scores,
        ``is_val`` True = validation scores) with the columns ``RMSE``,
        ``MAE``, ``is_val`` followed by one column per hyper-parameter.
    """
    # np.asarray accepts DataFrames/Series as well as plain arrays, so this
    # function keeps working after the splits have been converted to NumPy
    # arrays further down the notebook. Using arrays for both fitting and
    # scoring also avoids scikit-learn's feature-name mismatch warning.
    train_X, train_y = np.asarray(X_train), np.asarray(y_train)
    val_X, val_y = np.asarray(X_val), np.asarray(y_val)

    base_cols = ["RMSE", "MAE", "is_val"]
    # Collect plain records and build the frame once at the end: repeated
    # pd.concat is quadratic and turned integer hyper-parameters into floats.
    records = []
    for i, hps in enumerate(ParameterGrid(HPdict)):
        print(f"training {i+1}th model")
        model = KNeighborsRegressor(**model_args)
        model.set_params(**hps)
        model.fit(train_X, train_y)

        for is_val, (X, y) in ((False, (train_X, train_y)), (True, (val_X, val_y))):
            records.append({**hps, **predict_scores(model, X, y), "is_val": is_val})

    # Keep the original column layout: metric columns first, then the
    # hyper-parameter columns in order of first appearance.
    seen_keys = dict.fromkeys(key for rec in records for key in rec)
    hp_cols = [key for key in seen_keys if key not in base_cols]
    return pd.DataFrame(records, columns=base_cols + hp_cols)

In [14]:
# Sweep k = 1..7 with otherwise default KNeighborsRegressor settings.
hyperparams = {"n_neighbors": list(range(1, 8))}
model_args = dict()
scores = experiment(hyperparams, model_args)
print("Done")

training 1th model




training 2th model




training 3th model




training 4th model




training 5th model




training 6th model




training 7th model




Done


In [15]:
# Show the training-set scores first, then the validation-set scores.
for on_validation in (False, True):
    print(scores[scores.is_val == on_validation])

        RMSE       MAE is_val  n_neighbors
0   0.001558  0.000001  False          1.0
2   2.862291  0.980252  False          2.0
4   3.304249  1.206809  False          3.0
6   3.527484  1.309371  False          4.0
8   3.656286  1.366956  False          5.0
10  3.737411  1.403955  False          6.0
12  3.797841  1.429741  False          7.0
        RMSE       MAE is_val  n_neighbors
1   4.582596  1.941820   True          1.0
3   4.133150  1.796797   True          2.0
5   3.918083  1.734220   True          3.0
7   3.807233  1.697444   True          4.0
9   3.720104  1.673258   True          5.0
11  3.672652  1.656363   True          6.0
13  3.647463  1.645237   True          7.0


## Hyperparameter tuning with Optuna

In [11]:
# Convert every split to a plain NumPy array for the Optuna runs below.
# np.asarray is used instead of `.values` so the cell is idempotent: it works
# on DataFrames/Series and is a no-op on arrays, whereas `.values` raises
# AttributeError if this cell is executed a second time.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

X_val = np.asarray(X_val)
y_val = np.asarray(y_val)

X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

In [35]:
def objective(trial):
    """Optuna objective: validation RMSE of a KNN regressor for one trial.

    The trial chooses ``n_neighbors``, ``weights`` and ``metric``; the model is
    fitted on the notebook-level ``X_train`` / ``y_train`` and evaluated on the
    notebook-level ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna trial
        Object whose ``suggest_categorical`` picks each hyper-parameter.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    params = {
        "n_neighbors": trial.suggest_categorical('n_neighbors', [1, 2, 4, 8, 10]),
        "weights": trial.suggest_categorical("weights", ['uniform', 'distance']),
        # NOTE(review): with KNeighborsRegressor's default p=2, 'minkowski' is
        # the same distance as 'euclidean', so a third of the metric choices
        # repeat work -- consider dropping it or tuning `p` as well.
        "metric": trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski']),
    }

    model = KNeighborsRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Explicit square root instead of ``squared=False``, which was deprecated
    # in scikit-learn 1.4 and removed afterwards.
    return float(np.sqrt(mean_squared_error(y_val, y_pred)))

In [36]:
# Seed the sampler so that re-running the notebook explores the same sequence
# of trials; the default sampler is unseeded, which makes the search (and the
# reported best parameters) non-reproducible.
study = opt.create_study(
    direction='minimize',
    sampler=opt.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
study.best_params

[32m[I 2022-09-22 21:08:36,118][0m A new study created in memory with name: no-name-b2517564-8be3-43e0-9db8-e0ac751b57e2[0m
[32m[I 2022-09-22 21:09:02,727][0m Trial 0 finished with value: 4.133149569795027 and parameters: {'n_neighbors': 2, 'weights': 'uniform', 'metric': 'minkowski'}. Best is trial 0 with value: 4.133149569795027.[0m
[32m[I 2022-09-22 21:09:24,398][0m Trial 1 finished with value: 4.582596027506551 and parameters: {'n_neighbors': 1, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 4.133149569795027.[0m
[32m[I 2022-09-22 21:10:04,396][0m Trial 2 finished with value: 3.6026124629331457 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 2 with value: 3.6026124629331457.[0m
[32m[I 2022-09-22 21:11:22,679][0m Trial 3 finished with value: 3.572279238243582 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 3 with value: 3.572279238243582.[0m
[32m

[32m[I 2022-09-22 21:48:16,139][0m Trial 37 finished with value: 4.582596027506551 and parameters: {'n_neighbors': 1, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 26 with value: 3.5100299349372204.[0m
[32m[I 2022-09-22 21:49:35,003][0m Trial 38 finished with value: 3.547160292924287 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 26 with value: 3.5100299349372204.[0m
[32m[I 2022-09-22 21:50:01,296][0m Trial 39 finished with value: 4.11327131721669 and parameters: {'n_neighbors': 2, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 26 with value: 3.5100299349372204.[0m
[32m[I 2022-09-22 21:51:26,119][0m Trial 40 finished with value: 3.5100299349372204 and parameters: {'n_neighbors': 10, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 26 with value: 3.5100299349372204.[0m
[32m[I 2022-09-22 21:52:51,108][0m Trial 41 finished with value: 3.5100299349372204 and parameters: {'n_neighbors': 1

{'n_neighbors': 10, 'weights': 'distance', 'metric': 'manhattan'}

In [37]:
# Refit on the training split using the best configuration found by the study.
best_config = study.best_params
model = KNeighborsRegressor(**best_config)
model.fit(X_train, y_train)

In [39]:
# Score the refit model on its own training data.
# NOTE(review): the near-zero error is presumably an artefact of
# weights='distance' (each training point is its own zero-distance neighbour),
# not a sign of good generalisation -- confirm.
predict_scores(model, X_train, y_train)

{'RMSE': 0.0011014642774033146, 'MAE': 1.4558682652747272e-06}

In [40]:
# Score the refit model on the held-out test split.
predict_scores(model, X_test, y_test)

{'RMSE': 3.8256852525131007, 'MAE': 1.6036712776736195}

In [43]:
# Try a much larger neighbourhood (k = 100) with default settings.
# NOTE(review): this compares k values on the test split; consider using the
# validation split for such comparisons so the test score stays unbiased.
model = KNeighborsRegressor(n_neighbors=100).fit(X_train, y_train)
predict_scores(model, X_test, y_test)

{'RMSE': 3.8628788287308633, 'MAE': 1.602172343242267}

In [44]:
# Try an extreme neighbourhood size (k = 10000) with default settings.
# NOTE(review): this compares k values on the test split; consider using the
# validation split for such comparisons so the test score stays unbiased.
model = KNeighborsRegressor(n_neighbors=10000).fit(X_train, y_train)
predict_scores(model, X_test, y_test)

{'RMSE': 4.030031551030791, 'MAE': 1.6457304877338603}

In [12]:
# Display the training targets (NumPy prints an abbreviated view of the array).
y_train

array([1, 5, 1, ..., 4, 6, 1])