In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor, NearestCentroid
from sklearn.model_selection import train_test_split, GridSearchCV

#### Loading data

In [2]:
df = pd.read_csv('OnlineNewsPopularityReduced.csv')
df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2014/09/28/jobs-tech-workers/,101,12,332,0.528481,1.0,0.645833,6,3,1,...,0.1,0.7,-0.145833,-0.166667,-0.125,0.718182,0.137121,0.218182,0.137121,8600
1,http://mashable.com/2014/09/28/obama-says-u-s-...,101,10,393,0.518817,1.0,0.596899,14,1,1,...,0.25,0.8,-0.3125,-0.5,-0.25,0.5,0.0,0.0,0.0,23700
2,http://mashable.com/2014/09/28/once-upon-a-tim...,101,13,1643,0.388854,1.0,0.546967,29,4,7,...,0.033333,0.8,-0.227778,-0.8,-0.05,0.0,0.0,0.5,0.0,8400
3,http://mashable.com/2014/09/28/the-simpsons-de...,101,9,272,0.488889,1.0,0.51269,2,2,11,...,0.1,1.0,-0.416667,-1.0,-0.166667,0.0,0.0,0.5,0.0,4800
4,http://mashable.com/2014/09/28/viola-davis-les...,101,12,0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.541071,0.06756,0.041071,0.06756,3100


#### A bit of preprocessing, this dataset doesn't require a lot.

In [3]:
df = df[df.n_tokens_content != 0]
df.drop(columns=['url', 'kw_min_min'], inplace=True)

In [4]:
df.dtypes

timedelta                          int64
n_tokens_title                     int64
n_tokens_content                   int64
n_unique_tokens                  float64
n_non_stop_words                 float64
n_non_stop_unique_tokens         float64
num_hrefs                          int64
num_self_hrefs                     int64
num_imgs                           int64
num_videos                         int64
average_token_length             float64
num_keywords                       int64
data_channel_is_lifestyle          int64
data_channel_is_entertainment      int64
data_channel_is_bus                int64
data_channel_is_socmed             int64
data_channel_is_tech               int64
data_channel_is_world              int64
kw_max_min                       float64
kw_avg_min                       float64
kw_min_max                         int64
kw_max_max                         int64
kw_avg_max                       float64
kw_min_avg                       float64
kw_max_avg      

#### Data scaling and splitting into train/validation sets

In [5]:
X = df.drop(columns='shares')
y = df.shares
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.25, random_state=4)

#### Basic KNN performance

In [6]:
model = KNeighborsRegressor()
model.fit(X_train, y_train)
np.sqrt(mean_squared_error(model.predict(X_valid), y_valid))

6871.123954596296


#### Parameters tuning for KNN

In [7]:
# model = KNeighborsRegressor(weights='distance')
# params = {
#     'n_neighbors': np.arange(1, 50, 2),
#     'p': np.linspace(1, 10, 21)
# }
# search = GridSearchCV(model, params, cv=4, scoring='neg_mean_squared_error')
# search.fit(X, y)

In [8]:
# print(search.best_params_)
# print(search.best_score_)

#### Best KNN model

In [9]:
best_params = {
    'n_neighbors': 49,
    'p': 1.45
}
model = KNeighborsRegressor(weights='distance', **best_params)
model.fit(X_train, y_train)
np.sqrt(mean_squared_error(model.predict(X_valid), y_valid))

5993.289878925728


### Other metric models' performance

#### Nearest Centroid

In [10]:
model = NearestCentroid()
model.fit(X_train, y_train)
np.sqrt(mean_squared_error(model.predict(X_valid), y_valid))

15779.011850651174


Nearest Cerntroid works worse than KNN, and has no parameters, so nothing there to tune.

#### Basic RadiusNeighbor performance

In [11]:
model = RadiusNeighborsRegressor(weights='distance')
model.fit(X_train, y_train)
np.sqrt(mean_squared_error(model.predict(X_valid), y_valid))

6740.059402108933


  multiarray.copyto(res, fill_value, casting='unsafe')


Basic RN is better than basic KNN

#### Parameters tuning for RN

In [13]:
# model = RadiusNeighborsRegressor(weights='distance')
# params = {
#     'radius': np.linspace(0.01, 5, 30),
#     'p': np.linspace(1, 5, 10)
# }
# search = GridSearchCV(model, params, cv=4, scoring='neg_mean_squared_error')
# search.fit(X, y)

  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarray.copyto(res, fill_value, casting='unsafe')
  multiarr

In [14]:
# print(search.best_params_)
# print(np.sqrt(-search.best_score_))

{'p': 1.0, 'radius': 0.01}
8005.980279139218


In [15]:
best_params = {
    'radius': 0.01,
    'p': 1.0
}
model = RadiusNeighborsRegressor(weights='distance', **best_params)
model.fit(X_train, y_train)
np.sqrt(mean_squared_error(model.predict(X_valid), y_valid))

6740.059402108933


  multiarray.copyto(res, fill_value, casting='unsafe')


Despite better performance with default parameters, tuned RN has worse RMSE score, than tuned KNN