In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, auc

In [2]:
df = pd.read_csv('./OnlineNewsPopularity/OnlineNewsPopularity.csv')

In [3]:
df.shape

(39644, 61)

In [4]:
df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


In [5]:
df.columns = [col.strip() for col in df.columns]

In [6]:
med = df['shares'].median()
df['label'] = df['shares'].apply(lambda share: 1 if share > med else 0)

In [7]:
X = df.drop(columns=['url', 'label', 'shares'])
y = df['label']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [20]:
# X = df.drop(columns=['url', 'label', 'shares'])
X = df
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train.to_csv('./web_data/X_train.csv', header=True, index=True)
X_test.to_csv('./web_data/X_test.csv', header=True, index=True)

y_train.to_csv('./web_data/y_train.csv', header=True, index=True)
y_test.to_csv('./web_data/y_test.csv', header=True, index=True)

X_train = X_train.drop(columns=['url', 'label', 'shares'])
y_train = y_train.drop(columns=['url', 'label', 'shares'])

## Support Vector Machines

In [None]:
# SVM
parameters = {'C': [2**i for i in range(0, 7)],
              'kernel':('rbf', ), }
tmp_model = svm.SVC()
svc = GridSearchCV(tmp_model, parameters, cv=3, verbose=10, n_jobs=2)
svc.fit(X, y)
svc_best_params_ = svc.best_params_
svc_best_score_ = svc.best_score_
print('SVM', svc_best_params_, svc_best_score_)

In [None]:
# SVC
svc_best_params_ = {'C': 2**1, 'kernel': 'linear'}
svc = svm.SVC(**svc_best_params_)
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
y_true = y_test
print('SVC')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
# print(auc(y_true, y_pred))

In [None]:
y_pred

In [None]:
# Make output file for webservice
X_test.join(y_pred)

## Random Forest

In [26]:
# Random Forest
parameters = {'criterion': ('gini', 'entropy'),
             'n_estimators': [10, 20, 50, 100, 200, 400]}
tmp_model = RandomForestClassifier()
rf_model = GridSearchCV(tmp_model, parameters, cv=3, verbose=10, n_jobs=2)
rf_model.fit(X, y)
rf_best_score_ = rf_model.best_score_
rf_best_params_ = rf_model.best_params_
print('Random forest', rf_best_params_, rf_best_score_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    4.1s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    8.8s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   29.6s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  5.3min
[Parallel(n_jobs=2)]: Done  36 out of  36 | elapsed: 10.3min finished


Random forest {'criterion': 'entropy', 'n_estimators': 10} 0.5623297346382807


In [27]:
# Random Forest
rf_model = RandomForestClassifier(**rf_best_params_)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
y_true = y_test
print('Random forest')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
# print(auc(y_true, y_pred))


Random forest
[[4750 1920]
 [3013 3400]]
0.6229458075364978
0.6390977443609023
0.5301730859192265
0.5795619193727095


## Ada Boost

In [8]:
# Ada Boost
parameters = {'n_estimators': [10, 20, 50, 100, 200, 400]}
tmp_model = AdaBoostClassifier()
ab_model = GridSearchCV(tmp_model, parameters, cv=3, verbose=10, n_jobs=2)
ab_model.fit(X, y)
ab_best_score_ = ab_model.best_score_
ab_best_params_ = ab_model.best_params_
print('Ada Boost', ab_best_params_, ab_best_score_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    2.3s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    3.6s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    7.5s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   11.7s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   19.8s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   28.5s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   42.6s
[Parallel(n_jobs=2)]: Done  45 out of  45 | elapsed:   57.4s finished


Ada Boost {'n_estimators': 10} 0.548683281202704


In [22]:
# AdaBoostClassifier
ab_best_params_ = {'n_estimators': 10}
ab_model = AdaBoostClassifier(**ab_best_params_)
ab_model.fit(X_train, y_train)

y_pred = ab_model.predict(X_test)
y_true = y_test
print('AdaBoostClassifier')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
# print(auc(y_true, y_pred))

AdaBoostClassifier
[[4264 2406]
 [2237 4176]]
0.6451119773752197
0.6344576116681859
0.6511772961172618
0.642708734128511


## K Nearest Neighbors

In [14]:
# KNN
parameters = {'metric': ('euclidean', ),
             'n_neighbors': [1, 3, 5, 10, 20]}
tmp_model = neighbors.KNeighborsClassifier()
knn_model = GridSearchCV(tmp_model, parameters, cv=3, verbose=10, n_jobs=2)
knn_model.fit(X, y)
knn_best_params_ = knn_model.best_params_
knn_best_score_ = knn_model.best_score_
print('KNN', knn_best_params_, knn_best_score_)

Fitting 3 folds for each of 39 candidates, totalling 117 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   26.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  2.7min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  4.6min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  7.1min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 10.3min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 14.9min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 19.1min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed: 24.4min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed: 29.2min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed: 34.5min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed: 41.2min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed: 48.5min
[Parallel(n_jobs=2)]: Done 117 out of 117 | elapsed: 52.1min finished


KNN {'metric': 'euclidean', 'n_neighbors': 35} 0.5703763495106448


In [23]:
# KNN
knn_model = neighbors.KNeighborsClassifier(**knn_best_params_)
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)
y_true = y_test
print('KNN')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
# print(auc(y_true, y_pred))

KNN
[[4106 2564]
 [2893 3520]]
0.5828938316899793
0.5785667324128863
0.548885077186964
0.5633352004481075


## Naive Bayes

In [24]:
# Naive Bayes
print('Naive Bayes: No need to tune',)
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)
y_true = y_test
print('Naive Bayes')
print(confusion_matrix(y_true, y_pred))
print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))
# print(auc(y_true, y_pred))

Naive Bayes: No need to tune
Naive Bayes
[[6335  335]
 [5764  649]]
0.5338225177711534
0.6595528455284553
0.10120068610634649
0.17547654454508585
