In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

%load_ext autoreload
%autoreload 2

In [2]:
from helpers import async_cache_pages, urls_list
from features import construct_structural_features, feature_functions

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [4]:
# Shared random seed: read by balanced_classes() when sampling rows and
# passed to the estimator as its random_state.
seed = 0

In [5]:
# Dataset folders; each one is a single-element list because urls_list()
# is handed the list itself.
(folder_train,
 folder_test_easy,
 folder_test_hard) = (['train-0'], ['test-easy'], ['test-hard'])

In [6]:
# Positive / negative URL collections for the training split and both test splits.
train_pos, train_neg = urls_list(folder_train)
test_pos_easy, test_neg_easy = urls_list(folder_test_easy)
test_pos_hard, test_neg_hard = urls_list(folder_test_hard)

# Every URL from every split, concatenated in the order train, easy, hard,
# and handed to the page cache in a single call.
pages_to_cache = (
    train_pos + train_neg
    + test_pos_easy + test_neg_easy
    + test_pos_hard + test_neg_hard
)
async_cache_pages(pages_to_cache)

Error while downloading: https://www.economist.com/digital. Status code: 403
Error while downloading: http://blog.openclassrooms.com/en/. Status code: 403
Trying again to download 0 pages one at a time


In [7]:
# Feature names in sorted order, so the column order is stable between runs.
all_features = sorted(feature_functions.keys())

In [8]:
def balanced_classes(df, random_state=None):
    """Down-sample `df` so that both label values are equally represented.

    Rows whose `label` equals True and rows whose `label` equals False are
    each sampled (without replacement) down to the size of the smaller of
    the two groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `label` column holding True / False values.
    random_state : optional
        Seed forwarded to `DataFrame.sample` for both groups. When left as
        None, the notebook-level `seed` is used, which is the behaviour this
        function had before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The sampled negatives followed by the sampled positives. Rows keep
        their original index labels.
    """
    if random_state is None:
        # Fall back to the notebook-wide seed so existing one-argument calls
        # keep producing the same sample.
        random_state = seed

    # Build each mask once and reuse it for both counting and selection.
    is_pos = df.label == True
    is_neg = df.label == False

    # Vectorised count instead of iterating the Series with builtin sum().
    n_per_class = min(int(is_pos.sum()), int(is_neg.sum()))

    return pd.concat([
        df[is_neg].sample(n_per_class, random_state=random_state),
        df[is_pos].sample(n_per_class, random_state=random_state),
    ])

In [9]:
# Positives first, then negatives; the label list mirrors that ordering.
train_urls = train_pos + train_neg
train_labels = len(train_pos) * [True] + len(train_neg) * [False]

# Build the features for every training URL, then down-sample so that both
# classes end up with the same number of rows.
train_df = balanced_classes(
    construct_structural_features(train_urls, train_labels, all_features)
)

In [10]:
# Easy test split: positives first, then negatives, labels in the same order.
# Unlike the training frame, the test frames are not rebalanced.
test_easy_urls = test_pos_easy + test_neg_easy
test_easy_labels = len(test_pos_easy) * [True] + len(test_neg_easy) * [False]
test_easy_df = construct_structural_features(
    test_easy_urls, test_easy_labels, all_features
)

# Hard test split, built the same way.
test_hard_urls = test_pos_hard + test_neg_hard
test_hard_labels = len(test_pos_hard) * [True] + len(test_neg_hard) * [False]
test_hard_df = construct_structural_features(
    test_hard_urls, test_hard_labels, all_features
)


In [11]:
# Classifier under evaluation. Alternative estimator: RandomForestClassifier()
estim = LogisticRegression(max_iter=1000, random_state=seed)

# Training inputs (one column per feature name) and the matching labels.
X, y = train_df[all_features], train_df.label

In [12]:
def _report_task(title, y_true, y_pred):
    """Print the title, classification report, confusion matrix and accuracy.

    Returns the accuracy so the caller can keep it.
    """
    print(title)
    print(metrics.classification_report(y_true, y_pred))
    print(metrics.confusion_matrix(y_true, y_pred))
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print("Accuracy: {}".format(accuracy))
    return accuracy


print("Training the model")
estim.fit(X, y)

# Both test sets are scored before anything is reported, so the printed
# sections appear in the order: training, predictions, easy, hard.
print("Predictions on test sets")
y_easy_pred, y_easy_test = estim.predict(test_easy_df[all_features]), test_easy_df.label
y_hard_pred, y_hard_test = estim.predict(test_hard_df[all_features]), test_hard_df.label

easy_accuracy = _report_task("==Easy task==", y_easy_test, y_easy_pred)

hard_accuracy = _report_task("==Hard task==", y_hard_test, y_hard_pred)

Training the model
Predictions on test sets
==Easy task==
             precision    recall  f1-score   support

      False       0.57      0.62      0.59        48
       True       0.61      0.55      0.58        51

avg / total       0.59      0.59      0.59        99

[[30 18]
 [23 28]]
Accuracy: 0.5858585858585859
==Hard task==
             precision    recall  f1-score   support

      False       0.60      0.78      0.68        45
       True       0.74      0.55      0.63        51

avg / total       0.67      0.66      0.65        96

[[35 10]
 [23 28]]
Accuracy: 0.65625


--> The structural features do not work that well: we were previously overfitting because websites from the same platforms were used in both the training and test sets. The numbers above are now the actual accuracy.