# Explore simple models

In [1]:
%matplotlib inline
from utility_funcitons import *


In [2]:
from feature_selection import *
import pandas as pd 
import numpy as np
from utility_funcitons import *


# Read the two predefined splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Build the 'tfidf_glove' feature matrices for both splits in one call.
# `featurize` arrives via one of the star imports above (presumably
# feature_selection -- TODO confirm).
train_features, test_features, feature_names = featurize(
    train,
    test,
    'tfidf_glove',
)



Starts with number....
Clickbait Phrases....
Clickbait re....
Num dots....
Text Features....
Punctuation....
Word ratios....
Sentiment Scores....
Readability Scores....


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))


Glove.....


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))


DONE!


In [3]:
# Binary targets: 1 where the `label` column equals 'clickbait', 0 otherwise.
y_train, y_test = (
    np.where(frame.label.values == 'clickbait', 1, 0)
    for frame in (train, test)
)

In [6]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import make_scorer
from scipy import sparse


import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Globally silence Python-level warnings for the rest of the session by
# swapping the stdlib entry point for the no-op above.
warnings.warn = warn


def adjusted_f1(y_true, y_prob):
    """Return the first metric reported by ``print_model_metrics`` (used as F1).

    Parameters
    ----------
    y_true : true labels.
    y_prob : predicted scores / probabilities for the same rows.

    Returns
    -------
    Element 0 of the value returned by
    ``print_model_metrics(..., verbose=0, return_metrics=True)``.
    """
    return print_model_metrics(y_true, y_prob, verbose = 0, return_metrics = True)[0]

# Scorer handed to GridSearchCV: higher is better, and the metric is fed
# predicted probabilities rather than hard class labels.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method` -- confirm against the installed version before upgrading.
score = make_scorer(
    adjusted_f1,
    greater_is_better = True,
    needs_proba = True,
)



# Hyper-parameters are tuned on one fixed validation fold instead of k-fold CV.
# PredefinedSplit takes a per-row fold index: -1 keeps the row in the training
# portion, 0 puts it in the single validation fold. Train rows are stacked
# first, followed by the test rows, so the index list follows the same order.
# NOTE(review): the validation fold is the test set itself, so metrics later
# reported on those same rows are optimistic.

X = sparse.vstack((train_features, test_features))
y = np.concatenate([y_train, y_test])
test_fold = [-1] * train_features.shape[0] + [0] * test_features.shape[0]
ps = PredefinedSplit(test_fold)

def run_grid_search(model, params, x_train, y_train):
    """Grid-search ``params`` for ``model`` on the module-level predefined split.

    Uses the notebook globals ``ps`` (cross-validation split) and ``score``
    (scorer). ``refit = False`` means no final estimator is trained here; only
    the winning configuration and its score are reported.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid passed to ``GridSearchCV``.
    x_train, y_train : data passed to ``GridSearchCV.fit``. Rows must line up
        with the fold indices used to build ``ps``.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` from the fitted search.
    """
    search = GridSearchCV(
        estimator = model,
        param_grid = params,
        scoring = score,
        cv = ps,
        refit = False,
        n_jobs = -1,
        verbose = 0,
    )
    search.fit(x_train, y_train)
    return search.best_params_, search.best_score_


In [7]:
def fit_n_times(model, x_train, y_train, x_test, y_test, n_iters = 10):
    """Fit ``model`` ``n_iters`` times and print the mean test-set metrics.

    Each iteration refits ``model`` on the training data, takes column 1 of
    ``predict_proba`` on the test data, and accumulates the five values
    returned by ``print_model_metrics`` (printed below as F1, precision,
    recall, AUC and accuracy, in that order).

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    x_train, y_train : training features and labels.
    x_test, y_test : evaluation features and labels.
    n_iters : int, default 10
        Number of fit/evaluate repetitions to average over.

    Raises
    ------
    ValueError
        If ``n_iters`` is smaller than 1 (there would be nothing to average).
    """
    if n_iters < 1:
        raise ValueError('n_iters must be at least 1, got {}'.format(n_iters))

    metrics = np.zeros(5)
    for _ in range(n_iters):
        model.fit(x_train, y_train)
        y_test_prob = model.predict_proba(x_test)[:,1]
        metrics += print_model_metrics(y_test, y_test_prob, verbose = False, return_metrics = True)
    # Average over the number of runs actually performed (was a hard-coded 10).
    metrics /= n_iters
    print('F1: {:.3f} | Pr: {:.3f} | Re: {:.3f} | AUC: {:.3f} | Accuracy: {:.3f} \n'.format(*metrics))



## Logistic Regression


In [8]:
from sklearn.linear_model import SGDClassifier

# Logistic regression trained with SGD (log loss); tune regularisation
# strength, penalty type and the elastic-net mixing ratio.
lr = SGDClassifier(loss = 'log')
lr_params = {
    'alpha' : [10 ** -exponent for exponent in range(7)],
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'l1_ratio' : [0.15, 0.25, 0.5, 0.75],
}

best_params, best_f1 = run_grid_search(lr, lr_params, X, y)

print('Best Parameters : {}'.format(best_params))

# Rebuild the classifier with the winning settings (keys: alpha, penalty,
# l1_ratio) and report metrics averaged over repeated fits.
lr = SGDClassifier(loss = 'log', **best_params)
fit_n_times(lr, train_features, y_train, test_features, y_test)


Best Parameters : {'alpha': 0.0001, 'l1_ratio': 0.5, 'penalty': 'l2'}
F1: 0.987 | Pr: 0.985 | Re: 0.988 | AUC: 0.999 | Accuracy: 0.987 



## SVM

In [11]:
from sklearn.svm import SVC

# Support vector classifier; probability = True is required because both the
# scorer and fit_n_times call predict_proba.
svm = SVC(probability = True)
svm_params = {
    'C' : [10 ** exponent for exponent in range(-1, 4)],
    'kernel' : ['poly', 'rbf', 'linear'],
    'degree' : [2, 3],
}

best_params, best_f1 = run_grid_search(svm, svm_params, X, y)

print('Best Parameters : {}'.format(best_params))
print('Best F1 : {}'.format(best_f1))

# Rebuild with the winning settings (keys: C, kernel, degree).
svm = SVC(probability = True, **best_params)
fit_n_times(svm, train_features, y_train, test_features, y_test)


Best Parameters : {'C': 10, 'degree': 2, 'kernel': 'rbf'}
Best F1 : 0.9898596333483191
F1: 0.990 | Pr: 0.988 | Re: 0.991 | AUC: 0.999 | Accuracy: 0.990 



## Naive Bayes

In [13]:


from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with a fixed uniform class prior; only the
# smoothing strength alpha is tuned.
# NOTE(review): the recorded run picked the largest alpha in this grid, so the
# optimum may lie beyond it -- consider widening the range.
nb = MultinomialNB(class_prior = [0.5, 0.5])
nb_params = {'alpha' : [10 ** exponent for exponent in range(6)]}

best_params, best_f1 = run_grid_search(nb, nb_params, X, y)

print('Best Parameters : {}'.format(best_params))
print('Best F1 : {}'.format(best_f1))

# Rebuild with the winning alpha and the same uniform prior.
nb = MultinomialNB(class_prior = [0.5, 0.5], **best_params)

fit_n_times(nb, train_features, y_train, test_features, y_test)



Best Parameters : {'alpha': 100000}
Best F1 : 0.9467467368107598
F1: 0.947 | Pr: 0.939 | Re: 0.954 | AUC: 0.988 | Accuracy: 0.946 



## K-Neighbors Classifier

In [1]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours; tune the neighbourhood size and the vote weighting.
# NOTE(review): the output recorded under this cell (featurization logs,
# verbose sequential fits) does not match this code -- it looks stale;
# re-run the notebook top to bottom to refresh it.
knn = KNeighborsClassifier(n_jobs = -1)

knn_params = {
    'n_neighbors' : [3, 5, 7, 9, 15, 31],
    'weights' : ['uniform', 'distance'],
}

best_params, best_f1 = run_grid_search(knn, knn_params, X, y)
print('Best Parameters : {}'.format(best_params))

# Rebuild with the winning settings (keys: n_neighbors, weights).
knn = KNeighborsClassifier(n_jobs = -1, **best_params)

fit_n_times(knn, train_features, y_train, test_features, y_test)


Starts with number....
Clickbait Phrases....
Clickbait re....
Num dots....
Text Features....
Punctuation....
Word ratios....
Sentiment Scores....
Readability Scores....
HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))

Glove.....
HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))

DONE!
Fitting 1 folds for each of 12 candidates, totalling 12 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  4.2min finished
Best Parameters : {'n_neighbors': 7, 'weights': 'distance'}
F1: 0.984 | Pr: 0.980 | Re: 0.988 | AUC: 0.994 | Accuracy: 0.984 



## Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest; tune ensemble size, tree depth and the split threshold.
rf = RandomForestClassifier(n_jobs = -1)

rf_params = {
    'n_estimators' : [10, 100, 250, 500, 1000],
    'max_depth' : [None, 3, 7, 15],
    'min_samples_split' : [2, 5, 15],
}

best_params, best_f1 = run_grid_search(rf, rf_params, X, y)

print('Best Parameters : {}'.format(best_params))

# Rebuild with the winning settings (keys: n_estimators, max_depth,
# min_samples_split).
rf = RandomForestClassifier(n_jobs = -1, **best_params)
fit_n_times(rf, train_features, y_train, test_features, y_test)

Best Parameters : {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
F1: 0.982 | Pr: 0.979 | Re: 0.984 | AUC: 0.998 | Accuracy: 0.982 



## XGBoost

In [6]:

from xgboost import XGBClassifier

# Gradient-boosted trees; tune ensemble size, depth, learning rate and L1
# regularisation.
xgb = XGBClassifier(n_jobs = -1)

xgb_params = { 'n_estimators' : [10, 100, 200, 500], 
               'max_depth' : [1, 2, 3, 7],
               'learning_rate' : [0.1, 0.2, 0.01, 0.3],
               'reg_alpha' : [0, 0.1, 0.2]
}

best_params, best_f1 = run_grid_search(xgb, xgb_params, X, y)

print('Best Parameters : {}'.format(best_params))

# Rebuild the classifier with the winning settings.
xgb = XGBClassifier(n_estimators = best_params['n_estimators'],
                            learning_rate = best_params['learning_rate'],
                            max_depth = best_params['max_depth'], 
                            reg_alpha = best_params['reg_alpha'], n_jobs = -1)

# Densify with .toarray() (plain ndarray) rather than .todense(), which
# returns the discouraged numpy.matrix type; the values are identical.
# NOTE(review): the grid search above ran on the sparse matrix X while this
# final fit uses dense input -- confirm that difference is intended.
fit_n_times(xgb, train_features.toarray(), y_train, test_features.toarray(), y_test)

Best Parameters : {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 500, 'reg_alpha': 0}
F1: 0.986 | Pr: 0.982 | Re: 0.991 | AUC: 0.999 | Accuracy: 0.986 

