In [25]:
import pandas as pd
import numpy as np
import re
import time
import bs4 as bs4
import json
import glob
import tqdm

pd.set_option('max.columns',131)
%matplotlib inline
%pylab inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

from lightgbm import LGBMClassifier


Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('./data_labels_old.csv',index_col=0).dropna(subset=['y'])

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
df.shape

(1164, 16)

In [5]:
df_clean = pd.DataFrame(index=df.index)
df_clean['title'] = df['watch-title']

In [6]:
clean_date = df['watch-time-text'].str.extract(r'(\d+) de ([a-z]+)\. de (\d+)')
clean_date[0] = clean_date[0].map(lambda x: "0"+x[0] if len(x) == 1 else x)

month_map = {
    'jan':'Jan',
    'fev':'Feb',
    'mar':'Mar',
    'abr':'Apr',
    'mai':'May',
    'jun':'Jun',
    'jul':'Jul',
    'ago':'Aug',
    'set':'Sep',
    'out':'Oct',
    'nov':'Nov',
    'dez':'Dec'
}

clean_date[1] = clean_date[1].map(month_map)
clean_date = clean_date.apply(lambda x: ' '.join(x), axis=1)
df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [7]:
## Clean View Count
views = df['watch-view-count'].str.extract(r'(\d+\.?\d*)', expand=False).str.replace('.','').fillna(0).astype(int)
df_clean['views'] = views

In [8]:
features = pd.DataFrame(index=df_clean.index)
y = df['y'].copy()

In [9]:
features['time_since_pub'] = (pd.to_datetime('2019-12-03') - df_clean['date'])/np.timedelta64(1,'D')
features['views'] = df_clean.views
features['views_per_day'] = features.views/features.time_since_pub
features.drop(['time_since_pub'],axis=1,inplace=True)

In [10]:
mask_train = df_clean.date <'2019-04-01'
mask_val = df_clean.date>='2019-04-01'

In [11]:
Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((555, 2), (609, 2), (555,), (609,))

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
title_train = df_clean[mask_train].title
title_val = df_clean[mask_val].title

title_vec = TfidfVectorizer(min_df=2, ngram_range=(1,3))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [42]:
from scipy.sparse import hstack, vstack

In [43]:
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [44]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((555, 1146), (609, 1146))

# Random Forest

In [45]:
mdl_rf = RandomForestClassifier(n_estimators=1000,random_state=0, min_samples_leaf=1, class_weight='balanced')
mdl_rf.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [46]:
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:,1]

In [47]:
average_precision_score(yval,p_rf), roc_auc_score(yval,p_rf)

(0.22228951304206077, 0.6914990859232175)

# LGBM

In [48]:
args = [0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 3]
lr = args[0]
max_depth = args[1]
min_child_samples = args[2]
subsample = args[3]
colsample_bytree = args[4]
n_estimators = args[5]

min_df = args[6]
ngram_range = (1, args[7])

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                    min_child_samples=min_child_samples, subsample=subsample,
                    colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                    random_state=0, class_weight='balanced', n_jobs=6)

mdl_lgbm.fit(Xtrain_wtitle, ytrain)

p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:,1]

In [49]:
average_precision_score(yval,p_lgbm), roc_auc_score(yval,p_lgbm)

(0.247808743128664, 0.6717874624049065)

# Logistic Regression

In [50]:
from sklearn.pipeline import make_pipeline

In [51]:
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

#scaler = StandardScaler()
#scaler = MaxAbsScaler()

#Xtrain_wtitle2[:,:2] = scaler.fit_transform(Xtrain_wtitle2[:,:2].todense())
#Xval_wtitle2[:,:2] = scaler.transform(Xval_wtitle2[:,:2].todense())

#Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
#Xval_wtitle2 = scaler.transform(Xval_wtitle2)

lr_pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression(C=0.5, penalty='l2', n_jobs=6,random_state=0))
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

Pipeline(memory=None,
         steps=[('maxabsscaler', MaxAbsScaler(copy=True)),
                ('logisticregression',
                 LogisticRegression(C=0.5, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=6, penalty='l2',
                                    random_state=0, solver='warn', tol=0.0001,
                                    verbose=0, warm_start=False))],
         verbose=False)

In [52]:
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:,1]

In [53]:
average_precision_score(yval,p_lr), roc_auc_score(yval,p_lr)

(0.2122903202306507, 0.683699946924574)

## Scores

RF - (0.22228951304206077, 0.6914990859232175)  
LGBM - (0.23779186526938, 0.6883293035324645)  
LR - (0.2130656734216374, 0.6834345697941853)  

# Ensemble

In [54]:
## simple mean
p = (p_lr+p_rf+p_lgbm)/3

average_precision_score(yval,p), roc_auc_score(yval,p)

(0.2430446183895379, 0.6943445184879401)

In [55]:
pd.DataFrame({'LR':p_lr,'RF':p_rf, 'LGBM':p_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.808196,0.471786
RF,0.808196,1.0,0.486359
LGBM,0.471786,0.486359,1.0


**Low correlation value for models with similar performances indicates that the ensemble should improve the performance**

In [57]:
## weighted mean

p = 0.4*p_rf + 0.6*p_lgbm

average_precision_score(yval,p), roc_auc_score(yval,p)

(0.24809974466463763, 0.690865129445067)

## Save Models

In [58]:
import joblib as jb

In [59]:
jb.dump(mdl_lgbm, 'lgbm_20200208.pkl.z')
jb.dump(mdl_rf, 'random_forest_20200208.pkl.z')
jb.dump(title_vec, 'title_vectorizer_20200208.pkl.z')

['title_vectorizer_20200208.pkl.z']