In [48]:
import pandas as pd
import numpy as np
import re
import time
import bs4 as bs4
import json
import glob
import tqdm

pd.set_option('max.columns',131)
%matplotlib inline
%pylab inline

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [49]:
# Load the labelled dataset (first CSV column is the index) and keep only
# the rows that actually have a label in `y`.
df = (
    pd.read_csv('./data_labels_old.csv', index_col=0)
    .dropna(subset=['y'])
)

In [50]:
df.shape

(1164, 16)

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 1.0 - Data Cleaning

In [52]:
# Start the cleaned frame from the raw title column, renamed to `title`;
# it keeps the same index as `df`.
df_clean = df['watch-title'].rename('title').to_frame()

In [53]:
# Split the publication text into day / month abbreviation / year
# (columns 0, 1 and 2), e.g. strings shaped like "3 de dez. de 2019".
date_parts = df['watch-time-text'].str.extract(r'(\d+) de ([a-z]+)\. de (\d+)')

# Left-pad single-digit days so every day is two characters long.
date_parts[0] = date_parts[0].map(lambda day: day if len(day) != 1 else "0" + day)

# Lower-case month abbreviations found in the text -> abbreviations that `%b` parses.
month_map = dict(zip(
    ['jan', 'fev', 'mar', 'abr', 'mai', 'jun', 'jul', 'ago', 'set', 'out', 'nov', 'dez'],
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))
date_parts[1] = date_parts[1].map(month_map)

# Re-assemble "DD Mon YYYY" strings row by row and parse them.
clean_date = date_parts.apply(lambda row: ' '.join(row), axis=1)
df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [54]:
## Clean View Count
# Dots are stripped before the integer cast, i.e. they are treated as
# thousands separators. The pattern therefore has to accept any number of
# ".ddd" groups: the previous `(\d+\.?\d*)` stopped after the first one, so
# "1.234.567" was read as 1234. `regex=False` makes the '.' removal an explicit
# literal replacement instead of relying on pandas' single-character special
# case. Rows with no digits at all fall back to 0 views.
views = (
    df['watch-view-count']
    .str.extract(r'(\d+(?:\.\d+)*)', expand=False)
    .str.replace('.', '', regex=False)
    .fillna(0)
    .astype(int)
)
df_clean['views'] = views

# 2.0 - Features

In [55]:
# Target vector: an independent copy of the label column.
y = df['y'].copy()
# Feature frame starts with no columns and shares the cleaned data's index.
features = pd.DataFrame(index=df_clean.index)

In [56]:
# Age of each video in days, measured against the fixed date 2019-12-03.
# Kept as a local Series because it is only needed to derive views_per_day
# and is not kept as a feature.
days_since_pub = (pd.to_datetime('2019-12-03') - df_clean['date']) / np.timedelta64(1, 'D')

features['views'] = df_clean['views']
features['views_per_day'] = features['views'] / days_since_pub

In [57]:
features.shape

(1164, 2)

In [58]:
features.head()

Unnamed: 0,views,views_per_day
0,28028,61.464912
394,1161,21.109091
393,141646,809.405714
392,325,21.666667
391,61,7.625


In [59]:
# Time-based split: videos dated before 2019-04-01 are used for training,
# those dated on or after it for validation.
mask_val = df_clean['date'] >= '2019-04-01'
mask_train = df_clean['date'] < '2019-04-01'

In [60]:
# Apply the boolean masks to the features and to the target.
Xtrain = features.loc[mask_train]
Xval = features.loc[mask_val]
ytrain = y.loc[mask_train]
yval = y.loc[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((555, 2), (609, 2), (555,), (609,))

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
# Titles for each side of the split.
title_train = df_clean.loc[mask_train, 'title']
title_val = df_clean.loc[mask_val, 'title']

# TF-IDF over 1- to 3-grams, ignoring terms seen in fewer than 2 training titles.
# The vocabulary is fitted on the training titles only and reused for validation.
title_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [63]:
from scipy.sparse import hstack, vstack

In [64]:
title_bow_train.shape

(555, 1144)

In [65]:
# Put the numeric feature columns side by side with the title TF-IDF columns.
Xtrain_wtitle, Xval_wtitle = (
    hstack([Xtrain, title_bow_train]),
    hstack([Xval, title_bow_val]),
)

In [66]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((555, 1146), (609, 1146))

In [67]:
# Random-forest baseline: 1000 trees, fixed seed, class-balanced weights.
rf_params = dict(
    n_estimators=1000,
    random_state=0,
    min_samples_leaf=1,
    class_weight='balanced',
)
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [68]:
# Keep column 1 of the predicted probabilities for each validation row.
val_proba = mdl.predict_proba(Xval_wtitle)
p = val_proba[:, 1]

In [69]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [70]:
print('Results w/ min_samples_leaf=1, 1k trees and ngram_range=(1,3)\n')
# Average precision first, ROC AUC second, both on the validation split.
for label, metric in (('ap:', average_precision_score), ('auc:', roc_auc_score)):
    print(label, metric(yval, p))

Results w/ min_samples_leaf=1, 1k trees and ngram_range=(1,3)

ap: 0.22228951304206077
auc: 0.6914990859232175


# LightGBM

In [71]:
from lightgbm import LGBMClassifier

In [72]:
from scipy.sparse import hstack, vstack

In [73]:
# LightGBM baseline with library defaults, apart from the seed and balanced class weights.
lgbm_params = dict(random_state=0, class_weight='balanced')
mdl = LGBMClassifier(**lgbm_params)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
               objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [74]:
# Keep column 1 of the LightGBM predicted probabilities on the validation rows.
val_proba = mdl.predict_proba(Xval_wtitle)
p = val_proba[:, 1]

In [75]:
# Average precision first, ROC AUC second, both on the validation split.
for label, metric in (('ap:', average_precision_score), ('auc:', roc_auc_score)):
    print(label, metric(yval, p))

ap: 0.17930936519845178
auc: 0.6177389868490889


**Tuning will probably improve the results**

## Bayesian Optimization

In [76]:
from skopt import forest_minimize

In [77]:
def tune_lgbm(args):
    """Objective function for the optimizer: score one TF-IDF + LightGBM setup.

    `args` is a positional sequence read as:
    [0] learning rate, [1] max tree depth, [2] min_child_samples,
    [3] subsample, [4] colsample_bytree, [5] n_estimators,
    [6] TF-IDF min_df, [7] upper bound of the TF-IDF n-gram range.

    Reads the notebook-level `title_train`, `title_val`, `Xtrain`, `Xval`,
    `ytrain` and `yval`. Prints `args` and the validation ROC AUC, and
    returns the negated validation average precision, so that a minimizer
    ends up maximizing average precision.
    """
    print(args)

    # Model hyper-parameters.
    learning_rate, depth = args[0], args[1]
    leaf_min_samples, row_fraction = args[2], args[3]
    col_fraction, n_trees = args[4], args[5]

    # Text-vectorizer hyper-parameters.
    tfidf_min_df, max_ngram = args[6], args[7]

    # Re-fit the title vectorizer for this configuration (training titles only).
    vectorizer = TfidfVectorizer(min_df=tfidf_min_df, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    train_matrix = hstack([Xtrain, bow_train])
    val_matrix = hstack([Xval, bow_val])

    booster = LGBMClassifier(
        learning_rate=learning_rate,
        num_leaves=2 ** depth,  # leaf count of a full binary tree of that depth
        max_depth=depth,
        min_child_samples=leaf_min_samples,
        subsample=row_fraction,
        colsample_bytree=col_fraction,
        bagging_freq=1,
        n_estimators=n_trees,
        random_state=0,
        class_weight='balanced',
        n_jobs=6,
    )
    booster.fit(train_matrix, ytrain)

    val_scores = booster.predict_proba(val_matrix)[:, 1]
    print(roc_auc_score(yval, val_scores))

    # Negated because the caller minimizes.
    return -average_precision_score(yval, val_scores)



In [78]:
# Search space; the order must match the positions that tune_lgbm reads from `args`.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # lr
    (1, 10),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # ngram_range
]

# 50 evaluations in total, the first 20 at random points.
res = forest_minimize(
    tune_lgbm,
    space,
    random_state=160745,
    n_random_starts=20,
    n_calls=50,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]
0.6016099545910243
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.6894
Function value obtained: -0.1404
Current minimum: -0.1404
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]
0.5507312614259597
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.3842
Function value obtained: -0.1203
Current minimum: -0.1404
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]
0.587884059680368
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.5748
Function value obtained: -0.1487
Current minimum: -0.1487
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.68

In [79]:
res.x

[0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 5]

### LGBM 
**AP = 0.2378, AUC = 0.6883293035324645**

Parameters:
[0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 5]

## Logistic Reg

In [80]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [130]:
# Work on CSR copies so the stacked matrices used by the tree models stay untouched.
train_csr = csr_matrix(Xtrain_wtitle.copy())
val_csr = csr_matrix(Xval_wtitle.copy())

# MaxAbsScaler is the active choice here; StandardScaler (imported above)
# is the alternative that can be swapped in.
scaler = MaxAbsScaler()

# Fit the scaler on the training matrix only, then reuse it for validation.
Xtrain_wtitle2 = scaler.fit_transform(train_csr)
Xval_wtitle2 = scaler.transform(val_csr)

In [131]:
Xval_wtitle2.shape

(609, 1146)

In [183]:
# Logistic regression on the scaled matrix, with C set to 0.051.
logreg_params = dict(n_jobs=6, random_state=0, C=0.051)
mdl = LogisticRegression(**logreg_params)
mdl.fit(Xtrain_wtitle2, ytrain)

LogisticRegression(C=0.051, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=6, penalty='l2', random_state=0,
                   solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [184]:
# Keep column 1 of the predicted probabilities on the scaled validation matrix.
val_proba = mdl.predict_proba(Xval_wtitle2)
p = val_proba[:, 1]

In [185]:
# (average precision, ROC AUC) on the validation split.
tuple(metric(yval, p) for metric in (average_precision_score, roc_auc_score))

(0.22366713615797784, 0.6566314796249337)

## Results:

Logistic Regression - StandardScaler - Without tuning  
ap/auc = (0.20905963592265248, 0.6619095358848853)

Logistic Regression - MaxAbsScaler - Without tuning  
ap/auc = (0.20635826806997412, 0.6864716636197441)

Logistic Regression - MaxAbsScaler - C = .051  
ap/auc = (0.22366713615797784, 0.6566314796249337)