In [1]:
import pandas as pd
import numpy as np
import re
import time
import bs4 as bs4
import json
import glob
import tqdm

# Raise the pandas column display limit to 131 so wide frames are not elided.
# NOTE(review): 'max.columns' is not the full option name; presumably pandas
# resolves it to 'display.max_columns' — confirm on the pandas version in use.
pd.set_option('max.columns',131)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed by this cell). Explicit imports are
# usually preferred; check later cells before removing it.
%pylab inline


Populating the interactive namespace from numpy and matplotlib


# 1.0 - Active Learning Results

In [4]:
# Load the previously labelled set and keep only the rows whose label `y` is present.
df1 = pd.read_csv('./raw_with_label_old.csv', index_col=0).dropna(subset=['y'])
df1.shape

(498, 16)

In [6]:
# Load the active-learning batch, keep only the rows whose label `y` is present,
# and flag every row with novo=1 so it can be told apart from the older labels
# after the two sets are concatenated.
# The frame is built in one chain with .assign so the new column is added to a
# fresh object instead of being set on a boolean-filtered frame, which triggers
# pandas' SettingWithCopyWarning.
df2 = (
    pd.read_csv('./active_labels1_done.csv', index_col=0)
    .dropna(subset=['y'])
    .assign(novo=1)
)
df2.shape

(100, 18)

In [7]:
from sklearn.metrics import average_precision_score, roc_auc_score

In [8]:
# Score the stored predictions `p` against the labels `y` of the active-learning
# batch: (average precision, ROC AUC).
tuple(metric(df2['y'], df2['p']) for metric in (average_precision_score, roc_auc_score))

(0.2037344613689981, 0.5386250885896527)

In [13]:
# Stack both labelled sets; the prediction column `p` exists only in df2 and is
# dropped first. sort=True orders the resulting columns.
df = pd.concat(
    [df1, df2.drop(columns='p')],
    sort=True,
)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Start the cleaned frame on the same index as `df`, carrying over the title and
# a `new` flag (rows without a `novo` value get 0).
df_clean = pd.DataFrame(index=df.index).assign(
    title=df['watch-title'],
    new=df['novo'].fillna(0),
)

# 2.0 - Data Cleaning

In [19]:
## Clean Watch Time to DATE
# Extract day, lowercase month abbreviation and year from text shaped like
# "<day> de <mon>. de <year>". Rows that do not match yield NaN in all columns.
clean_date = df['watch-time-text'].str.extract(r'(\d+) de ([a-z]+)\. de (\d+)')
# Zero-pad single-digit days. .str.zfill passes NaN through, whereas calling
# len() on a NaN inside a lambda raises TypeError for unmatched rows.
clean_date[0] = clean_date[0].str.zfill(2)

# Portuguese month abbreviations -> English ones understood by the %b directive.
month_map = {
    'jan':'Jan',
    'fev':'Feb',
    'mar':'Mar',
    'abr':'Apr',
    'mai':'May',
    'jun':'Jun',
    'jul':'Jul',
    'ago':'Aug',
    'set':'Sep',
    'out':'Oct',
    'nov':'Nov',
    'dez':'Dec'
}
clean_date[1] = clean_date[1].map(month_map)
# Vectorised "DD Mon YYYY" concatenation: a missing component propagates as NaN
# (parsed to NaT below) instead of breaking a row-wise ' '.join.
clean_date = clean_date[0] + ' ' + clean_date[1] + ' ' + clean_date[2]
df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [21]:
## Clean View Count
# Capture the leading number including every '.' thousands group. The previous
# pattern allowed a single separator only, so "1.234.567" was cut to "1.234".
# The dots are then removed as a literal character (regex=False makes this
# independent of the pandas-version default), missing counts become 0, and the
# result is cast to int.
views = (
    df['watch-view-count']
    .str.extract(r'(\d+(?:\.\d+)*)', expand=False)
    .str.replace('.', '', regex=False)
    .fillna(0)
    .astype(int)
)
df_clean['views'] = views

# 3.0 - Features

In [22]:
# Target vector (copied so later edits do not touch `df`) and an empty feature
# table that shares the index of the cleaned frame.
y = df.loc[:, 'y'].copy()
features = pd.DataFrame(index=df_clean.index)

In [23]:
# Feature columns: raw view count and views per day since publication.
# The age in days is measured against the fixed date 2019-12-03
# (presumably the collection date — confirm) and is not kept as a column.
features['views'] = df_clean['views']
features['views_per_day'] = features['views'].div(
    (pd.to_datetime('2019-12-03') - df_clean['date']) / np.timedelta64(1, 'D')
)

In [25]:
# Peek at the first two feature rows.
features.head(n=2)

Unnamed: 0,views,views_per_day
0,28028,61.464912
1,1131,2.960733


## 3.1 - Increase Validation

In [26]:
# Time-based split at 2019-04-01: training uses older videos that are not part of
# the new (active-learning) batch; validation uses everything from that date on.
mask_train = df_clean['date'].lt('2019-04-01') & df_clean['new'].eq(0)
mask_val = df_clean['date'].ge('2019-04-01')

In [44]:
# Apply the boolean masks to the features and to the target.
Xtrain, ytrain = features.loc[mask_train], y.loc[mask_train]
Xval, yval = features.loc[mask_val], y.loc[mask_val]

In [29]:
# Shapes of the four split pieces, in the order Xtrain, Xval, ytrain, yval.
tuple(part.shape for part in (Xtrain, Xval, ytrain, yval))

((228, 2), (316, 2), (228, 2), (316, 2))

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Titles for the same train / validation rows selected by the masks.
title_train = df_clean.loc[mask_train, 'title']
title_val = df_clean.loc[mask_val, 'title']

In [32]:
# TF-IDF over the titles, dropping terms that occur in fewer than 2 training
# documents. The vocabulary is fitted on the training titles only and then
# reused to transform the validation titles.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train, title_bow_val = (
    title_vec.fit_transform(title_train),
    title_vec.transform(title_val),
)

In [33]:
title_bow_train.shape

(228, 193)

In [34]:
from scipy.sparse import hstack, vstack

In [51]:
# Append the title TF-IDF columns to the right of the numeric features.
Xtrain_wtitle, Xval_wtitle = (
    hstack(blocks)
    for blocks in ([Xtrain, title_bow_train], [Xval, title_bow_val])
)

In [52]:
# Check that both stacked matrices have the same number of columns.
# (The validation name was misspelled `Xval_wstitle`, which raises NameError
# on a fresh kernel.)
Xtrain_wtitle.shape, Xval_wtitle.shape

((228, 195), (316, 195))

In [53]:
# Random forest with 1000 trees, a fixed seed and class weights set to
# 'balanced', fitted on the numeric + title features.
mdl = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=1000,
    random_state=0,
)
mdl.fit(X=Xtrain_wtitle, y=ytrain)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [54]:
# Validation scores: second column of predict_proba
# (assumed to be the positive class — confirm against mdl.classes_).
p = mdl.predict_proba(X=Xval_wtitle)[:, 1]

In [56]:
# Validation quality of the random forest: (average precision, ROC AUC).
tuple(metric(yval, p) for metric in (average_precision_score, roc_auc_score))

(0.1872802752830275, 0.5935436218282933)