In [1]:
import pandas as pd
import numpy as np
import re
import time
import bs4 as bs4
import json
import glob
import tqdm

pd.set_option('max.columns',131)
%matplotlib inline
%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the scraped video table and keep only the rows that already have a label in `y`.
df = (
    pd.read_csv('./raw_with_label_old.csv', index_col=0)
    .dropna(subset=['y'])
)

In [4]:
# (rows, columns) of the labeled subset.
df.shape

(498, 16)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 1.0 - Data Cleaning

In [7]:
## Create DF to store clean values
# Starts with no columns but shares df's index, so cleaned columns assigned
# later align row-for-row with the raw data.
df2 = pd.DataFrame(index=df.index)

In [8]:
# Carry the raw video title over unchanged.
df2['title'] = df['watch-title']

In [9]:
## Clean Watch Time to DATE
# Portuguese month abbreviations -> English abbreviations understood by "%b".
month_map = {
    'jan':'Jan',
    'fev':'Feb',
    'mar':'Mar',
    'abr':'Apr',
    'mai':'May',
    'jun':'Jun',
    'jul':'Jul',
    'ago':'Aug',
    'set':'Sep',
    'out':'Oct',
    'nov':'Nov',
    'dez':'Dec'
}

# Pull (day, month abbreviation, year) out of text such as
# "Publicado em 2 de nov. de 2018". Rows that do not match yield NaN in all
# three columns.
clean_date = df['watch-time-text'].str.extract(r'(\d+) de ([a-z]+)\. de (\d+)')

# Zero-pad single-digit days. The vectorized .str method passes NaN through,
# whereas calling len() on a NaN raised TypeError.
clean_date[0] = clean_date[0].str.zfill(2)
clean_date[1] = clean_date[1].map(month_map)

# Element-wise concatenation propagates NaN, so an unparseable row becomes NaT
# below instead of crashing the cell inside ' '.join(...).
clean_date = clean_date[0] + ' ' + clean_date[1] + ' ' + clean_date[2]
df2['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [10]:
## Clean View Count
# Counts look like "20.378 visualizações", with '.' as the thousands separator.
# The pattern accepts any number of '.'-separated digit groups, so a value such
# as "1.234.567" is captured whole. The earlier pattern stopped after the first
# separator and produced 1234.
# regex=False makes str.replace treat '.' as a literal dot regardless of the
# pandas version's default.
views = (
    df['watch-view-count']
    .str.extract(r'(\d+(?:\.\d+)*)', expand=False)
    .str.replace('.', '', regex=False)
    .fillna(0)  # rows with no digits are counted as zero views
    .astype(int)
)
df2['views'] = views

# 2.0 - Features

In [11]:
# Empty feature frame sharing df2's index; columns are added in the next cell.
features = pd.DataFrame(index=df2.index)
# Independent copy of the label column, so it can be sliced without touching df.
y = df['y'].copy()

In [12]:
# Age of each video in days, measured against a fixed reference date.
reference_date = pd.to_datetime('2019-12-03')
days_since_pub = (reference_date - df2['date']) / np.timedelta64(1, 'D')

# Only the view-based columns are kept as features; the age is used solely as
# the denominator of the daily view rate.
features['views'] = df2['views']
features['views_per_day'] = features['views'] / days_since_pub

In [16]:
# Time-based split: rows dated before the cutoff are used for training, rows on
# or after it for validation.
split_date = "2019-04-01"
mask_train = df2['date'] < split_date
mask_val = df2['date'] >= split_date

Xtrain, ytrain = features[mask_train], y[mask_train]
Xval, yval = features[mask_val], y[mask_val]

In [17]:
# Sanity check: feature and label row counts should match within each split.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((228, 2), (270, 2), (228,), (270,))

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Titles for each side of the time split.
title_train = df2.loc[mask_train, 'title']
title_val = df2.loc[mask_val, 'title']

# The vocabulary and IDF weights are learned from the training titles only;
# validation titles are projected onto that fitted vocabulary.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [23]:
# (training rows, vocabulary size kept by the vectorizer).
title_bow_train.shape

(228, 193)

In [24]:
from scipy.sparse import hstack, vstack

In [25]:
# Append the TF-IDF title columns to the right of the numeric view features.
Xtrain_wtitle = hstack([Xtrain,title_bow_train])
Xval_wtitle = hstack([Xval,title_bow_val])

In [27]:
# Both matrices must have the same number of columns for the model to score the validation set.
Xtrain_wtitle.shape, Xval_wtitle.shape

((228, 195), (270, 195))

In [34]:
# 1000-tree random forest with a fixed seed; class_weight='balanced' asks
# scikit-learn to weight classes inversely to their frequency in ytrain.
mdl = RandomForestClassifier(n_estimators=1000,random_state=0, class_weight='balanced')
mdl.fit(Xtrain_wtitle,ytrain)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [36]:
# Probability of the second class in mdl.classes_ for each validation row
# (presumably the positive label — TODO confirm the label encoding of `y`).
p = mdl.predict_proba(Xval_wtitle)[:,1]

In [37]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [38]:
# Average precision of the validation scores against the true labels.
average_precision_score(yval, p)

0.1918043901336543

In [39]:
# ROC AUC of the validation scores; 0.5 corresponds to random ranking.
roc_auc_score(yval,p)

0.5848024316109421

# 3.0 - Active Learning

In [40]:
# Re-read the same file and keep the rows that have NOT been labeled yet.
# The frame was previously assigned to the misspelled name `df_unlabedlabed`,
# which left `df_unlabed` undefined on the next line (NameError on a fresh kernel).
df_unlabed = pd.read_csv('./raw_with_label_old.csv', index_col=0)
# dropna(how='all') discards rows in which every column is empty. The explicit
# .copy() makes this an independent frame, so adding the `p` column later cannot
# trigger SettingWithCopyWarning.
df_unlabed = df_unlabed[df_unlabed['y'].isnull()].dropna(how='all').copy()
df_unlabed.shape

(674, 16)

In [42]:
# Peek at the first two unlabeled rows.
df_unlabed.head(2)

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0
501,Kaggle Mercari Price Suggestion Challenge (1 p...,,2.167 visualizações,Publicado em 2 de nov. de 2018,Educação,Kaggle Mercari Price Suggestion Challenge (1 p...,ML Trainings\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarreg...,2.167 visualizações\n\n\n\n\n\n\n\n61\n\nGosto...,https://i.ytimg.com/vi/QFR0IHbzA30/maxresdefau...,1280.0,720.0,Pawel Jankiewicz and Konstantin Lopuhin share ...,1280.0,720.0,price suggestion,/channel/UCeq6ZIlvC9SVsfhfKnSvM9w
502,OpenAI Gym and Python for Q-learning - Reinfor...,,20.378 visualizações,Publicado em 14 de out. de 2018,Educação,OpenAI Gym and Python for Q-learning - Reinfor...,deeplizard\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarregan...,20.378 visualizações\n\n\n\n\n\n\n\n397\n\nGos...,https://i.ytimg.com/vi/QK_PP_2KgGE/maxresdefau...,1280.0,720.0,Welcome back to this series on reinforcement l...,1280.0,720.0,experience replay,/channel/UC4UJ26WkceqONNF5S26OiVw


In [43]:
# Cleaned-values frame for the unlabeled rows, aligned on df_unlabed's index.
df_clean_u = pd.DataFrame(index=df_unlabed.index)
# Carry the raw video title over unchanged.
df_clean_u['title'] = df_unlabed['watch-title']

In [45]:
## Clean Watch Time to DATE
# Portuguese month abbreviations -> English abbreviations understood by "%b".
month_map = {
    'jan':'Jan',
    'fev':'Feb',
    'mar':'Mar',
    'abr':'Apr',
    'mai':'May',
    'jun':'Jun',
    'jul':'Jul',
    'ago':'Aug',
    'set':'Sep',
    'out':'Oct',
    'nov':'Nov',
    'dez':'Dec'
}

# Pull (day, month abbreviation, year) out of text such as
# "Publicado em 2 de nov. de 2018". Rows that do not match yield NaN in all
# three columns.
clean_date = df_unlabed['watch-time-text'].str.extract(r'(\d+) de ([a-z]+)\. de (\d+)')

# Zero-pad single-digit days. The vectorized .str method passes NaN through,
# whereas calling len() on a NaN raised TypeError.
clean_date[0] = clean_date[0].str.zfill(2)
clean_date[1] = clean_date[1].map(month_map)

# Element-wise concatenation propagates NaN, so an unparseable row becomes NaT
# below instead of crashing the cell inside ' '.join(...).
# NOTE(review): NaT dates lead to NaN in views_per_day; such rows need to be
# dropped or imputed before they are scored by the model.
clean_date = clean_date[0] + ' ' + clean_date[1] + ' ' + clean_date[2]
df_clean_u['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [47]:
## Clean View Count
# Counts look like "20.378 visualizações", with '.' as the thousands separator.
# The pattern accepts any number of '.'-separated digit groups, so a value such
# as "1.234.567" is captured whole. The earlier pattern stopped after the first
# separator and produced 1234.
# regex=False makes str.replace treat '.' as a literal dot regardless of the
# pandas version's default.
views = (
    df_unlabed['watch-view-count']
    .str.extract(r'(\d+(?:\.\d+)*)', expand=False)
    .str.replace('.', '', regex=False)
    .fillna(0)  # rows with no digits are counted as zero views
    .astype(int)
)
df_clean_u['views'] = views

In [52]:
# Empty feature frame for the unlabeled rows, sharing df_clean_u's index.
features_u = pd.DataFrame(index=df_clean_u.index)

In [53]:
# Age of each unlabeled video in days, measured against the same fixed
# reference date used for the labeled features.
reference_date_u = pd.to_datetime('2019-12-03')
days_since_pub_u = (reference_date_u - df_clean_u['date']) / np.timedelta64(1, 'D')

# Only the view-based columns are kept as features; the age is used solely as
# the denominator of the daily view rate.
features_u['views'] = df_clean_u['views']
features_u['views_per_day'] = features_u['views'] / days_since_pub_u

In [50]:
# Project unlabeled titles onto the already-fitted vectorizer (transform only,
# no refit), so the columns line up with the training matrix.
title_u = df_clean_u['title']
title_bow_u = title_vec.transform(title_u)

In [59]:
# Same column layout as the training matrix: numeric view features first, then TF-IDF title columns.
Xu_wtitle = hstack([features_u,title_bow_u])

In [62]:
# Probability of the second class in mdl.classes_ for every unlabeled row
# (presumably the positive label — TODO confirm the label encoding of `y`).
pu = mdl.predict_proba(Xu_wtitle)[:,1]

In [63]:
# Attach the scores positionally; relies on Xu_wtitle having been built in df_unlabed's row order.
df_unlabed['p'] = pu

In [64]:
# Peek at the first two rows, now including the predicted probability `p`.
df_unlabed.head(2)

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p
501,Kaggle Mercari Price Suggestion Challenge (1 p...,,2.167 visualizações,Publicado em 2 de nov. de 2018,Educação,Kaggle Mercari Price Suggestion Challenge (1 p...,ML Trainings\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarreg...,2.167 visualizações\n\n\n\n\n\n\n\n61\n\nGosto...,https://i.ytimg.com/vi/QFR0IHbzA30/maxresdefau...,1280.0,720.0,Pawel Jankiewicz and Konstantin Lopuhin share ...,1280.0,720.0,price suggestion,/channel/UCeq6ZIlvC9SVsfhfKnSvM9w,0.093
502,OpenAI Gym and Python for Q-learning - Reinfor...,,20.378 visualizações,Publicado em 14 de out. de 2018,Educação,OpenAI Gym and Python for Q-learning - Reinfor...,deeplizard\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarregan...,20.378 visualizações\n\n\n\n\n\n\n\n397\n\nGos...,https://i.ytimg.com/vi/QK_PP_2KgGE/maxresdefau...,1280.0,720.0,Welcome back to this series on reinforcement l...,1280.0,720.0,experience replay,/channel/UC4UJ26WkceqONNF5S26OiVw,0.037


In [74]:
# Flag unlabeled rows whose predicted probability lies in [0.26, 1] (both ends inclusive).
mask_u = df_unlabed['p'].between(0.26, 1)

In [84]:
# Every row inside the probability band selected by mask_u is sent for labeling.
hard = df_unlabed[mask_u]

# Add a random draw from the remaining rows. The fixed seed makes the exported
# batch reproducible across re-runs, and the sample size is capped so the cell
# does not raise ValueError when fewer than 31 rows fall outside the mask.
# NOTE(review): the name `random` probably shadows a `random` injected by
# %pylab. It is kept because the export cell reads it; rename both together.
rest = df_unlabed[~mask_u]
random = rest.sample(n=min(31, len(rest)), random_state=0)

In [86]:
# Write the selected rows (thresholded rows followed by the random draw) to disk for manual labeling.
pd.concat([hard,random]).to_csv('./active_label.csv')