In [1]:
%%time
import pandas as pd
import numpy as np
import gc
import os
import random
import glob
from tqdm import tqdm
#import tensorflow as tf
import MeCab 
import re
import warnings 
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib_venn import venn2
%matplotlib inline

# Raise pandas' display limits so wide/long frames are shown untruncated,
# and print floats with 8 digits of precision.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
    ("display.precision", 8),
):
    pd.set_option(option_name, option_value)

def seed_everything(seed):
    """Seed every random number source this notebook uses.

    Sets ``PYTHONHASHSEED`` in the environment and seeds both the stdlib
    ``random`` module and NumPy's legacy global generator.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # TensorFlow seeding is disabled together with the tensorflow import:
    # tf.random.set_seed(seed)
    


# seed
# Fix the RNG seeds once, before any data is loaded.
seed = 223
seed_everything(seed)

# load train test
train = pd.read_csv('../../data/input/probspace/train_data.csv')
# Bucket the target `y` into 7 right-closed bins whose edges grow by powers of ten (labels 1..7).
# `y_bin` is what the fold splitters further down stratify on (`folds.split(..., train_df['y_bin'])`).
# NOTE(review): pd.cut uses right-closed bins by default, so y == 0 (or y > 1e10) falls outside
# every bin and becomes NaN, which the astype(int) below cannot convert — presumably y >= 1; confirm.
train['y_bin'] = pd.cut(train['y'], [0, 10, 100,1000,10000,100000,1000000,10000000000], labels=[1,2,3,4,5,6,7])
train['y_bin'] = train['y_bin'].astype(int)
test = pd.read_csv('../../data/input/probspace/test_data.csv')
# Stack train on top of test (fresh 0..n-1 index) so every feature below is computed over both;
# columns that exist only in train are NaN on the test rows.
df = pd.concat([train,test],axis=0).reset_index(drop=True)
print ('train',train.shape)
print ('test',test.shape)
# Single categorical made of the two "disabled" flags, concatenated as strings.
df['comments_ratings'] = df['comments_disabled'].astype(str)+df['ratings_disabled'].astype(str)
    
# Integer-encode the categorical/text columns into `<col>_encoder`. The encoder is fitted on
# train+test together; values are cast to str first, so missing values become the category 'nan'.
for c in ['channelId','channelTitle','collection_date','description','tags','comments_disabled','ratings_disabled','comments_ratings']:
    lbl = LabelEncoder()
    df[c+'_encoder'] = lbl.fit_transform(df[c].astype(str))    
    
# --- Collection-date features -------------------------------------------------
# `collection_date` gets a "20" century prefix and is parsed as year.day.month
# (assumes raw values look like 'YY.DD.MM' — TODO confirm against the CSV).
df["c_date"] = "20" + df["collection_date"]
df["c_date"] = pd.to_datetime(df["c_date"], utc=True, format="%Y.%d.%m")
df["c_year"] = df["c_date"].dt.year
df["c_month"] = df["c_date"].dt.month
df["c_day"] = df["c_date"].dt.day
df["c_dayofweek"] = df["c_date"].dt.dayofweek

# --- Publish-date features ----------------------------------------------------
# NOTE(review): the format string has no time part although hour/minute are extracted below —
# confirm the parsed values really carry a time component on the pandas version in use.
df["publishedAt"] = pd.to_datetime(df["publishedAt"],utc=True, format="%Y-%m-%d")
df["year"] = df["publishedAt"].dt.year
df["month"] = df["publishedAt"].dt.month
# NOTE(review): `.dt.weekofyear` is deprecated since pandas 1.1 and removed in 2.0;
# the replacement is `.dt.isocalendar().week`.
df["weekofyear"] = df["publishedAt"].dt.weekofyear
df["day"] = df["publishedAt"].dt.day
df["dayofweek"] = df["publishedAt"].dt.dayofweek
df["hour"] = df["publishedAt"].dt.hour
df["minute"] = df["publishedAt"].dt.minute    

# --- Age of the video at collection time ---------------------------------------
# NOTE(review): `.dt.seconds` is only the seconds *component* of the timedelta (0..86399),
# not the total elapsed seconds; `.dt.total_seconds()` would give the latter — confirm intent.
df['seconds_from_publish'] = (df['c_date'] - df['publishedAt']).dt.seconds
df['days_from_publish'] = (df['c_date'] - df['publishedAt']).dt.days
# Months/years are approximated as whole 30-day / 365-day blocks.
df['months_from_publish'] = (df['c_date'] - df['publishedAt']).dt.days // 30
df['years_from_publish'] = (df['c_date'] - df['publishedAt']).dt.days // 365

# Day offsets relative to the earliest publish / collection date in train+test.
df['days_from_publish_start'] = (df['publishedAt'] - df['publishedAt'].min()).dt.days
df['days_from_cdate_start'] = (df['c_date'] - df['c_date'].min()).dt.days


# --- Like/dislike interactions -------------------------------------------------
df['diff_likes_dislikes'] = df['likes'] - df['dislikes'] 
# +1 in the denominator avoids dividing by zero dislikes.
df['ratio_likes_dislikes'] = df['likes'] / (df['dislikes'] + 1)

# Rates per elapsed day/month/year. The denominators are NOT offset, so a video younger than
# one day / 30 days / 365 days yields +-inf (or NaN for 0/0). The scaling helper
# `preprocess(train_df, test_df, feats)` defined later in this notebook maps inf/NaN to 0.
df['likes_by_day'] = df['likes'] / df['days_from_publish']
df['dislikes_by_day'] = df['dislikes'] / df['days_from_publish']

df['likes_by_month'] = df['likes'] / df['months_from_publish']
df['dislikes_by_month'] = df['dislikes'] / df['months_from_publish']

df['likes_by_year'] = df['likes'] / df['years_from_publish']
df['dislikes_by_year'] = df['dislikes'] / df['years_from_publish']

train (19720, 18)
test (29582, 16)
CPU times: user 2.52 s, sys: 441 ms, total: 2.96 s
Wall time: 4.08 s


In [2]:
%%time
import re
import string
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from scipy import sparse
from scipy.sparse import hstack, csr_matrix
from sklearn.decomposition import NMF,LatentDirichletAllocation,TruncatedSVD
from gensim.sklearn_api.ldamodel import LdaTransformer
from gensim.models import LdaMulticore
from gensim import corpora
from gensim.models import Word2Vec
import unicodedata

class MecabTokenizer:
    """Thin wrapper around a MeCab tagger created with the ``-Owakati`` option."""

    def __init__(self):
        tagger = MeCab.Tagger('-Owakati')
        # Parse an empty string once before first real use
        # (presumably the usual mecab-python warm-up workaround — confirm).
        tagger.parse('')
        self.wakati = tagger

    def tokenize(self, line):
        """Return the whitespace-separated tokens MeCab produces for ``line``."""
        return self.wakati.parse(line).split()

    def mecab_tokenizer(self, line):
        """Return the surface forms of nouns (名詞) and adjectives (形容詞) in ``line``."""
        wanted_pos = ("名詞", "形容詞")
        keywords = []
        node = self.wakati.parseToNode(line)
        while node:
            # The first comma-separated field of `feature` is the part of speech.
            if node.feature.split(",")[0] in wanted_pos:
                keywords.append(node.surface)
            node = node.next
        return keywords
    
# Single characters that `clean_puncts` pads with a space on each side so they become
# separate tokens. Besides symbols, the list also holds whitespace escapes ('\n', '\t',
# '\xa0', '\u3000', '\u202f') and a few accented Latin letters ('à', 'â', 'é', 'è', 'ï', 'Â', 'Ã', 'Ø').
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


# Literal (lower-case, attribute-free) HTML tags that `clean_html_tags` deletes.
# NOTE(review): inside `preprocess`, `clean_puncts` runs before `clean_html_tags` and `puncts`
# contains '<', '>' and '/', so by then a tag such as '<p>' appears to have been rewritten
# with spaces around those characters and would no longer match these literals — confirm the intended order.
html_tags = ['<p>', '</p>', '<table>', '</table>', '<tr>', '</tr>', '<ul>', '<ol>', '<dl>', '</ul>', '</ol>',
             '</dl>', '<li>', '<dd>', '<dt>', '</li>', '</dd>', '</dt>', '<h1>', '</h1>',
             '<br>', '<br/>', '<strong>', '</strong>', '<span>', '</span>', '<blockquote>', '</blockquote>',
             '<pre>', '</pre>', '<div>', '</div>', '<h2>', '</h2>', '<h3>', '</h3>', '<h4>', '</h4>', '<h5>', '</h5>',
             '<h6>', '</h6>', '<blck>', '<pr>', '<code>', '<th>', '</th>', '<td>', '</td>', '<em>', '</em>']

# HTML entities that `clean_html_tags` replaces with a single space.
# A comma was missing after '&ensp;', which made Python concatenate it with '&quot;'
# into the single bogus entry '&ensp;&quot;', so neither entity was ever matched.
empty_expressions = ['&lt;', '&gt;', '&amp;', '&nbsp;',
                     '&emsp;', '&ndash;', '&mdash;', '&ensp;',
                     '&quot;', '&#39;']

# Leftover markup words that `preprocess` passes to `clean_html_tags` as `stop_words`
# (they are removed as plain substrings, not as whole words).
other = ['span', 'style', 'href', 'input']


def pre_preprocess(x):
    """Coerce ``x`` to ``str`` and return it lower-cased."""
    as_text = str(x)
    return as_text.lower()

def rm_spaces(text):
    """Replace zero-width, exotic-space and stray control characters with a plain space.

    Args:
        text: input string.

    Returns:
        ``text`` with every character listed below turned into ``' '``.
    """
    odd_chars = (
        '\u200b', '\u200e', '\u202a', '\u2009', '\u2028', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\u3000',
        '\x10', '\x7f', '\x9d', '\xad', '\x97', '\x9c', '\x8b', '\x81', '\x80', '\x8c', '\x85', '\x92',
        '\x88', '\x8d', '\x80', '\x8e', '\x9a', '\x94', '\xa0', '\x8f', '\x82', '\x8a', '\x93', '\x90',
        '\x83', '\x96', '\x9b', '\x9e', '\x99', '\x87', '\x84', '\x9f',
    )
    # Every entry is a single code point, so one translate() pass is equivalent
    # to replacing them one after another.
    to_blank = {ord(ch): ' ' for ch in odd_chars}
    return text.translate(to_blank)

def remove_urls(x):
    """Strip the scheme+host part of http(s) URLs and ``quote=...`` fragments from ``x``.

    Only the characters ``[a-zA-Z0-9.-]`` after ``://`` are consumed, so a URL's
    path and query string are left in the text.
    """
    removal_patterns = (
        r'(https?://[a-zA-Z0-9.-]*)',
        # marked "original" by the author (presumably a custom addition — confirm)
        r'(quote=\w+\s?\w+;?\w+)',
    )
    for pattern in removal_patterns:
        x = re.sub(pattern, r'', x)
    return x

def clean_html_tags(x, stop_words=[]):
    """Remove known HTML tags and stop words from ``x`` and blank out HTML entities.

    Args:
        x: input string.
        stop_words: extra substrings to delete (the default list is only read, never mutated).

    Returns:
        The cleaned string. Substitutions run in a fixed order: ``html_tags`` -> '',
        ``empty_expressions`` -> ' ', ``stop_words`` -> ''.
    """
    substitution_plan = (
        (html_tags, ''),
        (empty_expressions, ' '),
        (stop_words, ''),
    )
    for needles, replacement in substitution_plan:
        for needle in needles:
            x = x.replace(needle, replacement)
    return x

def replace_num(text):
    """Delete every run of two or more ASCII digits from ``text``; lone digits are kept.

    Equivalent to stripping 5+-digit runs, then 4-, 3- and 2-digit groups in turn:
    removing a digit run never joins two other runs, so one pass is enough.
    """
    return re.sub('[0-9]{2,}', '', text)

def get_url_num(x):
    """Count the http/https URLs contained in ``x``.

    Args:
        x: text to scan; non-string values are converted with ``str`` first.

    Returns:
        Number of non-overlapping URL matches.
    """
    # Raw string: the previous non-raw literal relied on invalid escape sequences
    # (``\w``, ``\$``, ``\?`` ...), which Python deprecates. The pattern text itself is unchanged.
    pattern = r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+"
    return len(re.findall(pattern, str(x)))


def clean_puncts(x):
    """Surround every character listed in ``puncts`` with a space on each side."""
    for mark in puncts:
        # split/join on the mark is the same as replacing it with ' mark '
        x = f' {mark} '.join(x.split(mark))
    return x

#zenkaku = '０,１,２,３,４,５,６,７,８,９,（,）,＊,「,」,［,］,【,】,＜,＞,？,・,＃,＠,＄,％,＝'.split(',')
#hankaku = '0,1,2,3,4,5,6,7,8,9,q,a,z,w,s,x,c,d,e,r,f,v,b,g,t,y,h,n,m,j,u,i,k,l,o,p'.split(',')

def clean_text_jp(x):
    """Strip Japanese sentence punctuation, line breaks and ASCII punctuation from ``x``.

    Steps, in order:
      1. delete '。', '、', newline, tab and carriage return;
      2. replace every ASCII punctuation character with a space;
      3. apply the LaTeX-marker substitutions;
      4. collapse runs of spaces into one.
    """
    # Japanese full stop / comma, newline, tab, carriage return
    for unwanted in ('。', '、', '\n', '\t', '\r'):
        x = x.replace(unwanted, '')

    ascii_punct = re.compile(r'[!-\/:-@[-`{-~]')
    x = ascii_punct.sub(' ', x)

    # LaTeX markers. NOTE(review): the class above already covers '[', ']' and '\',
    # so these three patterns look unreachable at this point — confirm before relying on them.
    latex_rules = (
        (r'\[math\]', ' LaTex math '),
        (r'\[\/math\]', ' LaTex math '),
        (r'\\', ' LaTex '),
    )
    for pattern, replacement in latex_rules:
        x = re.sub(pattern, replacement, x)

    return re.sub(' +', ' ', x)


def preprocess(data):
    """Run the full text-cleaning pipeline over a pandas Series of strings.

    Each step is applied element-wise, in this order: lower-casing, odd-space
    removal, URL stripping, punctuation padding, multi-digit number removal,
    HTML tag/entity/stop-word removal (stop words = ``other``) and the final
    Japanese/ASCII clean-up.

    Returns:
        A new Series holding the cleaned text.
    """
    pipeline = (
        pre_preprocess,
        rm_spaces,
        remove_urls,
        clean_puncts,
        replace_num,
        lambda text: clean_html_tags(text, stop_words=other),
        clean_text_jp,
    )
    for step in pipeline:
        data = data.apply(lambda text, step=step: step(text))
    return data

def count_regexp_occ(regexp="", text=None):
    """Return how many non-overlapping matches of ``regexp`` occur in ``text``."""
    return sum(1 for _ in re.finditer(regexp, text))

def is_japanese(string):
    """Return True if ``string`` contains at least one CJK ideograph, hiragana or katakana.

    The check uses Unicode character names. CJK unified ideographs are shared with
    Chinese, so a purely Chinese string is also reported as True.

    Args:
        string: text to inspect.

    Returns:
        bool: True as soon as one matching character is found, else False.
    """
    script_markers = ("CJK UNIFIED", "HIRAGANA", "KATAKANA")
    for ch in string:
        # unicodedata.name raises ValueError for unnamed code points (e.g. control
        # characters); passing a default replaces the former bare ``except:``.
        name = unicodedata.name(ch, '')
        if any(marker in name for marker in script_markers):
            return True
    return False

# English stop words as a dict for O(1) membership tests.
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus at the top of the cell;
# the corpus object is no longer reachable under that name after this line.
stopwords = {x: 1 for x in stopwords.words('english')}
# ASCII punctuation characters (string.punctuation) used for the `_num_pun` counts.
punct = set(string.punctuation)

# Tags with the '|' separator turned into spaces.
df['new_tags'] = df['tags'].astype(str).apply(lambda x: x.replace('|',' '))
# One combined text field: channel title + description + title + tags, then run through the
# text-cleaning `preprocess` (which lower-cases, so `all_text_num_cap` below is expected to be 0).
df['all_text'] =  (df['channelTitle'].fillna('') + ' ' + df['description'].fillna('') + ' ' + df['title'].fillna('')+ ' ' + df['new_tags'].fillna('')).astype(str)
df['all_text'] = preprocess(df['all_text'])
text_cols = ['channelTitle','description','title','new_tags','all_text']
# Hand-crafted surface statistics and keyword flags, one set of columns per text column.
for cols in text_cols:   
    # Cast in place; missing values become the literal string 'nan'.
    df[cols] = df[cols].astype(str) 
    # Character-class counts (ASCII upper / lower / digit / alphanumeric).
    df[cols + '_num_cap'] = df[cols].apply(lambda x: count_regexp_occ('[A-Z]', x))
    df[cols + '_num_low'] = df[cols].apply(lambda x: count_regexp_occ('[a-z]', x))
    df[cols + '_num_dig'] = df[cols].apply(lambda x: count_regexp_occ('[0-9]', x))
    df[cols + '_num_engdig'] = df[cols].apply(lambda x: count_regexp_occ('[A-Za-z0-9]', x))    
    df[cols + '_isja'] = df[cols].apply(lambda x: 1 if is_japanese(x) else 0)
    # NOTE(review): despite the name, this is bytes.isalnum() on the UTF-8 encoding, i.e. 1 only
    # when the whole string is non-empty ASCII letters/digits (no spaces) — confirm intent.
    df[cols + '_isalpha'] = df[cols].apply(lambda x: 1 if x.encode('utf-8').isalnum() else 0)
    
    df[cols + '_num_pun'] = df[cols].apply(lambda x: sum(c in punct for c in x))
    df[cols + '_num_space'] = df[cols].apply(lambda x: sum(c.isspace() for c in x))

    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    
    # Share of unique words; +1 in the denominator guards against empty strings.
    df[cols + '_ratio_unique_words'] = df[cols+'_num_unique_words'] / (df[cols+'_num_words']+1) # Count Unique Words    

    # Word-level counts; words are whitespace-separated, which does not segment Japanese text.
    df[cols +'_num_stopwords'] = df[cols].apply(lambda x: len([w for w in x.split() if w in stopwords]))
    df[cols +'_num_words_upper'] = df[cols].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[cols +'_num_words_lower'] = df[cols].apply(lambda x: len([w for w in str(x).split() if w.islower()]))
    df[cols +'_num_words_title'] = df[cols].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    # Keyword flags (substring match, so e.g. 'cm' and 'jp' also fire inside longer words).
    df[cols +'_music'] = df[cols].apply(lambda x: 1 if 'music' in x.lower() else 0)
    df[cols +'_official'] = df[cols].apply(lambda x: 1 if 'official' in x.lower() else 0)
    df[cols +'_ja_official'] = df[cols].apply(lambda x: 1 if '公式' in x else 0) 
    df[cols +'_cm'] = df[cols].apply(lambda x: 1 if 'cm' in x.lower() else 0)     
    df[cols +'_http'] = df[cols].apply(lambda x: 1 if 'http' in x.lower() else 0)    
    df[cols +'_movie'] = df[cols].apply(lambda x: 1 if 'movie' in x.lower() else 0)    
    df[cols +'_jp'] = df[cols].apply(lambda x: 1 if 'jp' in x.lower() else 0)     
    df[cols +'_youtube'] = df[cols].apply(lambda x: 1 if 'youtube' in x.lower() else 0)         
    df[cols +'_jp_movie'] = df[cols].apply(lambda x: 1 if '映画' in x else 0)      
    df[cols +'_jp_director'] = df[cols].apply(lambda x: 1 if '監督' in x else 0)       
    df[cols +'_jp_tohaku'] = df[cols].apply(lambda x: 1 if '東宝' in x else 0)
    
# URL count is only computed for the description (note: `text_cols` is rebound here).
text_cols = ['description']
for cols in text_cols:       
    df[cols + '_url_num'] = df[cols].apply(lambda x: get_url_num(x))

CPU times: user 1min 16s, sys: 848 ms, total: 1min 17s
Wall time: 1min 21s


In [5]:
%%time
### TFIDF Vectorizer ###
### SVD Components ###
n_comp = 20

# One entry per text column: (column, TF-IDF n-gram range, prefix of the output columns).
# The free-text columns use word uni+bi-grams; the tags use unigrams only and the
# historical 'svd_char_' prefix.
svd_specs = [(text_col, (1, 2), 'svd_') for text_col in ['channelTitle','description','title','all_text']]#,'new_title','new_description',
svd_specs.append(('new_tags', (1, 1), 'svd_char_'))

# TF-IDF -> truncated SVD (n_comp components) per column; the components are appended to df
# as '<prefix><column><1..n_comp>'. Relies on df having a default 0..n-1 index for the concat.
for i, ngram_range, prefix in svd_specs:
    print (i)
    tfidf_vec = TfidfVectorizer(analyzer='word',ngram_range=ngram_range)
    text_tfidf = tfidf_vec.fit_transform(df[i].values.tolist() )
    text_svd = TruncatedSVD(n_components=n_comp, algorithm='arpack',random_state=9999)
    df_svd = pd.DataFrame(text_svd.fit_transform(text_tfidf))
    df_svd.columns = [prefix+str(i)+str(j+1) for j in range(n_comp)]
    df = pd.concat([df,df_svd],axis=1)

channelTitle
['00', '007004ma1', '0094592', '009eel', '009gabry007', '00motivation', '00r', '00r s00', '00throne', '01', '0120baseball', '012hggg', '01msumoto', '0213033', '0305tokusan', '036', '036 alfred', '04', '04 limited', '0515s', '0557mohm1', '0707jh', '0714honey0701', '072edman', '08', '0823305500', '08haqua', '08team', '0943472307', '09809098', '09sourapple', '09team', '09xharle', '0angel0angel0', '0bach0', '0rch1d', '0shams', '0slkd2', '1000piazzia', '100maaya', '100ro', '100均', '100均 コスメ', '1010', '101seibu', '1039knsk', '1080pinki', '10feetvevo', '10goqstudio', '1101yasuhiro', '1103fo', '1116yu', '113icecream', '113icecream japanese', '117cmvol1', '117cmvol2', '117cmvol3', '117cmvol4', '1192taiyaki', '11masaka11', '11qtd529', '12', '12 networks', '1200', '1200nori', '1204120', '12129momo', '1217disney', '1224smapsmap', '1225', '123', '1234totechteta', '1325aaa', '1337', '1385hiro', '139', '13ender', '13luedragon17', '1412nes', '141fumi', '1422singirl', '147db', '1484tv', '1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



CPU times: user 58.2 s, sys: 3.27 s, total: 1min 1s
Wall time: 1min 3s


In [8]:
%%time
from tqdm import tqdm
from scipy import stats
# Statistics accepted after the 'rolling_' prefix and before the '_diff' / '_ratio' suffixes.
_ROLLING_STATS = ('sum', 'mean', 'max', 'min', 'median', 'std')
_BASE_STATS = ('mean', 'max', 'min', 'median')


def _first_mode(values):
    """Return the most frequent value of ``values`` (smallest on ties, NaN if there is none)."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else float('nan')


def _agg_feature(df, c):
    """Compute the row-aligned feature Series described by one spec dict ``c`` (see ``agg``)."""
    how = c['agg']
    target = c['target']
    by_key = df.groupby(c['groupby'])

    # cumcount does not look at the target column at all.
    if isinstance(how, str) and how == 'cumcount':
        return by_key.cumcount()

    grouped = by_key[target]

    # Anything that is not a string (e.g. a callable) goes straight to transform().
    if not isinstance(how, str):
        return grouped.transform(how)

    # --- order-dependent (time-series style) features -------------------------
    if how == 'diff':
        return grouped.transform(lambda x: x.diff(c['para1']).shift(c['para2']))
    if how == 'lag':
        return grouped.shift(c['para1'])
    if how.startswith('rolling_') and how[len('rolling_'):] in _ROLLING_STATS:
        stat = how[len('rolling_'):]
        return grouped.transform(
            lambda x: getattr(x.rolling(c['para1'], min_periods=1), stat)().shift(c['para2']))
    if how in ('cumsum', 'cummax', 'cummin'):
        return grouped.transform(lambda x: getattr(x, how)())
    if how == 'cummean':
        return grouped.transform(lambda x: x.cumsum()) / (by_key.cumcount() + 1)

    # --- group statistics broadcast back to every row --------------------------
    if how == 'trim_mean':
        return grouped.transform(lambda x: stats.trim_mean(x, 0.1))
    if how == 'trim_mean_diff':
        return grouped.transform(lambda x: stats.trim_mean(x, 0.1)) - df[target]
    if how == 'max_min_diff':
        return grouped.transform('max') - grouped.transform('min')
    if how == 'max_min_ratio':
        return grouped.transform('max') / (1 + grouped.transform('min'))
    if how == 'mode':
        # The previous implementation produced one value per group and re-indexed it
        # from 0, so the modes landed on the first rows of df instead of on the rows
        # of their own group. transform() keeps the result aligned with df.
        return grouped.transform(_first_mode)

    # '<stat>_diff' = group stat - row value; '<stat>_ratio' = group stat / (1 + row value)
    stat, _, kind = how.rpartition('_')
    if stat in _BASE_STATS and kind == 'diff':
        return grouped.transform(stat) - df[target]
    if stat in _BASE_STATS and kind == 'ratio':
        return grouped.transform(stat) / (1 + df[target])

    # Anything else ('count', 'sum', 'mean', 'std', ...) is passed to transform().
    return grouped.transform(how)


def agg(df,agg_cols):
    """Add one group-wise feature column to ``df`` per spec in ``agg_cols`` (in place).

    Args:
        df: DataFrame to extend; it is mutated and nothing is returned.
        agg_cols: iterable of dicts with the keys
            ``'groupby'`` (list of key columns), ``'target'`` (column to aggregate)
            and ``'agg'`` (operation name). ``'diff'``, ``'lag'`` and the
            ``'rolling_*'`` operations additionally read ``'para1'`` (period / lag /
            window) and, except ``'lag'``, ``'para2'`` (shift applied afterwards).

    Each new column is named ``'<groupby keys joined by _>_<agg>_<target>'``.
    Operation names not handled explicitly are forwarded to ``GroupBy.transform``.
    """
    for c in tqdm(agg_cols):
        new_feature = '{}_{}_{}'.format('_'.join(c['groupby']), c['agg'], c['target'])
        df[new_feature] = _agg_feature(df, c)

def _build_agg_cols():
    """Build the ordered list of aggregation specs consumed by ``agg``.

    Layout (138 specs; the order fixes the order of the generated columns):
      * row counts per channel title and per category;
      * then, for 'likes' and again for 'dislikes':
          - the 16 statistics below for each of channelTitle_encoder, categoryId,
            year and tags_encoder;
          - mean_diff / mean_ratio for ratings_disabled and comments_disabled.
    """
    wide_groups = ['channelTitle_encoder', 'categoryId', 'year', 'tags_encoder']
    wide_stats = [
        'sum', 'mean', 'trim_mean', 'median', 'max', 'min', 'std',
        'mean_diff', 'mean_ratio', 'median_diff', 'median_ratio',
        'max_diff', 'max_ratio', 'min_diff', 'min_ratio', 'trim_mean_diff',
    ]
    flag_groups = ['ratings_disabled', 'comments_disabled']
    flag_stats = ['mean_diff', 'mean_ratio']

    # ############################ aggregation##################################
    specs = [
        {'groupby': [key], 'target': 'likes', 'agg': 'count'}
        for key in ('channelTitle_encoder', 'categoryId')
    ]
    for target in ('likes', 'dislikes'):
        for group_keys, stat_names in ((wide_groups, wide_stats), (flag_groups, flag_stats)):
            specs.extend(
                {'groupby': [key], 'target': target, 'agg': stat}
                for key in group_keys
                for stat in stat_names
            )
    return specs


agg_cols = _build_agg_cols()

agg(df,agg_cols)

100%|██████████| 138/138 [01:19<00:00,  1.73it/s]

CPU times: user 1min 15s, sys: 1.74 s, total: 1min 17s
Wall time: 1min 19s





In [9]:
%%time
import lightgbm as lgb
import xgboost as xgb
# import catboost as cat
import pickle
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import svm, neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.metrics import mean_absolute_error,mean_squared_error

def preprocess(train_df,test_df,feats):
    """Sanitise and standardise the feature columns of the train and test frames.

    Infinite values are turned into NaN and all NaN are filled with 0 (on copies;
    the inputs are left untouched). ``feats`` is then standardised with a
    ``StandardScaler`` fitted on the training frame only.

    Note: defining this rebinds the name ``preprocess`` that the text-cleaning
    cell earlier in the notebook also uses.

    Args:
        train_df: training DataFrame.
        test_df: test DataFrame.
        feats: list of feature column names to scale and return.

    Returns:
        tuple: ``(train_features, test_features)`` restricted to ``feats``.
    """
    def _sanitise(frame):
        # replace() and fillna() both return new frames
        return frame.replace([np.inf, -np.inf], np.nan).fillna(0)

    train_clean = _sanitise(train_df)
    test_clean = _sanitise(test_df)

    scaler = StandardScaler()
    train_clean[feats] = scaler.fit_transform(train_clean[feats])
    test_clean[feats] = scaler.transform(test_clean[feats])

    return train_clean[feats], test_clean[feats]

def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** .5

def target_encoder_kfold(train_df,test_df,col,target,method):
    """Out-of-fold target encoding of the categorical column ``col``.

    For every fold, the per-category statistic of ``target`` is computed on the
    training part only and mapped onto the validation rows, so no row is encoded
    with its own target. Test rows are encoded with the statistic computed on the
    whole of ``train_df``. Categories unseen in the fitting part map to NaN.

    The splitter is the notebook-level ``folds`` object; it is called as
    ``folds.split(train_df, train_df['y_bin'])``.

    Args:
        train_df: training frame containing ``col``, ``target`` and ``'y_bin'``.
        test_df: test frame containing ``col``.
        col: categorical column to encode.
        target: column whose statistic is used as the encoding.
        method: ``'mean'`` (1 % trimmed mean), ``'median'``, ``'max'`` or ``'min'``.

    Returns:
        tuple: ``(oof_preds, sub_preds)`` — a NumPy array aligned with ``train_df``
        rows and a Series aligned with ``test_df``.

    Raises:
        ValueError: if ``method`` is not one of the supported names (this case used
            to return all-zero encodings silently).
    """
    supported = ('mean', 'median', 'max', 'min')
    if method not in supported:
        raise ValueError('method must be one of {}, got {!r}'.format(supported, method))

    def _category_stat(frame):
        """Per-category statistic of `target` within `frame`, indexed by category."""
        grouped = frame.groupby(col)[target]
        if method == 'mean':
            return grouped.apply(lambda x: stats.trim_mean(x, 0.01))
        # 'median' / 'max' / 'min' map directly onto the GroupBy method of the same
        # name. (The fold loop used to call .median() for 'max' and 'min' as well.)
        return getattr(grouped, method)()

    oof_preds = np.zeros(train_df.shape[0])
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_df['y_bin'])):
        print ('FOLD:' + str(n_fold))
        train_x = train_df.iloc[train_idx]
        valid_x = train_df.iloc[valid_idx]
        oof_preds[valid_idx] = valid_x[col].map(_category_stat(train_x))

    sub_preds = test_df[col].map(_category_stat(train_df))
    return oof_preds,sub_preds

def lgb_kfold(train_df,test_df,features,target,cat_features,folds,params,use_pseudo=False,sampling=False):
    """Train one LightGBM model per fold and collect OOF / test predictions.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame holding `features` (and `target` for
        train_df). Both frames are modified in place: a 'lgb_y' column with
        the predictions is added.
    features : list of feature column names.
    target : name of the target column.
    cat_features : passed to `lgb.Dataset(categorical_feature=...)`.
    folds : splitter exposing `split(X, y)` and `n_splits`.
    params : LightGBM parameter dict passed to `lgb.train`.
    use_pseudo : when exactly True, rows of the module-level `pseudo` frame
        are appended to each fold's training data (see comment below).
    sampling : when exactly True, training rows with 'y_residual' >= 3.2 are
        dropped from each fold's training part; validation rows are kept.

    Returns
    -------
    (train_df, test_df, cv) where `cv` is the RMSE of the full OOF vector.
    """
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    cv_list = []
    # NOTE(review): the feature frame is also passed as `y`. A plain KFold
    # ignores `y`, so this only matters if a stratified splitter is supplied.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[features], train_df[features])):
        print ('FOLD:' + str(n_fold))

        train_x, train_y = train_df[features].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[features].iloc[valid_idx], train_df[target].iloc[valid_idx]

        # remove outliers from the training part only
        if sampling is True:
            train_df_new = train_df.iloc[train_idx]
            train_df_new = train_df_new[train_df_new['y_residual']<3.2]#3
            train_x, train_y = train_df_new[features], train_df_new[target]

        # pseudo labelling: the whole `pseudo` frame is appended first, then
        # the validation part of the pseudo split whose index equals the
        # current fold is appended once more.
        # NOTE(review): this duplicates those rows - confirm it is intended.
        if use_pseudo is True:
            train_x = pd.concat([train_x ,pseudo[features]],axis=0)
            # pd.concat replaces Series.append, which pandas 2.0 removed.
            train_y = pd.concat([train_y, pseudo[target]], axis=0)
            for n, (pseudo_train_idx, pseudo_valid_idx) in enumerate(folds.split(pseudo[features], pseudo['y_bin'])):
                print ('PSEUDO FOLD:' + str(n))
                if n_fold == n:
                    train_x = pd.concat([train_x ,pseudo[features].iloc[pseudo_valid_idx]],axis=0)
                    train_y = pd.concat([train_y, pseudo[target].iloc[pseudo_valid_idx]], axis=0)
                    break

        print ('train_x shape:',train_x.shape,train_y.mean())
        print ('valid_x shape:',valid_x.shape,valid_y.mean())

        dtrain = lgb.Dataset(train_x, label=train_y,categorical_feature=cat_features)
        dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)
        # NOTE(review): `verbose_eval` / `early_stopping_rounds` keyword
        # arguments are believed to be gone in LightGBM >= 4.0 (callbacks
        # replace them) - confirm against the installed version.
        bst = lgb.train(params, dtrain, num_boost_round=50000,
            valid_sets=[dval,dtrain], verbose_eval=500,early_stopping_rounds=500, )

        # Show the 30 features with the highest gain importance.
        new_list = sorted(zip(features, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)[:30]
        for item in new_list:
            print (item)

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)
        oof_cv = rmse(valid_y,  oof_preds[valid_idx])
        cv_list.append(oof_cv)
        print (cv_list)
        # Test prediction is the average over the fold models.
        sub_preds += bst.predict(test_df[features], num_iteration=bst.best_iteration) / folds.n_splits

    cv = rmse(train_df[target],  oof_preds)
    print('Full OOF RMSE %.6f' % cv)

    train_df['lgb_y'] = oof_preds
    test_df['lgb_y'] = sub_preds

    return train_df,test_df,cv

def lgb_kfold2(X_train,X_test,train_df,test_df,features,target,cat_features,folds,params,use_pseudo=False,sampling=False):
    """K-fold LightGBM training on pre-built feature matrices.

    Unlike `lgb_kfold`, the model inputs are the positional-indexable
    matrices `X_train` / `X_test`; `train_df` only supplies the target and
    the 'channelTitle' column handed to the splitter as `y`.

    `features`, `use_pseudo` and `sampling` are accepted but not used here.

    Side effects: a 'prediction' column is written into `train_df` (OOF
    predictions) and `test_df` (fold-averaged predictions).

    Returns
    -------
    (train_df, test_df, cv) where `cv` is the RMSE over all OOF predictions.
    """
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_scores = []
    y_all = train_df[target]

    fold_iter = folds.split(X_train, train_df['channelTitle'])
    for fold_no, (fit_rows, hold_rows) in enumerate(fold_iter):
        print ('FOLD:' + str(fold_no))

        fit_x = X_train[fit_rows,:]
        fit_y = y_all.iloc[fit_rows]
        hold_x = X_train[hold_rows,:]
        hold_y = y_all.iloc[hold_rows]

        print ('train_x shape:',fit_x.shape,fit_y.mean())
        print ('valid_x shape:',hold_x.shape,hold_y.mean())

        fit_set = lgb.Dataset(fit_x, label=fit_y,categorical_feature=cat_features)
        hold_set = lgb.Dataset(hold_x, label=hold_y,reference=fit_set,categorical_feature=cat_features)
        booster = lgb.train(
            params,
            fit_set,
            num_boost_round=50000,
            valid_sets=[hold_set,fit_set],
            verbose_eval=500,
            early_stopping_rounds=500,
        )

        # Score this fold on its held-out rows.
        oof_preds[hold_rows] = booster.predict(hold_x, num_iteration=booster.best_iteration)
        fold_scores.append(rmse(hold_y,  oof_preds[hold_rows]))
        print (fold_scores)

        # Each fold model contributes an equal share of the test prediction.
        sub_preds += booster.predict(X_test, num_iteration=booster.best_iteration) / folds.n_splits

    cv = rmse(y_all,  oof_preds)
    print('Full OOF RMSE %.6f' % cv)

    train_df['prediction'] = oof_preds
    test_df['prediction'] = sub_preds

    return train_df,test_df,cv

# LightGBM settings handed to `lgb.train` by the k-fold helpers.
params = dict(
    # task
    objective="regression",
    boosting="gbdt",
    metric="rmse",
    # tree shape / split constraints
    max_depth=-1,
    min_data_in_leaf=10,
    min_gain_to_split=0.01,
    min_child_weight=0.001,
    # regularisation
    reg_alpha=0.1,
    reg_lambda=1,
    num_leaves=31,
    max_bin=300,
    learning_rate=0.05,
    # row / column subsampling
    bagging_fraction=0.9,
    bagging_freq=1,
    bagging_seed=4590,
    feature_fraction=0.6,
    # misc
    verbosity=-1,
    boost_from_average=False,
)

# Fit a comment_count model on rows with comments_disabled_encoder == 0 and
# predict it for rows with comments_disabled_encoder == 1.
# NOTE(review): presumably 0 = comments enabled (label-encoded 'False') - confirm.
# Explicit copies: both frames get columns assigned below and inside
# lgb_kfold, which must not happen on a boolean-mask slice of `df`.
train_df = df[df['comments_disabled_encoder']==0].copy()
test_df = df[df['comments_disabled_encoder']==1].copy()
# The model is trained on log1p(comment_count); predictions are mapped back
# with expm1 further down.
train_df['comment_count'] = np.log1p(train_df['comment_count'])


# Columns that are never used as model inputs (raw text/ids, targets,
# earlier predictions, and the target of this model itself).
drop_features=[ 'channelId', 'channelTitle', 'c_date','collection_date','image','comments_ratings_disabled',
        'description', 'ratio_title_des','comments_disabled',
       'id',  'publishedAt',  'tags','new_tags','all_text','new_title', 'new_description',
       'thumbnail_link', 'title', 'video_id', 'y','y_bin','lgb_y','comments_ratings',  'comment_count',
      'ratings_disabled_encoder', 'comments_ratings_encoder',

              ]


features = [f for f in train_df.columns if f not in drop_features]
target = 'comment_count'
cat_features = [
]


# This rebinds the notebook-level `seed` and `folds` names.
seed = 817
folds = KFold(n_splits=5, shuffle=True, random_state=seed)

print ('numerical features:', len(features),features)#

train_lgb,test_lgb,cv = lgb_kfold(train_df,test_df,features,target,cat_features,folds,params,use_pseudo=False,sampling=False)

# Back to the original scale. `train_lgb` is the same object as `train_df`,
# so re-running only these lines would apply expm1 twice.
train_lgb['comment_count'] = np.expm1(train_lgb['comment_count'])
train_lgb['comment_count_pred'] = np.expm1(train_lgb['lgb_y'])
test_lgb['comment_count_pred'] = np.expm1(test_lgb['lgb_y'])

train_ = train_lgb[['video_id','comment_count_pred']]
test_ = test_lgb[['video_id','comment_count_pred']]
df_ = pd.concat([train_,test_],axis=0)
# A count cannot be negative: clamp predictions at zero.
df_['comment_count_pred'] = df_['comment_count_pred'].clip(lower=0)
df_[['video_id','comment_count_pred']].to_csv('../../data/input/probspace/comment_count_pred_0623.csv',index=False)

numerical features: 409 ['categoryId', 'likes', 'dislikes', 'ratings_disabled', 'channelId_encoder', 'channelTitle_encoder', 'collection_date_encoder', 'description_encoder', 'tags_encoder', 'comments_disabled_encoder', 'c_year', 'c_month', 'c_day', 'c_dayofweek', 'year', 'month', 'weekofyear', 'day', 'dayofweek', 'hour', 'minute', 'seconds_from_publish', 'days_from_publish', 'months_from_publish', 'years_from_publish', 'days_from_publish_start', 'days_from_cdate_start', 'diff_likes_dislikes', 'ratio_likes_dislikes', 'likes_by_day', 'dislikes_by_day', 'likes_by_month', 'dislikes_by_month', 'likes_by_year', 'dislikes_by_year', 'channelTitle_num_cap', 'channelTitle_num_low', 'channelTitle_num_dig', 'channelTitle_num_engdig', 'channelTitle_isja', 'channelTitle_isalpha', 'channelTitle_num_pun', 'channelTitle_num_space', 'channelTitle_num_chars', 'channelTitle_num_words', 'channelTitle_num_unique_words', 'channelTitle_ratio_unique_words', 'channelTitle_num_stopwords', 'channelTitle_num_word