In [15]:
%%time
import pandas as pd
import numpy as np
import statistics
import os
import random
import MeCab 
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
pd.set_option("display.precision", 8)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
#     tf.random.set_seed(seed)
    


# seed
seed = 817
seed_everything(seed)

# load train test
train = pd.read_csv('../../data/input/probspace/train_data.csv')
train['y_bin'] = pd.cut(train['y'], [0, 10, 100,1000,10000,100000,1000000,10000000000], labels=[1,2,3,4,5,6,7])
train['y_bin'] = train['y_bin'].astype(int)
test = pd.read_csv('../../data/input/probspace/test_data.csv')
df = pd.concat([train,test],axis=0).reset_index(drop=True)
print ('train',train.shape)
print ('test',test.shape)
df['comments_ratings'] = df['comments_disabled'].astype(str)+df['ratings_disabled'].astype(str)
    
for c in ['channelId','channelTitle','collection_date','description','tags','comments_disabled','ratings_disabled','comments_ratings']:
    lbl = LabelEncoder()
    df[c+'_encoder'] = lbl.fit_transform(df[c].astype(str))
    
df['length_title'] = [len(d) for d in df['title']]
    
# use predicted dislikes,likes,comment_out
dislikes_pred = pd.read_csv('../../data/input/probspace/dislikes_pred_0623.csv')
likes_pred = pd.read_csv('../../data/input/probspace/likes_pred_0623.csv')
comments_pred = pd.read_csv('../../data/input/probspace/comment_count_pred_0623.csv')
df = df.merge(dislikes_pred,on=['video_id'],how='left')
df = df.merge(likes_pred,on=['video_id'],how='left')
df = df.merge(comments_pred,on=['video_id'],how='left')
df['diff_dislikes'] = df['dislikes'] - df['dislikes_pred']
df['diff_likes'] = df['likes'] - df['likes_pred']
df['diff_comments'] = df['comment_count'] - df['comment_count_pred']
df['original_dislikes'] = df['dislikes']
df['original_likes'] = df['likes']
df['original_comment_count'] = df['comment_count']
df.loc[df['ratings_disabled']==True,'dislikes'] = df.loc[df['ratings_disabled']==True,'dislikes_pred']
df.loc[df['ratings_disabled']==True,'likes'] = df.loc[df['ratings_disabled']==True,'likes_pred']
df.loc[df['comments_disabled']==True,'comment_count'] = df.loc[df['comments_disabled']==True,'comment_count_pred']

# timestamp transformation
df["c_date"] = "20" + df["collection_date"]
df["c_date"] = pd.to_datetime(df["c_date"], utc=True, format="%Y.%d.%m")
df["c_year"] = df["c_date"].dt.year
df["c_month"] = df["c_date"].dt.month
df["c_day"] = df["c_date"].dt.day
df["c_dayofweek"] = df["c_date"].dt.dayofweek

df["publishedAt"] = pd.to_datetime(df["publishedAt"],utc=True, format="%Y-%m-%d")
df["past"] = (df["publishedAt"].dt.floor("D").max() - df["publishedAt"].dt.floor("D")).dt.days
df["year"] = df["publishedAt"].dt.year
df["month"] = df["publishedAt"].dt.month
df["weekofyear"] = df["publishedAt"].dt.weekofyear
df["day"] = df["publishedAt"].dt.day
df["dayofweek"] = df["publishedAt"].dt.dayofweek
df["hour"] = df["publishedAt"].dt.hour
df["minute"] = df["publishedAt"].dt.minute    

# 公開されてからの収集するまで日数
df['seconds_from_publish'] = (df['c_date'] - df['publishedAt']).dt.seconds
df['days_from_publish'] = (df['c_date'] - df['publishedAt']).dt.days
df['months_from_publish'] = (df['c_date'] - df['publishedAt']).dt.days // 30
df['years_from_publish'] = (df['c_date'] - df['publishedAt']).dt.days // 365

# YouTubeが解説されてから公開までの日数 
df['days_from_publish_start'] = (df['publishedAt'] - df['publishedAt'].min()).dt.days
df['month_from_publish_start'] = (df['publishedAt'] - df['publishedAt'].min()).dt.days // 30
df['year_from_publish_start'] = (df['publishedAt'] - df['publishedAt'].min()).dt.days // 365

# データ収取を始めてからの日数
df['days_from_cdate_start'] = (df['c_date'] - df['c_date'].min()).dt.days

# youtubeができてからの1日あたりのlike数、dislike数、コメント数
df['like_per_published_day'] = (df['likes'] / df['days_from_publish_start']).replace([np.inf, -np.inf], 0)
df['dislike_per_published_day'] = (df['dislikes'] / df['days_from_publish_start']).replace([np.inf, -np.inf], 0)
df['comment_count_per_published_day'] = (df['comment_count'] / df['days_from_publish_start']).replace([np.inf, -np.inf], 0)

# youtubeができてからの1ヶ月あたりのlike数、dislike数、コメント数
df['like_per_published_month'] = (df['likes'] / df['month_from_publish_start']).replace([np.inf, -np.inf], 0)
df['dislike_per_published_month'] = (df['dislikes'] / df['month_from_publish_start']).replace([np.inf, -np.inf], 0)
df['comment_count_per_published_month'] = (df['comment_count'] / df['month_from_publish_start']).replace([np.inf, -np.inf], 0)

# youtubeができてからの1年あたりのlike数、dislike数、コメント数
df['like_per_published_year'] = (df['likes'] / df['year_from_publish_start']).replace([np.inf, -np.inf], 0)
df['dislike_per_published_year'] = (df['dislikes'] / df['year_from_publish_start']).replace([np.inf, -np.inf], 0)
df['comment_count_per_published_year'] = (df['comment_count'] / df['year_from_publish_start']).replace([np.inf, -np.inf], 0)

# interaction 
df['comments_disabled'] = df['comments_disabled'].map(lambda x:1 if x==True else 0)
df['ratings_disabled'] = df['ratings_disabled'].map(lambda x:1 if x==True else 0)

df['likes_comments'] = df['likes'] * df['comments_disabled']
df['dislikes_comments'] = df['dislikes'] * df['comments_disabled']
df['comment_count_ratings'] = df['comment_count'] * df['ratings_disabled']

df['comments_ratings_disabled'] = df['comments_disabled'] + df['ratings_disabled']
df['diff_likes_dislikes'] = df['likes'] - df['dislikes'] 
df['ratio_likes_dislikes'] = df['likes'] / (df['dislikes'] + 1)
df['ratio_likes_comment_count'] = df['likes'] / (df['comment_count'] + 1)
df['ratio_dislikes_comment_count'] = df['dislikes'] / (df['comment_count'] + 1)

# 1日あたりのlike数、dislike数、コメント数
df['likes_by_day'] = df['likes'] / df['days_from_publish']
df['dislikes_by_day'] = df['dislikes'] / df['days_from_publish']
df['comments_by_day'] = df['comment_count'] / df['days_from_publish']

# 1ヶ月あたりのlike数、dislike数、コメント数
df['likes_by_month'] = df['likes'] / df['months_from_publish']
df['dislikes_by_month'] = df['dislikes'] / df['months_from_publish']
df['comments_by_month'] = df['comment_count'] / df['months_from_publish']

# 1年あたりのlike数、dislike数、コメント数
df['likes_by_year'] = df['likes'] / df['years_from_publish']
df['dislikes_by_year'] = df['dislikes'] / df['years_from_publish']
df['comments_by_year'] = df['comment_count'] / df['years_from_publish']

# 標準得点
df['likes_std_score'] = (df['likes'] - df['likes'].mean()) / statistics.stdev(list(df['likes']))
df['dislikes_std_score'] = (df['dislikes'] - df['dislikes'].mean()) / statistics.stdev(list(df['dislikes']))
df['commentss_std_score'] = (df['comment_count'] - df['comment_count'].mean()) / statistics.stdev(list(df['comment_count']))

print ('df',df.shape)
display(df.head())
display(df.columns.values)

train (19720, 18)
test (29582, 16)
df (49302, 87)


Unnamed: 0,id,video_id,title,publishedAt,channelId,channelTitle,categoryId,collection_date,tags,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,y,y_bin,comments_ratings,channelId_encoder,channelTitle_encoder,collection_date_encoder,description_encoder,tags_encoder,comments_disabled_encoder,ratings_disabled_encoder,comments_ratings_encoder,length_title,dislikes_pred,likes_pred,comment_count_pred,diff_dislikes,diff_likes,diff_comments,original_dislikes,original_likes,original_comment_count,c_date,c_year,c_month,c_day,c_dayofweek,past,year,month,weekofyear,day,dayofweek,hour,minute,seconds_from_publish,days_from_publish,months_from_publish,years_from_publish,days_from_publish_start,month_from_publish_start,year_from_publish_start,days_from_cdate_start,like_per_published_day,dislike_per_published_day,comment_count_per_published_day,like_per_published_month,dislike_per_published_month,comment_count_per_published_month,like_per_published_year,dislike_per_published_year,comment_count_per_published_year,likes_comments,dislikes_comments,comment_count_ratings,comments_ratings_disabled,diff_likes_dislikes,ratio_likes_dislikes,ratio_likes_comment_count,ratio_dislikes_comment_count,likes_by_day,dislikes_by_day,comments_by_day,likes_by_month,dislikes_by_month,comments_by_month,likes_by_year,dislikes_by_year,comments_by_year,likes_std_score,dislikes_std_score,commentss_std_score
0,1,GDtyztIThRQ,[12] BGM Inazuma Eleven 3 - ~ライオコツト ダンジョン~,2011-01-09 05:50:33+00:00,UCQaNYC3dNvH8FqrEyK7hTJw,DjangoShiny,20,20.01.02,Inazuma|Eleven|Super|Once|bgm|ost|イナズマイレブン|Kyoui|no|Shinryakusha|sekai|he|chosen|challenge|to|the|world|anime|game|ds|music|soundtrack|background|t-pistonz+kmc|berryz|fire|blizzard|spark|bomber|ogre|rip|endou|endo|mark|goenji|kidou|fubuki|aki|kazemaru|someoka|kabeyama|alien|hiroto|midorikawa|song|themes|battle|ffi,114.0,0.0,7.0,https://i.ytimg.com/vi/GDtyztIThRQ/default.jpg,0,0,~ライオコツト ダンジョン~Inazuma Eleven 3 BGM Complete (Ripped by Tommy),29229.0,5.0,FalseFalse,7498,2223,5,28187,7094,0,0,0,42,4.37847386,69.14827722,10.9578061,-4.37847386,44.85172278,-3.9578061,0,114,7,2020-02-01 00:00:00+00:00,2020,2,1,5,3268,2011,1,1,9,6,5,50,65367,3309,110,9,2086,69,5,41,0.05465005,0.0,0.0033557,1.65217391,0.0,0.10144928,22.8,0.0,1.4,0.0,0.0,0.0,0,114.0,114.0,14.25,0.0,0.0344515,0.0,0.00211544,1.03636364,0.0,0.06363636,12.66666667,0.0,0.77777778,-0.12927468,-0.08150842,-0.05837709
1,2,m4H9s3GtTlQ,ねごと - メルシールー [Official Music Video],2012-07-23 03:00:09+00:00,UChMWDi-HBm5aS3jyRSaAWUA,ねごと Official Channel,10,20.08.02,ねごと|ネゴト|メルシールー|Re:myend|リマインド|Lightdentity|ライデンティティ|放課後ミッドナイターズ|ナースフル|竹清仁|ループ|シャープ|shrap ♯|蒼山幸子|澤村小夜子|藤咲佑|沙田瑞紀|Hello! Z|ex Negoto,2885.0,50.0,111.0,https://i.ytimg.com/vi/m4H9s3GtTlQ/default.jpg,0,0,http://www.negoto.com/全員平成生まれ、蒼山幸子（Vo＆Key)、沙田瑞紀（G)、藤咲佑（Ba）、澤村小夜子（Dr）からなるオルタナティブでファンタジックなロックを鳴らすガールズ4ピースバンド＜ねごと＞。2011年7月13日リリースの1st Full Album「ex Negoto」より「メルシールー」。孤独になってしまったとき、胸の奥にしまっていた大切なことを思い出せたら、すべてがまぶしく見える、そんな瞬間を描いた歌詞と相成る楽曲はシンセ音がエッセンスとなったライブでも盛り上がり必至のナンバー。,730280.0,6.0,FalseFalse,12289,15874,12,26010,29252,0,0,0,35,64.04819316,3159.78843121,102.95138114,-14.04819316,-274.78843121,8.04861886,50,2885,111,2020-02-08 00:00:00+00:00,2020,2,8,5,2707,2012,7,30,23,0,3,0,75591,2755,91,7,2646,88,7,48,1.09032502,0.01889645,0.04195011,32.78409091,0.56818182,1.26136364,412.14285714,7.14285714,15.85714286,0.0,0.0,0.0,0,2835.0,56.56862745,25.75892857,0.44642857,1.04718693,0.01814882,0.04029038,31.7032967,0.54945055,1.21978022,412.14285714,7.14285714,15.85714286,-0.10644434,-0.07668421,-0.05324998
2,3,z19zYZuLuEU,VF3tb 闇よだれvsちび太 (SEGA),2007-07-26 13:54:09+00:00,UCBdcyoZSt5HBLd_n6we-xIg,siropai,24,20.14.01,VF3|VF4|VF5|ちび太|闇よだれ|chibita|virtuafighter|sega|バーチャ,133.0,17.0,14.0,https://i.ytimg.com/vi/z19zYZuLuEU/default.jpg,0,0,Beat-tribe cup finalhttp://ameblo.jp/siropai/,80667.0,5.0,FalseFalse,3392,14185,16,5711,16068,0,0,0,22,6.45271986,85.10976147,21.57864536,10.54728014,47.89023853,-7.57864536,17,133,14,2020-01-14 00:00:00+00:00,2020,1,14,1,4531,2007,7,30,26,3,13,54,36351,4554,151,12,823,27,2,23,0.16160389,0.02065614,0.01701094,4.92592593,0.62962963,0.51851852,66.5,8.5,7.0,0.0,0.0,0.0,0,116.0,7.38888889,8.86666667,1.13333333,0.02920509,0.00373298,0.00307422,0.8807947,0.11258278,0.09271523,11.08333333,1.41666667,1.16666667,-0.12911814,-0.07986818,-0.058032
3,4,pmcIOsL7s98,free frosty weekend!,2005-05-15 02:38:43+00:00,UC7K5am1UAQEsCRhzXpi9i1g,Jones4Carrie,22,19.22.12,frosty,287.0,51.0,173.0,https://i.ytimg.com/vi/pmcIOsL7s98/default.jpg,0,0,I look so bad but look at me!,34826.0,5.0,FalseFalse,2243,3922,0,11020,20320,0,0,0,20,26.44158934,427.02746361,125.20247855,24.55841066,-140.02746361,47.79752145,51,287,173,2019-12-22 00:00:00+00:00,2019,12,22,6,5333,2005,5,19,15,6,2,38,76877,5333,177,14,20,0,0,0,14.35,2.55,8.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,236.0,5.51923077,1.64942529,0.29310345,0.05381586,0.0095631,0.03243953,1.62146893,0.28813559,0.97740113,20.5,3.64285714,12.35714286,-0.12784933,-0.07658772,-0.05019344
4,5,ZuQgsTcuM-4,トップ・オブ・ザ・ワールド,2007-09-09 09:52:47+00:00,UCTW1um4R-QWa8iIfITGvlZQ,Tatsuya Maruyama,10,20.08.01,ギター|guitar|南澤大介|トップオブザワールド|トップ|オブ|ワールド|カーペンターズ|クラシックギター|ソロギターのしらべ|まるちゃん0208|まるやまたつや,178.0,6.0,17.0,https://i.ytimg.com/vi/ZuQgsTcuM-4/default.jpg,0,0,ソロギターのしらべより「トップオブザワールド」です。クラシックギターで弾いてます。Official Website 【http://maruyama-tatsuya.jimdo.com/】Twitter【https://twitter.com/TatsuyaMaruyama】,172727.0,6.0,FalseFalse,8306,7711,11,34917,30615,0,0,0,13,11.01195976,248.24972466,23.09445616,-5.01195976,-70.24972466,-6.09445616,6,178,17,2020-01-08 00:00:00+00:00,2020,1,8,2,4486,2007,9,36,9,6,9,52,50833,4503,150,12,868,28,2,17,0.20506912,0.00691244,0.01958525,6.35714286,0.21428571,0.60714286,89.0,3.0,8.5,0.0,0.0,0.0,0,172.0,25.42857143,9.88888889,0.33333333,0.0395292,0.00133245,0.00377526,1.18666667,0.04,0.11333333,14.83333333,0.5,1.41666667,-0.12874738,-0.08092951,-0.0578841


array(['id', 'video_id', 'title', 'publishedAt', 'channelId',
       'channelTitle', 'categoryId', 'collection_date', 'tags', 'likes',
       'dislikes', 'comment_count', 'thumbnail_link', 'comments_disabled',
       'ratings_disabled', 'description', 'y', 'y_bin',
       'comments_ratings', 'channelId_encoder', 'channelTitle_encoder',
       'collection_date_encoder', 'description_encoder', 'tags_encoder',
       'comments_disabled_encoder', 'ratings_disabled_encoder',
       'comments_ratings_encoder', 'length_title', 'dislikes_pred',
       'likes_pred', 'comment_count_pred', 'diff_dislikes', 'diff_likes',
       'diff_comments', 'original_dislikes', 'original_likes',
       'original_comment_count', 'c_date', 'c_year', 'c_month', 'c_day',
       'c_dayofweek', 'past', 'year', 'month', 'weekofyear', 'day',
       'dayofweek', 'hour', 'minute', 'seconds_from_publish',
       'days_from_publish', 'months_from_publish', 'years_from_publish',
       'days_from_publish_start', 'month_fro

CPU times: user 2.93 s, sys: 599 ms, total: 3.53 s
Wall time: 4.52 s


In [16]:
%%time
import re
import string
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import unicodedata

class MecabTokenizer:
    def __init__(self):
        self.wakati = MeCab.Tagger('-Owakati')
        self.wakati.parse('')

    def tokenize(self, line):
        txt = self.wakati.parse(line)
        txt = txt.split()
        return txt
    
    def mecab_tokenizer(self, line):
        node = self.wakati.parseToNode(line)
        keywords = []
        while node:
            if node.feature.split(",")[0] == "名詞" or node.feature.split(",")[0] == "形容詞":
                keywords.append(node.surface)
            node = node.next
        return keywords    
    
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


html_tags = ['<p>', '</p>', '<table>', '</table>', '<tr>', '</tr>', '<ul>', '<ol>', '<dl>', '</ul>', '</ol>',
             '</dl>', '<li>', '<dd>', '<dt>', '</li>', '</dd>', '</dt>', '<h1>', '</h1>',
             '<br>', '<br/>', '<strong>', '</strong>', '<span>', '</span>', '<blockquote>', '</blockquote>',
             '<pre>', '</pre>', '<div>', '</div>', '<h2>', '</h2>', '<h3>', '</h3>', '<h4>', '</h4>', '<h5>', '</h5>',
             '<h6>', '</h6>', '<blck>', '<pr>', '<code>', '<th>', '</th>', '<td>', '</td>', '<em>', '</em>']

empty_expressions = ['&lt;', '&gt;', '&amp;', '&nbsp;', 
                     '&emsp;', '&ndash;', '&mdash;', '&ensp;'
                     '&quot;', '&#39;']

other = ['span', 'style', 'href', 'input']


def pre_preprocess(x):
    return str(x).lower()

def rm_spaces(text):
    spaces = ['\u200b', '\u200e', '\u202a', '\u2009', '\u2028', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\u3000', '\x10', '\x7f', '\x9d', '\xad',
              '\x97', '\x9c', '\x8b', '\x81', '\x80', '\x8c', '\x85', '\x92', '\x88', '\x8d', '\x80', '\x8e', '\x9a', '\x94', '\xa0', 
              '\x8f', '\x82', '\x8a', '\x93', '\x90', '\x83', '\x96', '\x9b', '\x9e', '\x99', '\x87', '\x84', '\x9f',
             ]
    for space in spaces:
            text = text.replace(space, ' ')
    return text

def remove_urls(x):
    x = re.sub(r'(https?://[a-zA-Z0-9.-]*)', r'', x)

    # original
    x = re.sub(r'(quote=\w+\s?\w+;?\w+)', r'', x)
    return x

def clean_html_tags(x, stop_words=[]):      
    for r in html_tags:
        x = x.replace(r, '')
    for r in empty_expressions:
        x = x.replace(r, ' ')
    for r in stop_words:
        x = x.replace(r, '')
    return x

def replace_num(text):
    text = re.sub('[0-9]{5,}', '', text)
    text = re.sub('[0-9]{4}', '', text)
    text = re.sub('[0-9]{3}', '', text)
    text = re.sub('[0-9]{2}', '', text)
    return text

def get_url_num(x):
    pattern = "https?://[\w/:%#\$&\?\(\)~\.=\+\-]+"
    urls = re.findall(pattern, x)
    return len(urls)


def clean_puncts(x):
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

#zenkaku = '０,１,２,３,４,５,６,７,８,９,（,）,＊,「,」,［,］,【,】,＜,＞,？,・,＃,＠,＄,％,＝'.split(',')
#hankaku = '0,1,2,3,4,5,6,7,8,9,q,a,z,w,s,x,c,d,e,r,f,v,b,g,t,y,h,n,m,j,u,i,k,l,o,p'.split(',')

def clean_text_jp(x):
    x = x.replace('。', '')
    x = x.replace('、', '')
    x = x.replace('\n', '') # 改行削除
    x = x.replace('\t', '') # タブ削除
    x = x.replace('\r', '')
    x = re.sub(re.compile(r'[!-\/:-@[-`{-~]'), ' ', x) 
    x = re.sub(r'\[math\]', ' LaTex math ', x) # LaTex削除
    x = re.sub(r'\[\/math\]', ' LaTex math ', x) # LaTex削除
    x = re.sub(r'\\', ' LaTex ', x) # LaTex削除   
    #for r in zenkaku+hankaku:
    #    x = x.replace(str(r), '')
    x = re.sub(' +', ' ', x)
    return x


def preprocess(data):
    data = data.apply(lambda x: pre_preprocess(x))
    data = data.apply(lambda x: rm_spaces(x))
    data = data.apply(lambda x: remove_urls(x))
    data = data.apply(lambda x: clean_puncts(x))
   # data = data.apply(lambda x: replace_num(x))
    data = data.apply(lambda x: clean_html_tags(x, stop_words=other))
    data = data.apply(lambda x: clean_text_jp(x))
    return data    

def count_regexp_occ(regexp="", text=None):
    """ Simple way to get the number of occurence of a regex"""
    return len(re.findall(regexp, text))

def is_japanese(string):
    for ch in string:
        try:
            name = unicodedata.name(ch) 
            if "CJK UNIFIED" in name \
            or "HIRAGANA" in name \
            or "KATAKANA" in name:
                return True
        except:
          continue
    return False

stopwords = {x: 1 for x in stopwords.words('english')}
punct = set(string.punctuation)

df['new_tags'] = df['tags'].astype(str).apply(lambda x: x.replace('|',' '))
df['all_text'] =  (df['channelTitle'].fillna('') + ' ' + df['description'].fillna('') + ' ' + df['title'].fillna('')+ ' ' + df['new_tags'].fillna('')).astype(str)
df['all_text'] = preprocess(df['all_text'])
text_cols = ['channelTitle','description','title','new_tags','all_text']
for cols in text_cols:   
    df[cols] = df[cols].astype(str) 
    df[cols + '_num_cap'] = df[cols].apply(lambda x: count_regexp_occ('[A-Z]', x))
    df[cols + '_num_low'] = df[cols].apply(lambda x: count_regexp_occ('[a-z]', x))
    df[cols + '_num_dig'] = df[cols].apply(lambda x: count_regexp_occ('[0-9]', x))
    df[cols + '_num_engdig'] = df[cols].apply(lambda x: count_regexp_occ('[A-Za-z0-9]', x))    
    df[cols + '_isja'] = df[cols].apply(lambda x: 1 if is_japanese(x) else 0)
    df[cols + '_isalpha'] = df[cols].apply(lambda x: 1 if x.encode('utf-8').isalnum() else 0)
    
    df[cols + '_num_pun'] = df[cols].apply(lambda x: sum(c in punct for c in x))
    df[cols + '_num_space'] = df[cols].apply(lambda x: sum(c.isspace() for c in x))

    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    
    df[cols + '_ratio_unique_words'] = df[cols+'_num_unique_words'] / (df[cols+'_num_words']+1) # Count Unique Words    

    df[cols +'_num_stopwords'] = df[cols].apply(lambda x: len([w for w in x.split() if w in stopwords]))
    df[cols +'_num_words_upper'] = df[cols].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[cols +'_num_words_lower'] = df[cols].apply(lambda x: len([w for w in str(x).split() if w.islower()]))
    df[cols +'_num_words_title'] = df[cols].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
       
for cols in ['description']:  
    df[cols] = df[cols].astype(str)     
    df[cols + '_url_num'] = df[cols].apply(lambda x: get_url_num(x))      



CPU times: user 1min 7s, sys: 475 ms, total: 1min 7s
Wall time: 1min 9s


In [17]:

%%time
### TFIDF Vectorizer ###
### SVD Components ###
n_comp = 20

for i in ['channelTitle','description','title','all_text']:#,'new_title','new_description',
    print (i)
    tfidf_vec = TfidfVectorizer(analyzer='word',ngram_range=(1,2))
    text_tfidf = tfidf_vec.fit_transform(df[i].values.tolist() )
    text_svd = TruncatedSVD(n_components=n_comp, algorithm='arpack',random_state=9999)
    df_svd = pd.DataFrame(text_svd.fit_transform(text_tfidf))
    df_svd.columns = ['svd_'+str(i)+str(j+1) for j in range(n_comp)]
    df = pd.concat([df,df_svd],axis=1)
    
for i in ['new_tags',]:
    print (i)
    tfidf_vec = TfidfVectorizer(analyzer='word',ngram_range=(1,1))
    text_tfidf = tfidf_vec.fit_transform(df[i].values.tolist() )
    text_svd = TruncatedSVD(n_components=n_comp, algorithm='arpack',random_state=9999)
    df_svd = pd.DataFrame(text_svd.fit_transform(text_tfidf))
    df_svd.columns = ['svd_char_'+str(i)+str(j+1) for j in range(n_comp)]
    df = pd.concat([df,df_svd],axis=1)


channelTitle
description
title
all_text
new_tags
CPU times: user 1min 31s, sys: 6.59 s, total: 1min 38s
Wall time: 1min 18s


In [18]:
%%time
from tqdm import tqdm
from scipy import stats
def agg(df,agg_cols):
    for c in tqdm(agg_cols):
        new_feature = '{}_{}_{}'.format('_'.join(c['groupby']), c['agg'], c['target'])
        if c['agg'] == 'mean_diff':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('mean') - df[c['target']]
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'mean_ratio':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('mean') / (1+df[c['target']])
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'trim_mean':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform(lambda x: stats.trim_mean(x, 0.1))
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'trim_mean_diff':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform(lambda x: stats.trim_mean(x, 0.1)) - df[c['target']]
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'max_diff':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('max') - df[c['target']]
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'max_ratio':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('max') / (1+df[c['target']])
            df[new_feature+'rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'min_diff':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('min')- df[c['target']]
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'min_ratio':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('min')/ (1+df[c['target']])
            df[new_feature+'_rank'] = df.groupby(c['groupby'])[c['target']].rank()
        elif c['agg'] == 'max_min_diff':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('max') - df.groupby(c['groupby'])[c['target']].transform('min')
        elif c['agg'] == 'max_min_ratio':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('max') / (1+df.groupby(c['groupby'])[c['target']].transform('min'))             
        elif c['agg'] == 'median_diff':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('median') - df[c['target']]
        elif c['agg'] == 'median_ratio':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform('median') / (1+df[c['target']])    
        elif c['agg'] == 'mode':
            df[new_feature] = df.groupby(c['groupby'])[c['target']].apply(pd.Series.mode).reset_index(drop=True)        
        else:    
            df[new_feature] = df.groupby(c['groupby'])[c['target']].transform(c['agg'])
            
    return df

agg_cols = [

# ############################ aggregation##################################

    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'trim_mean'}, 
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'trim_mean_diff'},
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'max_diff'},
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'max_ratio'},
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'mean_ratio'},
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'mean_diff'}, 
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'min_diff'},     
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'min_ratio'},   
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'max_min_diff'},
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'max_min_ratio'},     
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'median_diff'},    
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'median_ratio'},
    {'groupby': ['categoryId'], 'target':'original_likes', 'agg':'mode'}, 
    
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'trim_mean'}, 
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'trim_mean_diff'},
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'max_diff'},
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'max_ratio'},
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'mean_ratio'},
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'min_diff'},     
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'min_ratio'},   
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'max_min_diff'},
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'max_min_ratio'},     
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'median_diff'},    
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'median_ratio'},
    {'groupby': ['categoryId'], 'target':'original_dislikes', 'agg':'mode'}, 
    
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'trim_mean'}, 
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'trim_mean_diff'},
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'max_diff'},
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'max_ratio'},
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'mean_ratio'},
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'mean_diff'}, 
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'min_diff'},     
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'min_ratio'},   
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'max_min_diff'},
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'max_min_ratio'},     
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'median_diff'},    
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'median_ratio'},
    {'groupby': ['categoryId'], 'target':'original_comment_count', 'agg':'mode'}, 
    
    {'groupby': ['year'], 'target':'original_likes', 'agg':'trim_mean'}, 
    {'groupby': ['year'], 'target':'original_likes', 'agg':'trim_mean_diff'},
    {'groupby': ['year'], 'target':'original_likes', 'agg':'max_diff'},
    {'groupby': ['year'], 'target':'original_likes', 'agg':'max_ratio'},
    {'groupby': ['year'], 'target':'original_likes', 'agg':'mean_ratio'},
    {'groupby': ['year'], 'target':'original_likes', 'agg':'mean_diff'}, 
    {'groupby': ['year'], 'target':'original_likes', 'agg':'min_diff'},     
    {'groupby': ['year'], 'target':'original_likes', 'agg':'min_ratio'},   
    {'groupby': ['year'], 'target':'original_likes', 'agg':'max_min_diff'},
    {'groupby': ['year'], 'target':'original_likes', 'agg':'max_min_ratio'},     
    {'groupby': ['year'], 'target':'original_likes', 'agg':'median_diff'},    
    {'groupby': ['year'], 'target':'original_likes', 'agg':'median_ratio'},
    {'groupby': ['year'], 'target':'original_likes', 'agg':'mode'}, 
    
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'trim_mean'}, 
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'trim_mean_diff'},
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'max_diff'},
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'max_ratio'},
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'mean_ratio'},
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'min_diff'},     
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'min_ratio'},   
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'max_min_diff'},
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'max_min_ratio'},     
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'median_diff'},    
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'median_ratio'},
    {'groupby': ['year'], 'target':'original_dislikes', 'agg':'mode'}, 
    
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'trim_mean'}, 
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'trim_mean_diff'},
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'max_diff'},
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'max_ratio'},
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'mean_ratio'},
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'mean_diff'}, 
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'min_diff'},     
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'min_ratio'},   
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'max_min_diff'},
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'max_min_ratio'},     
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'median_diff'},    
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'median_ratio'},
    {'groupby': ['year'], 'target':'original_comment_count', 'agg':'mode'}, 
    

    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'trim_mean'}, 
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'trim_mean_diff'},
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'max_diff'},
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'max_ratio'},
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'mean_ratio'},
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'min_diff'},     
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'min_ratio'},   
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'max_min_diff'},
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'max_min_ratio'},     
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'median_diff'},    
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'median_ratio'},
    {'groupby': ['channelTitle_encoder'], 'target':'diff_likes_dislikes', 'agg':'mode'}, 
    
  

    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'trim_mean'}, 
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'trim_mean_diff'},
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'max_diff'},
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'max_ratio'},
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'mean_ratio'},
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'min_diff'},     
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'min_ratio'},   
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'max_min_diff'},
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'max_min_ratio'},     
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'median_diff'},    
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'median_ratio'},
    {'groupby': ['categoryId'], 'target':'diff_likes_dislikes', 'agg':'mode'}, 
    
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'trim_mean'}, 
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'trim_mean_diff'},
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'max_diff'},
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'max_ratio'},
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'mean_ratio'},
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'min_diff'},     
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'min_ratio'},   
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'max_min_diff'},
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'max_min_ratio'},     
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'median_diff'},    
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'median_ratio'},
    {'groupby': ['year'], 'target':'diff_likes_dislikes', 'agg':'mode'}, 
     

 
    {'groupby': ['ratings_disabled'], 'target':'likes', 'agg':'mean_diff'}, 
    {'groupby': ['ratings_disabled'], 'target':'likes', 'agg':'mean_ratio'}, 
    
    {'groupby': ['ratings_disabled'], 'target':'dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['ratings_disabled'], 'target':'dislikes', 'agg':'mean_ratio'},    
    
  
    {'groupby': ['ratings_disabled'], 'target':'comment_count', 'agg':'mean_diff'}, 
    {'groupby': ['ratings_disabled'], 'target':'comment_count', 'agg':'mean_ratio'}, 
       
    {'groupby': ['comments_disabled'], 'target':'likes', 'agg':'mean_diff'}, 
    {'groupby': ['comments_disabled'], 'target':'likes', 'agg':'mean_ratio'}, 

    {'groupby': ['comments_disabled'], 'target':'dislikes', 'agg':'mean_diff'}, 
    {'groupby': ['comments_disabled'], 'target':'dislikes', 'agg':'mean_ratio'},    
       
    {'groupby': ['comments_disabled'], 'target':'comment_count', 'agg':'mean_diff'}, 
    {'groupby': ['comments_disabled'], 'target':'comment_count', 'agg':'mean_ratio'},     
    
   ]

df = agg(df,agg_cols)

100%|██████████| 129/129 [00:21<00:00,  5.98it/s]

CPU times: user 20.8 s, sys: 442 ms, total: 21.2 s
Wall time: 21.6 s





In [19]:
%%time
text_cols = ['channelTitle','description','title','new_tags']
for cols in text_cols:   
    df[cols] = df[cols].astype(str) 
 
    df[cols +'_music'] = df[cols].apply(lambda x: 1 if 'music' in x.lower() else 0)
    df[cols +'_official'] = df[cols].apply(lambda x: 1 if 'official' in x.lower() else 0)
    df[cols +'_ja_official'] = df[cols].apply(lambda x: 1 if '公式' in x else 0) 
    df[cols +'_cm'] = df[cols].apply(lambda x: 1 if 'cm' in x.lower() else 0)     
    df[cols +'_http'] = df[cols].apply(lambda x: 1 if 'http' in x.lower() else 0)    
    df[cols +'_movie'] = df[cols].apply(lambda x: 1 if 'movie' in x.lower() else 0)    
    df[cols +'_jp'] = df[cols].apply(lambda x: 1 if 'jp' in x.lower() else 0)     
    df[cols +'_youtube'] = df[cols].apply(lambda x: 1 if 'youtube' in x.lower() else 0)         
    df[cols +'_jp_movie'] = df[cols].apply(lambda x: 1 if '映画' in x else 0)      
    df[cols +'_jp_director'] = df[cols].apply(lambda x: 1 if '監督' in x else 0)       
    df[cols +'_jp_tohaku'] = df[cols].apply(lambda x: 1 if '東宝' in x else 0)

CPU times: user 3 s, sys: 42.3 ms, total: 3.04 s
Wall time: 3.17 s


In [28]:
from sklearn.model_selection import StratifiedKFold

df2 = df.copy()
train_df = df2[df2['y'].notnull()] 
train_df['y'] = np.log1p(train_df['y'])
test_df = df2[df2['y'].isnull()]


def drop_features(train_df):
    drop_cols = []
    for c in train_df.columns:
        if "like" not in c:
            drop_cols.append(c)
    
    cols = [f for f in train_df.columns if f not in drop_cols]
    likes = []
    for c in cols:
        if "dislike" not in c:
            likes.append(c)

    diff_likes_dislikes = []
    for c in cols:
        if c.count("like") == 2 and c.count("dislike") == 1:
            diff_likes_dislikes.append(c)
            
    features = likes + diff_likes_dislikes
    
    return features

seed = 817
folds = StratifiedKFold(n_splits=8, shuffle=True, random_state=seed)
target = 'y'

cat_features = [
    'categoryId',
    'channelTitle_encoder', 
    'comments_ratings_encoder',
    'c_month', 
    'year', 
]
features = drop_features(train_df) + cat_features

In [29]:
###### %%time
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    return (mean_squared_error(y_true, y_pred))** .5

def target_encoder_kfold(train_df,test_df,col,target,folds,method):
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, train_df['y_bin'])):
        print ('FOLD:' + str(n_fold))
        train_x = train_df.iloc[train_idx]
        valid_x = train_df.iloc[valid_idx] 
        if method == 'mean':
            oof_preds[valid_idx] = valid_x[col].map(train_x.groupby(col)[target].apply(lambda x: stats.trim_mean(x, 0.01)))
        if method == 'median':
            oof_preds[valid_idx] = valid_x[col].map(train_x.groupby(col)[target].median())   
        if method == 'max':
            oof_preds[valid_idx] = valid_x[col].map(train_x.groupby(col)[target].max())  
        if method == 'min':
            oof_preds[valid_idx] = valid_x[col].map(train_x.groupby(col)[target].min())              
    if method == 'mean':    
        sub_preds = test_df[col].map(train_df.groupby(col)[target].apply(lambda x: stats.trim_mean(x, 0.01)))
    if method == 'median':    
        sub_preds = test_df[col].map(train_df.groupby(col)[target].median())
    if method == 'max':    
        sub_preds = test_df[col].map(train_df.groupby(col)[target].max())
    if method == 'min':    
        sub_preds = test_df[col].map(train_df.groupby(col)[target].min())        
    return oof_preds,sub_preds


def lgb_k_fold(train_df, test_df, features, target, cat_features, folds, params, use_pseudo=False, sampling=False):
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    cv_list = []
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[features], train_df['y_bin'])):
        print ('FOLD:' + str(n_fold))
        
        train_x, train_y = train_df[features].iloc[train_idx], train_df[target].iloc[train_idx]
        valid_x, valid_y = train_df[features].iloc[valid_idx], train_df[target].iloc[valid_idx]
        
        # remove outliers
        if sampling is True:
            valid_x, valid_y = train_df[features].iloc[valid_idx], train_df[target].iloc[valid_idx] 
            train_df_new = train_df.iloc[train_idx]
            train_df_new = train_df_new[train_df_new['y_residual']<3.2]#3
            train_x, train_y = train_df_new[features], train_df_new[target]
            
       # pseudo 
        if use_pseudo is True:
            train_x = pd.concat([train_x, pseudo[features]], axis=0)
            train_y = train_y.append(pseudo[target])  
            for n, (pseudo_train_idx, pseudo_valid_idx) in enumerate(folds.split(pseudo[features], pseudo['y_bin'])):
                print ('PSEUDO FOLD:' + str(n))
                if n_fold == n:
                    train_x = pd.concat([train_x, pseudo[features].iloc[pseudo_valid_idx]],axis=0)
                    train_y = train_y.append(pseudo[target].iloc[pseudo_valid_idx] )
                    break
                    
        print ('train_x shape:',train_x.shape,train_y.mean())
        print ('valid_x shape:',valid_x.shape,valid_y.mean())
        
        dtrain = lgb.Dataset(train_x, label=train_y,categorical_feature=cat_features)
        dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)

        bst = lgb.train(params, dtrain, num_boost_round=50000,
            valid_sets=[dval,dtrain], verbose_eval=500,early_stopping_rounds=500, )

        # output feature importance
        feature_importances = sorted(zip(features, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)[:30]
        for fi in feature_importances:
            print("効果的な特徴量", fi)

        oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)
        oof_cv = rmse(valid_y,  oof_preds[valid_idx])
        cv_list.append(oof_cv)
        print (cv_list)
        sub_preds += bst.predict(test_df[features], num_iteration=bst.best_iteration) / folds.n_splits

    cv = rmse(train_df[target],  oof_preds)
    print('Full OOF RMSE %.6f' % cv)

    train_df['lgb_y'] = oof_preds
    test_df['lgb_y'] = sub_preds

    return train_df,test_df,cv

params = {
               "objective" : "regression", 
               "boosting" : "gbdt", 
               "metric" : "rmse",  
               "max_depth": -1,
               "min_data_in_leaf": 10, #10
               "min_gain_to_split": 0.01,#0.01
                "min_child_weight": 0.001,
                "reg_alpha": 0.1, 
                "reg_lambda": 1, #1
               "num_leaves" : 35, #40
               "max_bin" : 300,#300 
              "learning_rate" :0.01,
               "bagging_fraction" : 0.9,
               "bagging_freq" : 1,
               "bagging_seed" : 4590,
               "feature_fraction" : 0.6,#0.6
               "verbosity": -1,
               "boost_from_average": False,
}


# target encoder
train_df['categoryId_target_mean'],test_df['categoryId_target_mean'] = target_encoder_kfold(train_df,test_df,'categoryId',target,folds,'mean')     
train_df['ratings_disabled_target_mean'],test_df['ratings_disabled_target_mean'] = target_encoder_kfold(train_df,test_df,'ratings_disabled',target,folds,'mean')  
train_df['comments_disabled_target_mean'],test_df['comments_disabled_target_mean'] = target_encoder_kfold(train_df,test_df,'comments_disabled',target,folds,'mean') 
train_df['comments_ratings_disabled_target_mean'],test_df['comments_ratings_disabled_target_mean'] = target_encoder_kfold(train_df,test_df,'comments_ratings_disabled',target,folds,'mean')

cat_features = [
    'categoryId',
    'channelTitle_encoder', 
    'comments_ratings_encoder',
    'c_month', 
    'year', 
]



print ('numerical features:', len(features),features)# 

train_lgb,test_lgb,cv = lgb_k_fold(train_df,test_df,features,target,cat_features,folds,params,use_pseudo=False,sampling=False)

out_dir = "out_tmp"
!mkdir -p $out_dir

# submission
train_lgb[['id','lgb_y','y']].to_csv(f'./{out_dir}/likes/train_lgb_0623.csv',index=False)
test_lgb[['id','lgb_y']].to_csv(f'./{out_dir}/likes/test_lgb_0623.csv',index=False)

print(train_lgb[['y','lgb_y']].describe())
print(test_lgb['lgb_y'].describe())
test_lgb['y'] = np.expm1(test_lgb['lgb_y'])
print(test_lgb['y'].describe())
test_lgb[['id','y']].to_csv(f'./{out_dir}/likes/sub_lgb_0623_{cv}.csv',index=False)
test_lgb[['id','y']].head()


FOLD:0
FOLD:1
FOLD:2
FOLD:3
FOLD:4
FOLD:5
FOLD:6
FOLD:7
FOLD:0
FOLD:1
FOLD:2
FOLD:3
FOLD:4
FOLD:5
FOLD:6
FOLD:7
FOLD:0
FOLD:1
FOLD:2
FOLD:3
FOLD:4
FOLD:5
FOLD:6
FOLD:7
FOLD:0
FOLD:1
FOLD:2
FOLD:3
FOLD:4
FOLD:5
FOLD:6
FOLD:7
numerical features: 133 ['likes', 'likes_pred', 'diff_likes', 'original_likes', 'like_per_published_day', 'like_per_published_month', 'like_per_published_year', 'likes_comments', 'ratio_likes_comment_count', 'likes_by_day', 'likes_by_month', 'likes_by_year', 'likes_std_score', 'categoryId_trim_mean_original_likes', 'categoryId_trim_mean_original_likes_rank', 'categoryId_trim_mean_diff_original_likes', 'categoryId_trim_mean_diff_original_likes_rank', 'categoryId_max_diff_original_likes', 'categoryId_max_diff_original_likes_rank', 'categoryId_max_ratio_original_likes', 'categoryId_max_ratio_original_likesrank', 'categoryId_mean_ratio_original_likes', 'categoryId_mean_ratio_original_likes_rank', 'categoryId_mean_diff_original_likes', 'categoryId_mean_diff_original_like

Unnamed: 0,id,y
19720,1,309735.025
19721,2,2078320.11
19722,3,881768.355
19723,4,144202.988
19724,5,216.797921
