### 바뀐부분
- Brunch 가즈아~!! 

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from datetime import timedelta, datetime
import glob
import itertools
from itertools import chain
import json
import os
import re

import matplotlib.pyplot as plt
# import seaborn as sns
import gc

import os
import random
from abc import *

from tqdm import tqdm_notebook

# Modeling

In [2]:
class AbstractRecommend(metaclass=ABCMeta):
    """Base class for the recommenders in this file.

    Subclasses must implement ``recommend``. ``calculate_recommend`` is a
    shared helper that turns an already ordered candidate frame into a list of
    article ids while respecting the overall budget of 100 recommendations.
    """

    def __init__(self):
        pass

    @abstractmethod
    def recommend(self):
        pass

    def calculate_recommend(self, frame, before_recommend_count, cutoff_recommend_count):
        """Return the leading ``article_id`` values of ``frame``.

        Parameters
        ----------
        frame : DataFrame with an ``article_id`` column, already in
            recommendation order.
        before_recommend_count : number of articles recommended by earlier
            models; the remaining budget is ``100 - before_recommend_count``.
        cutoff_recommend_count : per-model cap, or ``-1`` for "no cap".

        Returns
        -------
        list of article ids.
        """
        # Never let the remaining budget go negative: a negative slice bound
        # would silently drop items from the END instead of returning nothing.
        limit_recommend = max(0, 100 - before_recommend_count)

        # ``last_model`` is set by subclasses; default to False when absent.
        is_last_model = getattr(self, 'last_model', False) is True

        # The last model, or a model without a cutoff, fills the whole
        # remaining budget. ``==`` (not ``is``) so numpy integers also match.
        if is_last_model or cutoff_recommend_count == -1:
            return frame['article_id'].values[:limit_recommend].tolist()

        # Otherwise recommend up to the cutoff, but never beyond the budget.
        # Slicing already copes with frames shorter than the bound.
        recommend_count = min(limit_recommend, cutoff_recommend_count)
        return frame['article_id'].values[:recommend_count].tolist()

In [3]:
class RandomBestRecommend(AbstractRecommend):
    """Recommend from one shared candidate frame, skipping excluded articles.

    recommend_frame: DataFrame with an ``article_id`` column, already in
        recommendation order.
    cutoff_recommend_count: per-model cap; ``-1`` means no cap (fill up to the
        remaining budget of 100).
    """

    def __init__(self, recommend_frame, cutoff_recommend_count):
        self.recommend_frame = recommend_frame
        self.cutoff_recommend_count = cutoff_recommend_count
        self.last_model = False

    def set_last_model(self):
        """Mark this model as the last one so it fills the remaining budget."""
        self.last_model = True

    def recommend(self, read_list, user_id, before_recommend_count):
        """Return article ids from ``recommend_frame`` not present in ``read_list``.

        ``user_id`` is accepted for interface parity with the other models but
        is not used here. ``before_recommend_count`` is the number of articles
        already recommended by earlier models.
        """
        frame = self.recommend_frame.query("article_id not in @read_list")
        article_ids = frame['article_id'].values

        # Clamp at 0: a negative slice bound would drop items from the end
        # instead of returning an empty list.
        limit_recommend = max(0, 100 - before_recommend_count)

        # Last model or no cutoff: fill the remaining budget.
        # ``==`` instead of ``is`` so the check does not rely on int caching.
        if self.last_model is True or self.cutoff_recommend_count == -1:
            return article_ids[:limit_recommend].tolist()

        # Slicing handles frames shorter than the bound, so no size check.
        recommend_count = min(limit_recommend, self.cutoff_recommend_count)
        return article_ids[:recommend_count].tolist()

class BrunchRecommend(AbstractRecommend):
    """Run a chain of recommendation models for every user.

    user_list: iterable of user ids to recommend for.
    read_frame: DataFrame with ``user_id`` and ``article_id`` columns; used to
        build the per-user list of already read articles.
    read_set: optional iterable of article ids that seeds ``all_read_set``
        (the pool of articles excluded for ``RandomBestRecommend`` models).
    """

    def __init__(self, user_list, read_frame, read_set=None):
        self.user_list = user_list
        self.read_dict = read_frame.groupby('user_id')['article_id'].apply(list).to_dict()
        self.recommend_result = dict()
        # Always own a private set so the caller's collection is never mutated
        # and any iterable (not only a set) is accepted.
        self.all_read_set = set(read_set) if read_set is not None else set()

    def make_result_frame(self):
        """Return one row per user: ``user_id`` followed by the recommendations.

        ``orient='index'`` is used so users with fewer recommendations than
        others are padded with missing values instead of raising.
        """
        temp = pd.DataFrame.from_dict(self.recommend_result, orient='index').reset_index()
        return temp.rename(columns={'index': 'user_id'})

    def recommend(self, model_list=None):
        """Fill ``self.recommend_result`` by applying ``model_list`` in order.

        The last model in the list is flagged via ``set_last_model()``.
        Raises ``Exception`` when ``model_list`` is empty or ``None``; any
        exception is printed and re-raised.
        """
        try:
            if not model_list:
                raise Exception("model_list는 적어도 한 개 이상 있어야 합니다.")

            model_list[-1].set_last_model()

            self.recommend_result.clear()
            for user in tqdm_notebook(self.user_list):
                recommended = []
                self.recommend_result[user] = recommended
                # Articles the user already read; users missing from the read
                # data simply have nothing to exclude.
                already_user_read = self.read_dict.get(user, [])

                for model in model_list:
                    if isinstance(model, RandomBestRecommend) is True:
                        print("RandomBestRecommend")
                        # Exclude everything recommended to anyone so far,
                        # plus what this user has read.
                        read_list = list(self.all_read_set) + already_user_read
                        r = model.recommend(read_list, user, len(recommended))
                        print(len(r))
                    else:
                        # Exclude this user's reads and the articles already
                        # recommended to this user by earlier models.
                        read_list = list(set(recommended + already_user_read))
                        r = model.recommend(read_list, user, len(recommended))

                    recommended.extend(r)
                    # In-place update avoids rebuilding the whole set on every
                    # model call.
                    self.all_read_set.update(r)

        except Exception as e:
            print(e)
            raise

    def _ndcg(self):
        pass

    def _map(self):
        pass

    def _entropy_diversity(self):
        pass

    def evaluate(self):
        pass

class TimebasedRecommend(AbstractRecommend):
    """Recommend articles from the periods in which the user was active.

    user_frame: DataFrame with ``user_id`` and ``from`` columns.
    timebased_frame: DataFrame with ``article_id``, ``dt`` and ``count``
        columns; rows whose ``dt`` equals one of the user's ``from`` values
        are the candidates.
    cutoff_recommend_count: per-model cap; ``-1`` means no cap.
    """

    def __init__(self, user_frame, timebased_frame, cutoff_recommend_count):
        self.timebased_frame = timebased_frame
        self.user_frame = user_frame
        self.cutoff_recommend_count = cutoff_recommend_count
        self.last_model = False

    def set_last_model(self):
        """Mark this model as the last one so it fills the remaining budget."""
        self.last_model = True

    def recommend(self, read_list, user_id, before_recommend_count):
        """Return article ids for ``user_id``, excluding ``read_list``.

        Candidates of each period keep their within-period position in an
        ``index`` column; the merged frame is ordered by that position and
        then by ``count`` descending, so periods are interleaved.
        """
        user_frame = self.user_frame.loc[self.user_frame['user_id'] == user_id]
        from_list = sorted(user_frame['from'].unique())
        if len(from_list) == 0:
            return list()

        frame = self.timebased_frame.query("article_id not in @read_list")

        frame_list = []
        for t in from_list:
            temp = frame.loc[frame['dt'] == t].reset_index(drop=True)
            # Position of the row inside its own period.
            temp['index'] = range(temp.shape[0])
            frame_list.append(temp)

        frame = pd.concat(frame_list)
        frame = frame.sort_values(['index', 'count'], ascending=[True, False])
        frame = frame.drop_duplicates('article_id', keep='first')
        article_ids = frame['article_id'].values

        # Clamp at 0: a negative slice bound would drop items from the end
        # instead of returning an empty list.
        limit_recommend = max(0, 100 - before_recommend_count)

        # Last model or no cutoff: fill the remaining budget.
        # ``==`` instead of ``is`` so the check does not rely on int caching.
        if self.last_model is True or self.cutoff_recommend_count == -1:
            return article_ids[:limit_recommend].tolist()

        # Slicing handles frames shorter than the bound, so no size check.
        recommend_count = min(limit_recommend, self.cutoff_recommend_count)
        return article_ids[:recommend_count].tolist()
        
class CutoffRecommend(AbstractRecommend):
    """
    Recommendation model with a per-model cutoff.

    recommend_frame: preprocessed candidate frame for this model (needs an
        ``article_id`` column; ``user_id`` when ``userbased_model`` is True;
        ``flag_sum`` and ``count`` when ``continous_read`` is True).
    cutoff_recommend_count: cutoff of this model; ``-1`` disables the cutoff
        and recommends up to ``100 - <count recommended by earlier models>``.
    userbased_model: when True only rows of the requested ``user_id`` are used.
    continous_read: True for the "continuous reading" models that carry a
        ``flag_sum`` column, False otherwise.

    The earlier version of the model used separate flag_sum_1..5 columns; here
    a single ``flag_sum`` column holds 1, 2, ..., n. Higher ``flag_sum`` must
    mean a better candidate.
    """

    def __init__(self, recommend_frame, cutoff_recommend_count, userbased_model=True, continous_read=False):
        self.recommend_frame = recommend_frame
        self.cutoff_recommend_count = cutoff_recommend_count
        self.continous_read = continous_read
        self.userbased_model = userbased_model
        self.last_model = False

    def set_last_model(self):
        """Mark this model as the last one so it fills the remaining budget."""
        self.last_model = True

    def recommend(self, read_list, user_id, before_recommend_count):
        """
        Parameters
        ----------
        read_list: articles to exclude (already recommended by earlier models
            plus the articles the user has read).
        user_id: required by user-based models to select the user's rows.
        before_recommend_count: number of articles recommended so far.

        Returns
        -------
        list of article_id
        """
        # User-based models only look at the rows of the requested user.
        if self.userbased_model is True:
            frame = self.recommend_frame.query("user_id == @user_id")
            frame = frame.query("article_id not in @read_list")
        else:
            frame = self.recommend_frame.query("article_id not in @read_list")

        # Continuous-reading models: higher flag_sum first, then higher count.
        if self.continous_read is True:
            frame = frame.sort_values(by=['flag_sum', 'count'], ascending=[False, False])

        article_ids = frame['article_id'].values

        # Clamp at 0: a negative slice bound would drop items from the end
        # instead of returning an empty list.
        limit_recommend = max(0, 100 - before_recommend_count)

        # Last model or no cutoff: fill the remaining budget.
        # ``==`` instead of ``is`` so the check does not rely on int caching.
        if self.last_model is True or self.cutoff_recommend_count == -1:
            return article_ids[:limit_recommend].tolist()

        # Slicing handles frames shorter than the bound, so no size check.
        recommend_count = min(limit_recommend, self.cutoff_recommend_count)
        return article_ids[:recommend_count].tolist()

### rowwise read 만드는 부분 / 전처리되어 있으면 실행안해도 됩니다.

#### Make Read Frame
https://arena.kakao.com/forum/topics/10 <br>
tip! dataframe 선언해서 loop안에서 concat하면 엄청느린데 list에 append 후 concat하니 훨씬 빠름

In [4]:
# input_read_path = 'res/read/'
# file_list = os.listdir(input_read_path)

# file_list = [f for f in file_list if len(f.strip().split('.')) == 1]

# read_df_list = []
# for file in tqdm_notebook(file_list):
#     file_path = input_read_path + file
#     df_temp = pd.read_csv(file_path, header=None, names=['raw'])
#     df_temp['from'] = file.split('_')[0]
#     df_temp['to'] = file.split('_')[1]
#     read_df_list.append(df_temp)
    
# read_df = pd.concat(read_df_list)

# read_df['user_id'] = read_df['raw'].apply(lambda x: x.split(' ')[0])
# read_df['article_id'] = read_df['raw'].apply(lambda x: x.split(' ')[1:])

# def chainer(s):
#     return list(itertools.chain.from_iterable(s))

# read_cnt_by_user = read_df['article_id'].map(len)
# read_rowwise = pd.DataFrame({'from': np.repeat(read_df['from'], read_cnt_by_user),
#                          'to': np.repeat(read_df['to'], read_cnt_by_user),
#                          'user_id': np.repeat(read_df['user_id'], read_cnt_by_user),
#                          'article_id': chainer(read_df['article_id'])})

# #read_rowwise = read_rowwise.loc[read_rowwise['article_id']!='']
# read_rowwise.reset_index(drop=True, inplace=True)

# read_rowwise.to_csv('res/read_rowwise_v2.csv', index=False)

### Make Recommend Frame Func

In [5]:
def get_how_many_read_by_eachuser(read):
    """Count, for every user, how many read rows belong to each author.

    (Called ``eda_table`` in the exploratory notebook.)

    Returns a DataFrame with columns ``user_id``, ``author_id`` and ``count``.
    """
    per_user_author = read.groupby(['user_id'])['author_id'].value_counts()
    # Name the counts before flattening so the value column cannot clash with
    # the ``author_id`` index level.
    return per_user_author.rename('count').reset_index()

def get_how_many_write(read):
    """Count the distinct articles of each author that appear in ``read``.

    (Called ``eda_table1`` in the exploratory notebook.)

    Returns a DataFrame with columns ``author_id`` and
    ``author_article_write_cnt``.
    """
    # ``.nunique()`` replaces ``agg({'nunique'})``: passing a set to ``agg``
    # is not one of the documented argument types.
    unique_articles = read.groupby(['author_id'])['article_id'].nunique()
    return unique_articles.rename('author_article_write_cnt').reset_index()

def get_how_many_read_repeat(read):
    """Count on how many distinct ``dt`` values each user read each author.

    (Called ``eda_table2`` in the exploratory notebook.)

    Returns a DataFrame with columns ``user_id``, ``author_id`` and ``count``.
    """
    # ``.nunique()`` replaces ``agg({'nunique'})``: passing a set to ``agg``
    # is not one of the documented argument types.
    distinct_days = read.groupby(['user_id', 'author_id'])['dt'].nunique()
    return distinct_days.rename('count').reset_index()

def get_how_many_read_each_article_by_eachuser(read):
    """Count how many distinct articles of each author every user read.

    (Called ``eda_table4`` in the exploratory notebook.)

    Returns a DataFrame with columns ``user_id``, ``author_id`` and ``count``.
    """
    # ``.nunique()`` replaces ``agg({'nunique'})``: passing a set to ``agg``
    # is not one of the documented argument types.
    distinct_articles = read.groupby(['user_id', 'author_id'])['article_id'].nunique()
    return distinct_articles.rename('count').reset_index()

def get_how_many_read(read):
    """Count how often each article appears in ``read``.

    (Called ``sub_table`` in the exploratory notebook.)

    Returns a DataFrame with columns ``article_id``, ``count`` and
    ``author_id``, where ``author_id`` is the part of the article id before
    the first underscore. Rows are in ``value_counts`` order.
    """
    read_counts = read['article_id'].value_counts()
    how_many_read = pd.DataFrame({
        'article_id': read_counts.index,
        'count': read_counts.values,
    })
    article_as_text = how_many_read['article_id'].astype(str)
    how_many_read['author_id'] = article_as_text.apply(lambda x: x.split('_')[0])
    return how_many_read

def get_how_many_read_by_variableuser_author(read):
    """Count how many distinct users read each author.

    (Called ``sub_table1`` in the exploratory notebook.)

    Returns a DataFrame with columns ``author_id`` and ``nunique``.
    """
    # ``.nunique()`` replaces ``agg({'nunique'})``: passing a set to ``agg``
    # is not one of the documented argument types.
    distinct_users = read.groupby('author_id')['user_id'].nunique()
    return distinct_users.rename('nunique').reset_index()

def get_how_many_read_by_variableuser_article(read):
    """Count how many distinct users read each article, most-read first.

    (Called ``sub_table2`` in the exploratory notebook.)

    Returns a DataFrame with columns ``article_id`` and ``nunique``, sorted by
    ``nunique`` in descending order.
    """
    # ``.nunique()`` replaces ``agg({'nunique'})``: passing a set to ``agg``
    # is not one of the documented argument types.
    distinct_users = read.groupby('article_id')['user_id'].nunique()
    temp = distinct_users.rename('nunique').reset_index()
    return temp.sort_values(by='nunique', ascending=False)

def recent_following_article(following_favor, col1, col2):
    """Compute ``count = col1 / col2`` and order rows per user by preference.

    Rows are ordered by ``user_id`` ascending, then ``count`` descending, then
    ``diffday_from_end`` ascending. Rows whose ``user_id`` or ``count`` is
    missing are dropped (the former group-by based version dropped them too).

    Parameters
    ----------
    following_favor : DataFrame with ``user_id``, ``diffday_from_end`` and the
        columns named by ``col1`` and ``col2``. It is not modified.
    col1, col2 : names of the numerator and denominator columns.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index and the ``count`` column set.
    """
    # Work on a copy so the caller's frame is left untouched.
    ordered = following_favor.copy()

    # Negate the ratio so that a plain ascending sort yields "highest first".
    ordered['count'] = ordered[col1] * -1 / ordered[col2]
    ordered = ordered.dropna(subset=['user_id', 'count'])

    # One explicit multi-key sort replaces groupby(...).apply(sort_values):
    # the latter's row order depended on whether any group was reordered, and
    # newer pandas releases stop passing the grouping columns to ``apply``.
    ordered = ordered.sort_values(by=['user_id', 'count', 'diffday_from_end'])
    ordered = ordered.reset_index(drop=True)

    # Restore the sign of the ratio.
    ordered['count'] = ordered['count'] * -1
    return ordered

def read_preprocessing(read_data, metadata, read_period):
    """Restrict the read log to a period and to articles present in metadata.

    Parameters
    ----------
    read_data : DataFrame with ``dt`` and ``article_id`` columns.
    metadata : DataFrame whose ``id`` column lists the known article ids.
    read_period : ``(start, end)`` compared against ``dt``; the interval is
        half-open ``[start, end)`` - start is included, end is excluded.

    Returns
    -------
    A new DataFrame (original index kept) with an added ``author_id`` column:
    the part of ``article_id`` before the first underscore.
    """
    read_cutoff_start, read_cutoff_end = read_period[0], read_period[1]
    in_period = (read_data['dt'] >= read_cutoff_start) & (read_data['dt'] < read_cutoff_end)
    # Drop reads of articles that do not exist in the metadata.
    in_metadata = read_data['article_id'].isin(metadata['id'])

    # Explicit copy: the new column is written to an independent frame, not to
    # a slice of ``read_data`` (avoids SettingWithCopyWarning).
    read = read_data.loc[in_period & in_metadata].copy()
    read['author_id'] = read['article_id'].astype(str).apply(lambda x: x.split('_')[0])
    return read

def meta_preprocessing(metadata, meta_period, use_megazine=False, use_regdt=False):
    """Select articles registered inside ``meta_period`` and normalise columns.

    Parameters
    ----------
    metadata : DataFrame with at least ``reg_dt``, ``article_id``, ``user_id``
        and ``id`` columns (plus ``magazine_id`` when ``use_megazine`` is True).
    meta_period : ``(start, end)`` datetimes; the interval is half-open
        ``[start, end)`` - start is included, end is excluded.
    use_megazine : also return ``magazine_id``.
    use_regdt : also return ``reg_dt``.

    Returns
    -------
    A new DataFrame with ``following_id`` (the original ``user_id``),
    ``article_id`` (the original ``id``), optionally ``magazine_id``,
    ``diffday_from_end`` (whole days between ``reg_dt`` and the period end)
    and optionally ``reg_dt``. ``metadata`` itself is not modified.
    """
    meta_cutoff_start, meta_cutoff_end = meta_period[0], meta_period[1]

    # Parse the registration timestamps once and reuse them.
    reg_dt = pd.to_datetime(metadata['reg_dt'])
    in_period = (reg_dt >= meta_cutoff_start) & (reg_dt < meta_cutoff_end)

    # Copy so the edits below never touch (a slice of) the caller's frame.
    new_meta = metadata.loc[in_period].copy()
    new_meta['diffday_from_end'] = (meta_cutoff_end - reg_dt.loc[in_period]).dt.days

    # The original ``article_id`` column is discarded; ``id`` takes its name.
    del new_meta['article_id']
    new_meta = new_meta.rename(columns={'user_id': 'following_id', 'id': 'article_id'})

    if use_megazine is True:
        selected_column = ['following_id', 'article_id', 'magazine_id', 'diffday_from_end']
    else:
        selected_column = ['following_id', 'article_id', 'diffday_from_end']
    if use_regdt is True:
        selected_column = selected_column + ['reg_dt']
    return new_meta[selected_column]
    
def rank_preprocessing(following_favor):
    """Keep recent unranked articles and well-ranked articles.

    Rows without a ``rank`` are kept when ``diffday_from_end`` is at most 15;
    rows with a ``rank`` are kept when that rank is below 10. The unranked
    part comes first in the result, followed by the ranked part.
    """
    rank_missing = following_favor['rank'].isnull()

    # Unranked rows: renumber first, then keep the recent ones (so the kept
    # rows retain their position among the unranked rows as index).
    unranked = following_favor.loc[rank_missing].reset_index(drop=True)
    recent_unranked = unranked.loc[unranked['diffday_from_end'] <= 15]

    # Ranked rows: keep the frequently read ones and renumber them.
    ranked = following_favor.loc[~rank_missing]
    top_ranked = ranked.loc[ranked['rank'] < 10].reset_index(drop=True)

    return pd.concat([recent_unranked, top_ranked], axis=0)

In [6]:
# 유저별 선호도가 보정된 데이터를 추출하기 위한 함수
def count_correlction_read_favor(read_data, metadata, user_id_frame,
                                 meta_period=(datetime(2019, 2, 14), datetime(2019, 3, 15)),
                                 read_period=(20190214, 20190301),
                                 favor_cutoff=0.05):
    """Build per-user article candidates weighted by a corrected preference.

    For every (user, author) pair found in the read log, every article of that
    author that was read by anyone gets the score
    ``correction_count = count * how_many_read / nunique`` where ``count`` is
    the number of distinct articles of the author the user read,
    ``how_many_read`` is how often the article was read overall and
    ``nunique`` is the number of distinct users who read the author.

    Parameters
    ----------
    read_data, metadata : raw read log and article metadata (see
        ``read_preprocessing`` / ``meta_preprocessing``).
    user_id_frame : DataFrame with a ``user_id`` column listing target users.
    meta_period, read_period : half-open ``[start, end)`` periods.
    favor_cutoff : quantile of ``correction_count``; rows at or below it are
        dropped from ``read_user``.

    Returns
    -------
    (read_user, dontread_user) : rows of target users with / without a score.
    """
    # NOTE: the defaults use ``datetime`` directly; ``pd.datetime`` was only
    # an alias of it and no longer exists in pandas 2.x.
    read = read_preprocessing(read_data, metadata, read_period)

    # Metadata: shift the day difference by 14, keep positive values and cap
    # them at 15. Copies keep the column writes off any slice.
    new_meta = meta_preprocessing(metadata, meta_period, use_megazine=True).copy()
    new_meta['diffday_from_end'] = new_meta['diffday_from_end'] - 14
    new_meta = new_meta.loc[new_meta['diffday_from_end'] > 0].copy()
    new_meta['diffday_from_end'] = new_meta['diffday_from_end'].clip(upper=15)

    # How often each article was read: ['article_id', 'how_many_read', 'author_id'].
    sub_table = get_how_many_read(read)
    sub_table = sub_table.rename(columns={'count': 'how_many_read'})

    # Distinct articles per (user, author): ['user_id', 'author_id', 'count'].
    eda_table4 = get_how_many_read_each_article_by_eachuser(read)

    # Distinct readers per author: ['author_id', 'nunique'].
    sub_table1 = get_how_many_read_by_variableuser_author(read)

    df_table1 = pd.merge(eda_table4, sub_table, on='author_id')
    df_table1 = pd.merge(df_table1, sub_table1, on='author_id')
    df_table1 = pd.merge(df_table1, new_meta, on='article_id', how='left')

    df_table1['correction_count'] = (
        (df_table1['count'] * df_table1['how_many_read']) / df_table1['nunique']
    )

    df_table2 = df_table1.sort_values(by='correction_count', ascending=False)
    df_table3 = df_table2[['user_id', 'article_id', 'correction_count']]
    df_table3 = df_table3.dropna(axis=0)

    # Attach the candidates to the target users, plus the row count per user.
    dev1 = user_id_frame.merge(df_table3, on='user_id', how='left')
    dev2 = dev1.groupby('user_id')['user_id'].size().reset_index(name='size')
    dev2 = dev2.sort_values('size')
    dev1 = pd.merge(dev1, dev2, how='left', on='user_id')

    has_score = dev1['correction_count'].notnull()
    read_user = dev1.loc[has_score]
    dontread_user = dev1.loc[~has_score]
    threshold = read_user['correction_count'].quantile(favor_cutoff)
    read_user = read_user.loc[read_user['correction_count'] > threshold]
    print("read_user", read_user.shape)
    print("dontread_user", dontread_user.shape)
    return read_user, dontread_user

In [7]:
# 구독자 선호도
def following_favor_frame(read_data, metadata, following_table, best,
                          correction_type=0,
                          meta_period=(datetime(2019, 2, 22), datetime(2019, 3, 15)),
                          read_period=(20190214, 20190301),
                          favor_cutoff=0.05):
    """Rank recent articles of the authors each user follows.

    Parameters
    ----------
    read_data, metadata : raw read log and article metadata.
    following_table : follow relations. NOTE(review): after the merge the
        columns are renamed positionally, so this presumably must hold exactly
        ``user_id`` and ``following_id`` in that order - confirm with callers.
    best : DataFrame merged on ``['article_id', 'author_id']``; it must supply
        the ``rank`` column used below.
    correction_type : statistic used as the per (user, author) ``count``:
        0 = read rows, 1 = distinct ``dt`` values, otherwise distinct articles.
    meta_period, read_period : half-open ``[start, end)`` periods.
    favor_cutoff : quantile of the corrected ``count``; rows at or below it
        are dropped.

    Returns
    -------
    DataFrame sorted by ``count`` (desc), ``rank`` (asc) and
    ``diffday_from_end`` (desc).
    """
    # NOTE: the defaults use ``datetime`` directly; ``pd.datetime`` was only
    # an alias of it and no longer exists in pandas 2.x.
    read = read_preprocessing(read_data, metadata, read_period)
    new_meta = meta_preprocessing(metadata, meta_period)

    # Articles in the period written by authors the user follows.
    following_favor = following_table.copy()
    following_favor = pd.merge(following_favor, new_meta, how='left', on='following_id')
    following_favor = following_favor.loc[following_favor['article_id'] != '']
    following_favor = following_favor.loc[following_favor['following_id'] != '']
    following_favor = following_favor.dropna(axis=0)
    following_favor.columns = ['user_id', 'author_id', 'article_id', 'diffday_from_end']

    # Statistic from the read log used to correct the count.
    if correction_type == 0:
        correction_df1 = get_how_many_read_by_eachuser(read)
    elif correction_type == 1:
        correction_df1 = get_how_many_read_repeat(read)
    else:
        correction_df1 = get_how_many_read_each_article_by_eachuser(read)

    how_many_write_df = get_how_many_write(read)

    following_favor = following_favor.merge(correction_df1, how='left', on=['user_id', 'author_id'])
    following_favor = following_favor.merge(how_many_write_df, how='left', on=['author_id'])
    following_favor = following_favor.merge(best, how='left', on=['article_id', 'author_id'])
    # Keep only authors the user actually read.
    following_favor = following_favor[following_favor['count'].notnull()]

    # count := count / author_article_write_cnt, ordered per user.
    following_favor = recent_following_article(following_favor, 'count', 'author_article_write_cnt')

    # Drop the lowest ``favor_cutoff`` share of the corrected counts.
    threshold = following_favor['count'].quantile(favor_cutoff)
    following_favor = following_favor.loc[following_favor['count'] > threshold]
    # Filter on the rank delivered by ``best``.
    following_favor = rank_preprocessing(following_favor)
    # Drop rows without a day difference.
    following_favor = following_favor[following_favor['diffday_from_end'].notnull()].reset_index(drop=True)
    following_favor = following_favor.sort_values(
        by=['count', 'rank', 'diffday_from_end'], ascending=[False, True, False])
    return following_favor

In [8]:
# 구독자 선호도
def dont_following_favor_frame(read_data, metadata, following_table, best,
                          correction_type=0,
                          meta_period=(datetime(2019, 2, 22), datetime(2019, 3, 15)),
                          read_period=(20190214, 20190301),
                          favor_cutoff=0.05):
    """Rank recent articles of authors a user read but does NOT follow.

    Parameters
    ----------
    read_data, metadata : raw read log and article metadata.
    following_table : DataFrame with ``user_id`` and ``following_id`` columns.
    best : DataFrame merged on ``['article_id', 'author_id']``; it must supply
        the ``rank`` column used below.
    correction_type : statistic used as the per (user, author) ``count``:
        0 = read rows, 1 = distinct ``dt`` values, otherwise distinct articles.
    meta_period, read_period : half-open ``[start, end)`` periods.
    favor_cutoff : quantile of the corrected ``count``; rows at or below it
        are dropped.

    Returns
    -------
    DataFrame sorted by ``count`` (desc), ``rank`` (asc) and
    ``diffday_from_end`` (desc).
    """
    # NOTE: the defaults use ``datetime`` directly; ``pd.datetime`` was only
    # an alias of it and no longer exists in pandas 2.x.
    read = read_preprocessing(read_data, metadata, read_period)
    new_meta = meta_preprocessing(metadata, meta_period)

    # Follow relations with a marker column; ``assign`` builds a new frame so
    # nothing is written to a slice of ``following_table``.
    following = following_table[['user_id', 'following_id']]
    following = following.drop_duplicates(subset=['user_id', 'following_id'])
    following = following.assign(following=1)

    # (user, author) pairs from the read log where the author is not followed.
    dont_following_favor = pd.DataFrame({
        'user_id': read['user_id'],
        'following_id': read['article_id'].apply(lambda x: x.split('_')[0]),
    })
    dont_following_favor = pd.merge(dont_following_favor, following, how='left',
                                    on=['user_id', 'following_id'])
    dont_following_favor = dont_following_favor[dont_following_favor['following'].isnull()].reset_index(drop=True)
    del dont_following_favor['following']

    # Articles in the period written by those authors.
    dont_following_favor = pd.merge(dont_following_favor, new_meta, how='left', on='following_id')
    dont_following_favor = dont_following_favor.loc[dont_following_favor['article_id'] != '']
    dont_following_favor = dont_following_favor.loc[dont_following_favor['following_id'] != '']
    dont_following_favor = dont_following_favor.dropna(axis=0)
    dont_following_favor.columns = ['user_id', 'author_id', 'article_id', 'diffday_from_end']
    dont_following_favor = dont_following_favor.drop_duplicates(['user_id', 'article_id'])

    # Statistic from the read log used to correct the count.
    if correction_type == 0:
        correction_df1 = get_how_many_read_by_eachuser(read)
    elif correction_type == 1:
        correction_df1 = get_how_many_read_repeat(read)
    else:
        correction_df1 = get_how_many_read_each_article_by_eachuser(read)

    how_many_write_df = get_how_many_write(read)

    dont_following_favor = dont_following_favor.merge(correction_df1, how='left', on=['user_id', 'author_id'])
    dont_following_favor = dont_following_favor.merge(how_many_write_df, how='left', on=['author_id'])
    dont_following_favor = dont_following_favor.merge(best, how='left', on=['article_id', 'author_id'])
    dont_following_favor = dont_following_favor[dont_following_favor['count'].notnull()]

    # count := count / author_article_write_cnt, ordered per user.
    dont_following_favor = recent_following_article(dont_following_favor, 'count', 'author_article_write_cnt')

    # Drop the lowest ``favor_cutoff`` share of the corrected counts.
    threshold = dont_following_favor['count'].quantile(favor_cutoff)
    dont_following_favor = dont_following_favor.loc[dont_following_favor['count'] > threshold]
    # Filter on the rank delivered by ``best``.
    dont_following_favor = rank_preprocessing(dont_following_favor)
    # Drop rows without a day difference.
    dont_following_favor = dont_following_favor[dont_following_favor['diffday_from_end'].notnull()].reset_index(drop=True)
    dont_following_favor = dont_following_favor.sort_values(
        by=['count', 'rank', 'diffday_from_end'], ascending=[False, True, False])
    return dont_following_favor

In [9]:
def shift_preprocessing(series, groupby_col, series_count):
    """Add look-ahead read flags and keep unread rows followed by two reads.

    Columns added to ``series`` (in place):
    ``lag_readYN_k`` for k = 1..series_count - ``readYN`` of the row k
    positions later inside the same ``groupby_col`` group;
    ``readYN_sum_k`` for k = 1..series_count-1 - the sum of
    ``lag_readYN_1`` .. ``lag_readYN_{k+1}``;
    ``flag_sum`` - row-wise maximum of ``readYN_sum_2`` and later sums.

    Returns the rows with ``readYN == 0`` and ``readYN_sum_1 == 2`` under a
    fresh index.
    """
    lag_cols = [f'lag_readYN_{step}' for step in range(1, series_count + 1)]
    for step, lag_col in enumerate(lag_cols, start=1):
        series[lag_col] = series.groupby(groupby_col)['readYN'].shift(-step)

    sum_cols = [f'readYN_sum_{step}' for step in range(1, series_count)]
    for step, sum_col in enumerate(sum_cols, start=1):
        next_lag = series[lag_cols[step]]
        if step == 1:
            # The first running sum starts from the two nearest lags.
            series[sum_col] = next_lag + series[lag_cols[0]]
        else:
            series[sum_col] = series[sum_cols[step - 2]] + next_lag

    series['flag_sum'] = series[sum_cols[1:]].max(axis=1)
    keep = (series['readYN_sum_1'] == 2) & (series['readYN'] == 0)
    return series.loc[keep].reset_index(drop=True)

def get_weekly_metadata(metadata,
                        meta_period=(datetime(2017, 7, 27), datetime(2019, 3, 15))):
    """Return the articles of magazines that look like weekly series.

    A magazine counts as weekly when all of its articles in ``meta_period``
    were registered on the same day of the week and they span at least five
    distinct ``diffday_from_end`` values. Whether those dates are consecutive
    weeks is not checked here.

    Returns a DataFrame with columns ``magazine_id``, ``article_id``,
    ``weekly`` (always 1), ``diffday_from_end`` and ``following_id`` (the part
    of ``article_id`` before the first underscore).
    """
    # NOTE: the default uses ``datetime`` directly; ``pd.datetime`` was only
    # an alias of it and no longer exists in pandas 2.x.
    new_meta = meta_preprocessing(metadata, meta_period, use_megazine=True, use_regdt=True)
    # Copies keep the column writes below off any slice.
    new_meta = new_meta.loc[new_meta['magazine_id'] != 0].copy()
    new_meta['reg_dt_dayofweek'] = pd.to_datetime(new_meta['reg_dt']).dt.dayofweek

    # Most recent articles first.
    new_meta = new_meta.sort_values(by='diffday_from_end', ascending=True)

    by_magazine = new_meta.groupby(['magazine_id'])
    # Both statistics are deliberately named 'nunique' so the merges below
    # yield ``nunique_x`` (weekdays) and ``nunique_y`` (day differences).
    metadata_weekly = by_magazine['reg_dt_dayofweek'].nunique().rename('nunique').reset_index()
    metadata_reg_dt = by_magazine['diffday_from_end'].nunique().rename('nunique').reset_index()

    metadata_temp = pd.merge(new_meta, metadata_weekly, on='magazine_id', how='left')
    metadata_temp = pd.merge(metadata_temp, metadata_reg_dt, on='magazine_id', how='left')

    is_weekly = (metadata_temp['nunique_x'] == 1) & (metadata_temp['nunique_y'] >= 5)
    weekly_megazine = metadata_temp.loc[is_weekly].copy()
    weekly_megazine['weekly'] = 1
    weekly_megazine = weekly_megazine[['magazine_id', 'article_id', 'weekly', 'diffday_from_end']].copy()
    weekly_megazine['following_id'] = weekly_megazine['article_id'].apply(lambda x: x.split('_')[0])
    return weekly_megazine

def weekly_magazine_series(read_data, metadata, following,
                           meta_period=(datetime(2017, 7, 14), datetime(2019, 3, 15)),
                           read_period=(20190207, 20190301),
                           series_count=6):
    """Find unread weekly-magazine articles that precede articles the user read.

    Only magazines of authors the user follows are considered.

    Parameters
    ----------
    read_data, metadata : raw read log and article metadata.
    following : follow relations. NOTE(review): the merged frame is renamed
        positionally to six columns, so this presumably must hold exactly
        ``user_id`` and ``following_id`` in that order - confirm with callers.
    meta_period : NOTE(review): currently NOT forwarded to
        ``get_weekly_metadata``, which therefore uses its own default period;
        left unchanged to keep results stable - confirm whether intended.
    read_period : half-open ``[start, end)`` period for the read log.
    series_count : number of look-ahead steps for ``shift_preprocessing``.

    Returns
    -------
    The frame produced by ``shift_preprocessing`` with the per (user, author)
    read ``count`` merged in.
    """
    # NOTE: the default uses ``datetime`` directly; ``pd.datetime`` was only
    # an alias of it and no longer exists in pandas 2.x.
    read = read_preprocessing(read_data, metadata, read_period)
    print("read")
    weekly_meta = get_weekly_metadata(metadata)
    print("weekly_meta")

    following_table = following.copy()
    magazine_table = following_table.merge(weekly_meta, on=['following_id'], how='left')
    print("magazine_table")
    # Merge result: follow columns + 'magazine_id', 'article_id', 'weekly',
    # 'diffday_from_end'; ``following_id`` becomes ``author_id``.
    magazine_table.columns = ['user_id', 'author_id', 'magazine_id', 'article_id', 'weekly', 'diffday_from_end']
    magazine_table = magazine_table.dropna(axis=0)

    # Mark what the user read; ``assign`` avoids writing to a slice of ``read``.
    check_reading_table = read[['user_id', 'article_id']].assign(readYN=1)
    magazine_table = pd.merge(magazine_table, check_reading_table, how='left', on=['user_id', 'article_id'])
    magazine_table['readYN'] = magazine_table['readYN'].fillna(0)

    # Order by day difference and keep one row per (user, article).
    magazine_table = magazine_table.sort_values(by='diffday_from_end')
    magazine_table = magazine_table.drop_duplicates(['user_id', 'article_id'])

    magazine_table = shift_preprocessing(magazine_table, ['user_id', 'magazine_id'], series_count)

    how_many_read = get_how_many_read_by_eachuser(read)
    magazine_table = pd.merge(magazine_table, how_many_read, how='left', on=['user_id', 'author_id'])
    return magazine_table
    
    
def magazine_series(read_data, metadata, following_table,
                    meta_period=(datetime(2019, 2, 14), datetime(2019, 3, 15)),
                    read_period=(20190207, 20190301),
                    series_count=7):
    """Find unread articles of followed authors that precede read articles.

    Parameters
    ----------
    read_data, metadata : raw read log and article metadata.
    following_table : follow relations. NOTE(review): the merged frame is
        renamed positionally to five columns, so this presumably must hold
        exactly ``user_id`` and ``following_id`` in that order - confirm.
    meta_period, read_period : half-open ``[start, end)`` periods.
    series_count : number of look-ahead steps for ``shift_preprocessing``.

    Returns
    -------
    The frame produced by ``shift_preprocessing`` (grouped by user, author and
    magazine) with the per (user, author) read ``count`` merged in.
    """
    # NOTE: the default uses ``datetime`` directly; ``pd.datetime`` was only
    # an alias of it and no longer exists in pandas 2.x.
    read = read_preprocessing(read_data, metadata, read_period)
    new_meta = meta_preprocessing(metadata, meta_period, use_megazine=True)

    # Read and unread articles of followed authors are mixed at this point.
    following = following_table.copy()
    series_table = pd.merge(following, new_meta, how='left', on='following_id')
    series_table = series_table.loc[series_table['article_id'] != '']
    series_table = series_table.loc[series_table['following_id'] != '']
    series_table = series_table.dropna(axis=0)
    series_table.columns = ['user_id', 'author_id', 'article_id', 'magazine_id', 'diffday_from_end']

    # Mark what the user read; ``assign`` avoids writing to a slice of ``read``.
    check_reading_table = read[['user_id', 'article_id']].assign(readYN=1)
    series_table = pd.merge(series_table, check_reading_table, how='left', on=['user_id', 'article_id'])
    series_table['readYN'] = series_table['readYN'].fillna(0)

    # Order by day difference and keep one row per (user, article).
    series_table = series_table.sort_values(by='diffday_from_end')
    series_table = series_table.drop_duplicates(['user_id', 'article_id'])

    series_table = shift_preprocessing(series_table, ['user_id', 'author_id', 'magazine_id'], series_count)

    how_many_read = get_how_many_read_by_eachuser(read)
    series_table = pd.merge(series_table, how_many_read, how='left', on=['user_id', 'author_id'])

    return series_table

In [10]:
def dont_following_magazine_series(read_data, metadata, following_table, 
                    meta_period=(datetime(2019, 2, 14), datetime(2019, 3, 15)), 
                    read_period=(20190214, 20190301),
                    series_count=7):
    """Build the magazine-series candidate table for authors the user read but does NOT follow.

    Args:
        read_data: raw read log, forwarded to ``read_preprocessing``.
        metadata: article metadata, forwarded to both preprocessing helpers.
        following_table: frame with ``user_id`` and ``following_id`` columns;
            used only to exclude followed authors.
        meta_period: (start, end) datetimes forwarded to ``meta_preprocessing``.
        read_period: (start, end) pair forwarded to ``read_preprocessing``;
            looks like YYYYMMDD integers — TODO confirm.
        series_count: forwarded to ``shift_preprocessing``.

    Returns:
        DataFrame with at least ``user_id``, ``author_id``, ``article_id``,
        ``magazine_id``, ``diffday_from_end`` and ``readYN``, plus whatever
        ``shift_preprocessing`` and
        ``get_how_many_read_each_article_by_eachuser`` add.
    """
    # Reads restricted to read_period.
    read = read_preprocessing(read_data, metadata, read_period)

    # Metadata restricted to meta_period, including magazine information.
    new_meta = meta_preprocessing(metadata, meta_period, use_megazine=True)

    # Marker table of followed (user, author) pairs; it is used to keep only
    # authors the user does not follow. `assign` builds a new frame instead of
    # writing into a slice of following_table.
    following = (following_table[['user_id', 'following_id']]
                 .drop_duplicates(subset=['user_id', 'following_id'])
                 .assign(following=1))

    # (user, author) pairs taken from the reads; the author is the article_id
    # prefix before the first '_'.
    series_table = pd.DataFrame()
    series_table['user_id'] = read['user_id']
    series_table['following_id'] = read['article_id'].apply(lambda x: x.split('_')[0])
    series_table = pd.merge(series_table, following, how='left', on=['user_id', 'following_id'])
    # Rows without a marker are authors the user does not follow.
    series_table = series_table[series_table['following'].isnull()].reset_index(drop=True)
    del series_table['following']

    # From here on the flow matches magazine_series: read and unread articles are mixed.
    series_table = pd.merge(series_table, new_meta, how='left', on='following_id')
    series_table = series_table.loc[series_table['article_id'] != '']
    series_table = series_table.loc[series_table['following_id'] != '']
    series_table = series_table.dropna(axis=0)
    # Positional rename: relies on the merge yielding exactly these five columns in this order.
    series_table.columns = ['user_id', 'author_id', 'article_id', 'magazine_id', 'diffday_from_end']
    series_table = series_table.drop_duplicates(['user_id', 'article_id'])

    # Flag the (user, article) pairs that were actually read; explicit copy so
    # the new column is not written into a slice of `read`.
    check_reading_table = read[['user_id', 'article_id']].copy()
    check_reading_table['readYN'] = 1
    series_table = pd.merge(series_table, check_reading_table, how='left', on=['user_id', 'article_id'])
    series_table['readYN'] = series_table['readYN'].fillna(0)

    # Order by date distance and de-duplicate again: the readYN merge can
    # re-introduce duplicates when `read` holds repeated pairs.
    series_table = series_table.sort_values(by='diffday_from_end')
    series_table = series_table.drop_duplicates(['user_id', 'article_id'])

    series_table = shift_preprocessing(series_table, ['user_id', 'author_id', 'magazine_id'], series_count)

    # Preference signal: how many different articles of the author the user read.
    how_many_each_article_read = get_how_many_read_each_article_by_eachuser(read)
    series_table = pd.merge(series_table, how_many_each_article_read, how='left', on=['user_id', 'author_id'])

    return series_table

In [11]:
def dont_following_weekly_series(read_data, metadata, following_table, 
                    meta_period=(datetime(2017, 7, 14), datetime(2019, 3, 15)), 
                    read_period=(20190207, 20190301),
                    series_count=6):
    """Build the weekly-magazine candidate table for authors the user read but does NOT follow.

    Args:
        read_data: raw read log, forwarded to ``read_preprocessing``.
        metadata: article metadata, forwarded to ``read_preprocessing`` and
            ``get_weekly_metadata``.
        following_table: frame with ``user_id`` and ``following_id`` columns;
            used only to exclude followed authors.
        meta_period: accepted for signature parity with the sibling builders
            but not used in this function.
        read_period: (start, end) pair forwarded to ``read_preprocessing``.
            NOTE(review): the previous default start was ``2019027`` (seven
            digits), which is smaller than every YYYYMMDD value; ``20190207``
            is assumed to be the intended date — confirm.
        series_count: forwarded to ``shift_preprocessing``.

    Returns:
        DataFrame with at least ``user_id``, ``author_id``, ``magazine_id``,
        ``article_id``, ``weekly``, ``diffday_from_end`` and ``readYN``, plus
        whatever ``shift_preprocessing`` and
        ``get_how_many_read_each_article_by_eachuser`` add.
    """
    # Reads restricted to read_period.
    read = read_preprocessing(read_data, metadata, read_period)

    # Metadata of weekly articles only.
    weekly_meta = get_weekly_metadata(metadata)

    # Marker table of followed (user, author) pairs; it is used to keep only
    # authors the user does not follow. `assign` builds a new frame instead of
    # writing into a slice of following_table.
    following = (following_table[['user_id', 'following_id']]
                 .drop_duplicates(subset=['user_id', 'following_id'])
                 .assign(following=1))

    # (user, author) pairs taken from the reads; the author is the article_id
    # prefix before the first '_'.
    series_table = pd.DataFrame()
    series_table['user_id'] = read['user_id']
    series_table['following_id'] = read['article_id'].apply(lambda x: x.split('_')[0])
    series_table = pd.merge(series_table, following, how='left', on=['user_id', 'following_id'])
    # Rows without a marker are authors the user does not follow.
    series_table = series_table[series_table['following'].isnull()].reset_index(drop=True)
    del series_table['following']

    # Keep only the weekly articles of those authors.
    series_table = series_table.merge(weekly_meta, on=['following_id'], how='left')

    # From here on the flow matches the other series builders.
    series_table = series_table.loc[series_table['article_id'] != '']
    series_table = series_table.loc[series_table['following_id'] != '']
    series_table = series_table.dropna(axis=0)

    # Positional rename of
    # ['user_id', 'following_id', 'magazine_id', 'article_id', 'weekly', 'diffday_from_end'].
    series_table.columns = ['user_id', 'author_id', 'magazine_id', 'article_id', 'weekly', 'diffday_from_end']
    series_table = series_table.drop_duplicates(['user_id', 'article_id'])

    # Flag the (user, article) pairs that were actually read; explicit copy so
    # the new column is not written into a slice of `read`.
    check_reading_table = read[['user_id', 'article_id']].copy()
    check_reading_table['readYN'] = 1
    series_table = pd.merge(series_table, check_reading_table, how='left', on=['user_id', 'article_id'])
    series_table['readYN'] = series_table['readYN'].fillna(0)

    # Order by date distance and de-duplicate again: the readYN merge can
    # re-introduce duplicates when `read` holds repeated pairs.
    series_table = series_table.sort_values(by='diffday_from_end')
    series_table = series_table.drop_duplicates(['user_id', 'article_id'])

    series_table = shift_preprocessing(series_table, ['user_id', 'author_id', 'magazine_id'], series_count)

    # Preference signal: how many different articles of the author the user read.
    how_many_each_article_read = get_how_many_read_each_article_by_eachuser(read)
    series_table = pd.merge(series_table, how_many_each_article_read, how='left', on=['user_id', 'author_id'])

    return series_table

In [12]:
def get_regression_best(read_data, metadata, 
                        regresssion_meta_period=(datetime(2019, 2, 1), datetime(2019, 3, 1)), 
                        regression_read_period=(20190201, 20190301)):
    """Compute, per author, the mean number of reads per hour since publication.

    Args:
        read_data: raw read log, forwarded to ``read_preprocessing``.
        metadata: article metadata, forwarded to both preprocessing helpers.
        regresssion_meta_period: (start, end) datetimes forwarded to
            ``meta_preprocessing``.
        regression_read_period: (start, end) pair forwarded to
            ``read_preprocessing``.

    Returns:
        DataFrame with columns ``author_id`` and ``mean``.
    """
    # Number of reads per article inside the read period.
    read_regression = read_preprocessing(read_data, metadata, regression_read_period)
    read_regression = read_regression['article_id'].value_counts().reset_index()
    read_regression.columns = ['article_id', 'value_counts']

    # Registration date of each article published inside the meta period.
    meta_regression = meta_preprocessing(metadata, regresssion_meta_period, use_megazine=False, use_regdt=True)
    meta_regression = meta_regression[['article_id', 'reg_dt']]

    regression_best = pd.merge(read_regression, meta_regression, how='left', on='article_id')
    # Age of the article measured against the fixed reference 2019-03-01 00:00.
    regression_best['diff_reg_datetime'] = datetime(2019, 3, 1, 0, 0) - pd.to_datetime(regression_best['reg_dt'])
    # Articles without metadata in the meta period have no reg_dt and are dropped.
    regression_best = regression_best.dropna(axis=0)

    # Whole hours elapsed. The timedelta accessors are used directly: converting
    # a timedelta column with pd.to_datetime is rejected by current pandas.
    elapsed = regression_best['diff_reg_datetime']
    regression_best['diff_reg_hr'] = 24 * elapsed.dt.days + elapsed.dt.seconds // 3600
    # NOTE(review): diff_reg_hr == 0 yields an infinite rate — confirm this cannot occur.
    regression_best['count'] = regression_best['value_counts'] / regression_best['diff_reg_hr']

    # Mean reads-per-hour over each author's articles.
    regression_best['author_id'] = regression_best['article_id'].apply(lambda x: x.split('_')[0])
    regression_best = regression_best.groupby(['author_id'])['count'].agg(['mean']).reset_index()

    return regression_best

def regression_march(read_data, metadata, following_table,
                        meta_period=(datetime(2019, 3, 1), datetime(2019, 3, 15)), 
                        read_period=(20190214, 20190301),
                        regresssion_meta_period=(datetime(2019, 2, 1), datetime(2019, 3, 1)), 
                        regression_read_period=(20190201, 20190301)):
    """Rank articles of followed authors by a predicted read count.

    The per-author hourly read rate from ``get_regression_best`` is multiplied
    by each article's age in hours (measured against 2019-03-15 00:00) to get
    ``regression_cnt``; ``favor_cnt`` is how often the user read that author.

    Args:
        read_data: raw read log.
        metadata: article metadata.
        following_table: two-column frame, renamed positionally to
            ``user_id`` / ``author_id``.
        meta_period: (start, end) datetimes selecting the candidate articles.
        read_period: (start, end) pair selecting reads used for ``favor_cnt``.
        regresssion_meta_period: forwarded to ``get_regression_best``.
        regression_read_period: forwarded to ``get_regression_best``.

    Returns:
        DataFrame with columns ``user_id``, ``author_id``, ``article_id``,
        ``regression_cnt`` and ``favor_cnt``, sorted by ``favor_cnt`` then
        ``regression_cnt`` (both descending), with incomplete rows dropped.
    """
    regression_best = get_regression_best(read_data, metadata, regresssion_meta_period, regression_read_period)

    # Candidate articles; explicit copy so the new columns are not written into a slice.
    new_meta = meta_preprocessing(metadata, meta_period, use_megazine=False, use_regdt=True)
    new_meta = new_meta[['article_id', 'reg_dt']].copy()
    new_meta['reg_dt'] = pd.to_datetime(new_meta['reg_dt'])
    new_meta['diff_reg_datetime'] = datetime(2019, 3, 15, 0, 0) - new_meta['reg_dt']
    # Whole hours elapsed, taken straight from the timedelta accessors.
    elapsed = new_meta['diff_reg_datetime']
    new_meta['diff_reg_hr'] = 24 * elapsed.dt.days + elapsed.dt.seconds // 3600
    new_meta['author_id'] = new_meta['article_id'].apply(lambda x: x.split('_')[0])

    # Predicted reads = author's mean hourly rate * article age in hours.
    new_meta = pd.merge(new_meta, regression_best, how='left', on='author_id')
    new_meta['count'] = new_meta['mean'] * new_meta['diff_reg_hr']
    new_meta = new_meta[['article_id', 'author_id', 'count']]

    # Candidate articles for each user's followed authors.
    following = following_table.copy()
    following.columns = ['user_id', 'author_id']
    result = pd.merge(following, new_meta, how='left', on='author_id')

    # How many times each user read each author: ['user_id', 'author_id', 'count'].
    read = read_preprocessing(read_data, metadata, read_period)
    eda_table = get_how_many_read_by_eachuser(read)

    result = pd.merge(result, eda_table, how='left', on=['user_id', 'author_id'])
    # Positional rename; both inputs carry a 'count' column, so the merge suffixes them.
    result.columns = ['user_id', 'author_id', 'article_id', 'regression_cnt', 'favor_cnt']
    result = result.sort_values(by=['favor_cnt', 'regression_cnt'], ascending=[False, False])
    result = result.dropna(axis=0)

    return result

In [13]:
def make_timebased_best(readdata, metadata, read_period=(20190221, 20190301)):
    """Count reads per article in a sliding window of three consecutive ``from`` values.

    The start of ``read_period`` should be one day earlier than the first day
    that is actually wanted (see the ``[23:]`` slice below).

    Args:
        readdata: raw read log, forwarded to ``read_preprocessing``.
        metadata: article metadata, forwarded to ``read_preprocessing``.
        read_period: (start, end) pair forwarded to ``read_preprocessing``.

    Returns:
        DataFrame with columns ``article_id``, ``count`` and ``dt``, where
        ``dt`` is the middle ``from`` value of the window the counts belong to.

    Raises:
        ValueError: from ``pd.concat`` when no window produced a frame.
    """
    read = read_preprocessing(readdata, metadata, read_period=read_period)

    time_based_best = []
    # Skip the first 23 distinct `from` values — presumably the leading hours of
    # the extra day requested by the docstring; TODO confirm `from` is hourly.
    base_read = sorted(read['from'].unique())[23:]
    for window in itertools.zip_longest(base_read, base_read[1:], base_read[2:]):
        center = window[1]
        # zip_longest pads with None, so the final window has no centre value.
        # Such windows are reported and skipped, exactly as before, but without
        # raising/catching BaseException (which also hid unrelated errors).
        if not isinstance(center, np.int64):
            print(center)
            continue

        counts = read.loc[read['from'].isin(window), 'article_id'].value_counts().reset_index()
        counts.columns = ['article_id', 'count']
        counts['dt'] = center
        time_based_best.append(counts)

    return pd.concat(time_based_best)

In [14]:
def best_correction(readdata, metadata, read_period=(20190201, 20190301)):
    """Rank articles by reads per day since registration.

    Args:
        readdata: raw read log, forwarded to ``read_preprocessing``.
        metadata: article metadata with ``id`` and ``reg_dt`` columns.
        read_period: (start, end) pair forwarded to ``read_preprocessing``.

    Returns:
        DataFrame with columns ``article_id`` and ``count`` (reads divided by
        the article's age in days on 2019-03-01), sorted by ``count`` descending.
    """
    # Reads inside read_period.
    most_read_article = read_preprocessing(readdata, metadata, read_period=read_period)

    # Read count per article; built for use by the last (fallback) model.
    most_read_article = most_read_article['article_id'].value_counts().reset_index()
    most_read_article.columns = ['article_id', 'value_counts']

    # Only `id` (the key matching the read log's article_id) and `reg_dt` are
    # needed, so select them instead of copying the whole metadata frame.
    reg_dates = metadata[['id', 'reg_dt']].rename(columns={'id': 'article_id'})

    most_read_article = most_read_article.merge(reg_dates, on='article_id', how='left')
    # Age in whole days measured against the fixed reference 2019-03-01 00:00.
    most_read_article['dt'] = (datetime(2019, 3, 1, 0, 0) - pd.to_datetime(most_read_article['reg_dt'])).dt.days
    # NOTE(review): an age of 0 days yields an infinite rate — confirm this cannot occur.
    most_read_article['count'] = most_read_article['value_counts'] / most_read_article['dt']
    most_read_article = most_read_article.sort_values('count', ascending=False)
    most_read_article = most_read_article[['article_id', 'count']]
    return most_read_article

### Data Load

In [15]:
# Read log, one row per read; `dt` is `from` with its last two digits removed.
read_rowwise = pd.read_csv('res/read_rowwise_v2.csv')
read_rowwise['dt'] = (read_rowwise['from'] / 100).astype(int)
read_rowwise = read_rowwise.dropna()

users = pd.read_json('res/users.json', lines=True)
dev_user = pd.read_csv('res/predict/dev.users',header=None, names=['dev'])
test_user = pd.read_csv('res/predict/test.users',header=None, names=['test'])

# metadata.json is parsed once (it used to be loaded twice in a row).
metadata = pd.read_json('res/metadata.json', lines=True)
# reg_ts == 0 means the registration time is missing. Possibly inaccurate, but
# better than keeping 0 (i.e. 1970): borrow the reg_ts of the next article of
# the same author. article_id is not guaranteed to follow publication order,
# but it nearly does for magazines and tends to for ordinary posts.
metadata['reg_ts'] = metadata['reg_ts'].mask(metadata['reg_ts'] == 0)
metadata.sort_values(['user_id','article_id'], inplace=True)
metadata['reg_ts'] = metadata['reg_ts'].bfill()
# reg_ts is in milliseconds; fromtimestamp converts in the machine's local time zone.
metadata['reg_dt'] = metadata['reg_ts'].apply(lambda x : datetime.fromtimestamp(x/1000.0))
metadata['reg_dt'] = metadata['reg_dt'].dt.date

# Keep articles registered up to 2019-03-14; cutting at 03-01 gave worse results.
metadata = metadata.loc[pd.to_datetime(metadata['reg_dt']) <= datetime(2019, 3, 14)]
# Restore the original row order that the sort above changed.
metadata = metadata.sort_index()
metadata = metadata.reset_index(drop=True)

In [16]:
def chainer(s):
    """Flatten one level of nesting: return the items of every iterable in ``s`` as one list."""
    flattened = []
    for inner in s:
        flattened.extend(inner)
    return flattened

# Explode users' following lists into one row per (user, followed author):
# each user's id and keyword_list are repeated once per entry of following_list.
following_list_count = users['following_list'].map(len)
following_rowwise = pd.DataFrame({
    'user_id': users['id'].repeat(following_list_count),
    'keyword_list': users['keyword_list'].repeat(following_list_count),
    'following_id': chainer(users['following_list']),
})

#### Parameter 설정

# 데이터 전처리

현우님 모델은 크게 2가지로 나뉘는데<br>
1. 2/14 ~ 2/28일간에 읽은 정보를 사용한 모델
 - weekly->series->following_favor->following_favor1->various_user->read_favor->most_read
2. 읽은 정보가 없는 모델
 - weekly->series->following_favor->following_favor1->various_user->most_read

사실 둘이 같이써도 무방할 것 같다. 왜냐하면 어차피 없어서 추천을 못할거니까<br>


### 추천모델에 사용될 추천 frame 생성

In [17]:
# Following pairs (user_id, following_id) of the dev users only, re-indexed from 0.
dev_following_df = (
    following_rowwise
    .loc[following_rowwise['user_id'].isin(dev_user['dev']), ['user_id','following_id']]
    .reset_index(drop=True)
)

### Parameter

In [18]:
# Periods are (start, end) pairs: *_meta_period values are datetimes passed to
# the metadata preprocessing, *_read_period values are integer dates passed to
# the read preprocessing.
# `datetime` is used instead of `pd.datetime`, an alias deprecated in pandas 1.0
# and removed in pandas 2.0.
read_check_period = (20190101, 20190301)

weekly_meta_period=(datetime(2017, 7, 14, 0, 0), datetime(2019, 3, 15, 0, 0))
weekly_read_period=(20190207, 20190301)
weekly_series_count = 6

series_meta_period=(datetime(2019, 2, 1, 0, 0), datetime(2019, 3, 15, 0, 0))
series_read_period=(20190207, 20190301)
series_series_count = 7

dont_following_series_meta_period=(datetime(2019, 2, 1, 0, 0), datetime(2019, 3, 15, 0, 0))
dont_following_series_read_period=(20190214, 20190301)
dont_following_series_series_count = 7

dont_following_weekly_meta_period=(datetime(2017, 7, 14, 0, 0), datetime(2019, 3, 15, 0, 0))
dont_following_weekly_read_period=(20190214, 20190301)
dont_following_weekly_series_count = 6

following_meta_period=(datetime(2019, 2, 1, 0, 0), datetime(2019, 3, 15, 0, 0))
following_read_period=(20190207, 20190301)
following_favor_cutoff=0.05

dont_following_meta_period=(datetime(2019, 2, 1, 0, 0), datetime(2019, 3, 15, 0, 0))
dont_following_read_period=(20190214, 20190301) # not followed, so only the most recent preference should count
dont_following_favor_cutoff=0.05

variable_user_model_read_period = (20190214, 20190301)

regression_model_meta_period=(datetime(2019, 2, 28, 0, 0), datetime(2019, 3, 15, 0, 0))
regression_model_read_period=(20190214, 20190301)
regresssion_before_meta_period=(datetime(2019, 2, 1, 0, 0), datetime(2019, 2, 28, 0, 0))
regression_before_read_period=(20190201, 20190301)

best_read_period = (20190222, 20190301)
best_correction_read_period = (20190207, 20190301)
time_based_best_period = (20190221, 20190301)

In [19]:
# brunch model: a fixed list of two @brunch article ids offered as candidates.
brunch_notice = ['@brunch_153','@brunch_151']
brunch_table = pd.DataFrame({'article_id': brunch_notice})

# weekly model: candidate table built from the dev users' following pairs
# using the weekly_* parameters.
weekly_table = weekly_magazine_series(read_rowwise, 
                                      metadata, 
                                      dev_following_df, 
                                      meta_period=weekly_meta_period, 
                                      read_period=weekly_read_period, 
                                      series_count=weekly_series_count) # clear

# series model: magazine articles of followed authors, using the series_* parameters.
series_table = magazine_series(read_rowwise, 
                               metadata, 
                               dev_following_df,
                               meta_period=series_meta_period, 
                               read_period=series_read_period, 
                               series_count=series_series_count) # clear

# dont_following_magazine_series: magazine articles of authors the user read
# but does not follow.
dont_series_table = dont_following_magazine_series(read_rowwise, 
                                                   metadata, 
                                                   dev_following_df,
                                                   meta_period=dont_following_series_meta_period, 
                                                   read_period=dont_following_series_read_period, 
                                                   series_count=dont_following_series_series_count) 

# dont_following_weekly_series: weekly articles of authors the user read but
# does not follow (meta_period is accepted but not used by that function).
dont_weekly_table = dont_following_weekly_series(read_rowwise, 
                                                 metadata, 
                                                 dev_following_df,
                                                 meta_period=dont_following_weekly_meta_period, 
                                                 read_period=dont_following_weekly_read_period, 
                                                 series_count=dont_following_weekly_series_count) 

# best model: reads inside best_read_period
most_read_article = read_preprocessing(read_rowwise, metadata ,read_period=best_read_period) # 2/22 ~ 2/28

# Read count per article, most-read first; built for use by the last model.
most_read_article = most_read_article['article_id'].value_counts().reset_index()
most_read_article.columns = ['article_id','value_counts']

# Used to recommend per followed author: the author is the article_id prefix
# before the first '_', and `rank` numbers each author's articles 0, 1, 2, ...
# starting from the most-read one (rows are already in descending read order).
# cumcount is called directly rather than through agg({'cumcount'}), which
# returned a one-column frame that then had to be re-indexed.
most_read_article['author_id'] = most_read_article['article_id'].apply(lambda x : x.split('_')[0])
most_read_article['rank'] = most_read_article.groupby('author_id').cumcount()
most_read_article_author_rank = most_read_article.copy()

# following model, correction_type=0 (frame named "many read")
following_favor_many_read = following_favor_frame(read_rowwise, 
                                                  metadata, 
                                                  dev_following_df, 
                                                  most_read_article_author_rank, 
                                                  correction_type=0,
                                                  meta_period=following_meta_period,
                                                  read_period=following_read_period,
                                                  favor_cutoff=following_favor_cutoff) # clear

# following model, correction_type=1 (frame named "repeat read"); otherwise
# the same arguments as above
following_favor_repeat_read = following_favor_frame(read_rowwise, 
                                                  metadata, 
                                                  dev_following_df, 
                                                  most_read_article_author_rank, 
                                                  correction_type=1,
                                                  meta_period=following_meta_period,
                                                  read_period=following_read_period,
                                                  favor_cutoff=following_favor_cutoff) # clear

# The string literal below is disabled code (the dont_following_favor_frame model).
"""
# dont_following_favor_frame
dont_following_favor_each_read = dont_following_favor_frame(read_rowwise, 
                                                  metadata, 
                                                  dev_following_df, 
                                                  most_read_article_author_rank, 
                                                  correction_type=2,
                                                  meta_period=dont_following_meta_period,
                                                  read_period=dont_following_read_period,
                                                  favor_cutoff=dont_following_favor_cutoff)
"""

# variable model: built from the reads inside variable_user_model_read_period
read_temp = read_preprocessing(read_rowwise, metadata , read_period=variable_user_model_read_period)
variable_user = get_how_many_read_by_variableuser_article(read_temp) # clear

# user correction model: the helper returns two frames (read / dontread);
# it takes the dev users with the column renamed to user_id
dev_user_frame = dev_user.rename(columns={'dev':'user_id'})
read_user_correction, dontread_user_correction  = count_correlction_read_favor(read_rowwise, 
                                                                               metadata, 
                                                                               dev_user_frame,
                                                                               meta_period=following_meta_period,
                                                                               read_period=following_read_period,
                                                                               favor_cutoff=following_favor_cutoff) # clear

# regression march: predicted-read ranking of followed authors' articles
regression_march_table = regression_march(read_rowwise, metadata, dev_following_df,
                                meta_period = regression_model_meta_period,
                                read_period = regression_model_read_period,
                                regresssion_meta_period = regresssion_before_meta_period,
                                regression_read_period = regression_before_read_period)

# best read: independent copy of the most-read table built earlier in this cell
most_read_article_frame = most_read_article.copy() # clear

# read check: reads inside read_check_period; passed to BrunchRecommend in a later cell
read_check_frame = read_preprocessing(read_rowwise, metadata ,read_period=read_check_period)

# reads-per-day ranking, used by the final RandomBestRecommend model
best_correction_frame = best_correction(read_rowwise, metadata, read_period=best_correction_read_period)
# The string literal below is disabled code (time-based best model). As the
# last expression of the cell it is also what the notebook echoes as output.
"""
# time based best
timebased_best_user = read_preprocessing(read_rowwise, metadata , read_period=best_read_period)
timebased_best_user = timebased_best_user.loc[timebased_best_user['user_id'].isin(dev_user['dev'].values)].reset_index(drop=True)
timebased_best_time = make_timebased_best(read_rowwise, metadata, read_period=time_based_best_period)
"""

read


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


weekly_meta
magazine_table


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be 

read_user (7815123, 4)
dontread_user (128, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


"\n# time based best\ntimebased_best_user = read_preprocessing(read_rowwise, metadata , read_period=best_read_period)\ntimebased_best_user = timebased_best_user.loc[timebased_best_user['user_id'].isin(dev_user['dev'].values)].reset_index(drop=True)\ntimebased_best_time = make_timebased_best(read_rowwise, metadata, read_period=time_based_best_period)\n"

### Modeling

In [20]:
# One recommender per candidate frame. cutoff_recommend_count is passed to each
# recommender as its per-model cutoff; userbased_model / continous_read are
# CutoffRecommend options — NOTE(review): their exact meaning is defined in
# CutoffRecommend, confirm there.
weekly_model = CutoffRecommend(weekly_table, cutoff_recommend_count=10, userbased_model=True, continous_read=True)
series_model = CutoffRecommend(series_table, cutoff_recommend_count=10, userbased_model=True, continous_read=True)
dont_series_model = CutoffRecommend(dont_series_table, cutoff_recommend_count=10, userbased_model=True, continous_read=True)
dont_weekly_model = CutoffRecommend(dont_weekly_table, cutoff_recommend_count=10, userbased_model=True, continous_read=True)
following_favor_many_read_model = CutoffRecommend(following_favor_many_read, cutoff_recommend_count=26, userbased_model=True)
following_favor_repeat_read_model = CutoffRecommend(following_favor_repeat_read, cutoff_recommend_count=26, userbased_model=True)
# dont_following_favor_each_read_model = CutoffRecommend(dont_following_favor_each_read, cutoff_recommend_count=10, userbased_model=True)
variable_user_model = CutoffRecommend(variable_user, cutoff_recommend_count=4, userbased_model=False)
brunch_model = CutoffRecommend(brunch_table, cutoff_recommend_count=2, userbased_model=False)
regression_user_model = CutoffRecommend(regression_march_table, cutoff_recommend_count=21, userbased_model=True)
correction_favor_model = CutoffRecommend(read_user_correction, cutoff_recommend_count=30, userbased_model=True)
# timebased_best_model = TimebasedRecommend(timebased_best_user, timebased_best_time, cutoff_recommend_count=-1)
# cutoff_recommend_count=-1 means "no cutoff": RandomBestRecommend.recommend then
# returns up to 100 - before_recommend_count unread articles.
most_read_article_model = RandomBestRecommend(best_correction_frame, cutoff_recommend_count=-1)

In [21]:
# Run the recommendation for every distinct user_id in read_user_correction;
# read_check_frame holds the reads inside read_check_period.
brunch_recommend_read = BrunchRecommend(read_user_correction['user_id'].unique(), read_check_frame)
# NOTE(review): presumably the models are applied in list order, with
# most_read_article_model last as the fallback — confirm in BrunchRecommend.recommend.
read_model_list = [ weekly_model, series_model, dont_series_model, dont_weekly_model, following_favor_many_read_model, 
                   following_favor_repeat_read_model, variable_user_model, brunch_model, regression_user_model, 
                   correction_favor_model, most_read_article_model]
brunch_recommend_read.recommend(read_model_list)

HBox(children=(IntProgress(value=0, max=2872), HTML(value='')))

RandomBestRecommend
62
RandomBestRecommend
18
RandomBestRecommend
58
RandomBestRecommend
64
RandomBestRecommend
0
RandomBestRecommend
62
RandomBestRecommend
55
RandomBestRecommend
45
RandomBestRecommend
38
RandomBestRecommend
65
RandomBestRecommend
61
RandomBestRecommend
61
RandomBestRecommend
45
RandomBestRecommend
0
RandomBestRecommend
50
RandomBestRecommend
0
RandomBestRecommend
55
RandomBestRecommend
65
RandomBestRecommend
6
RandomBestRecommend
47
RandomBestRecommend
43
RandomBestRecommend
81
RandomBestRecommend
64
RandomBestRecommend
55
RandomBestRecommend
58
RandomBestRecommend
64
RandomBestRecommend
59
RandomBestRecommend
34
RandomBestRecommend
64
RandomBestRecommend
65
RandomBestRecommend
56
RandomBestRecommend
65
RandomBestRecommend
47
RandomBestRecommend
26
RandomBestRecommend
62
RandomBestRecommend
65
RandomBestRecommend
48
RandomBestRecommend
70
RandomBestRecommend
25
RandomBestRecommend
62
RandomBestRecommend
10
RandomBestRecommend
0
RandomBestRecommend
43
RandomBestRecomm

RandomBestRecommend
62
RandomBestRecommend
9
RandomBestRecommend
0
RandomBestRecommend
43
RandomBestRecommend
76
RandomBestRecommend
21
RandomBestRecommend
39
RandomBestRecommend
61
RandomBestRecommend
45
RandomBestRecommend
64
RandomBestRecommend
63
RandomBestRecommend
65
RandomBestRecommend
65
RandomBestRecommend
63
RandomBestRecommend
62
RandomBestRecommend
0
RandomBestRecommend
59
RandomBestRecommend
56
RandomBestRecommend
39
RandomBestRecommend
57
RandomBestRecommend
39
RandomBestRecommend
61
RandomBestRecommend
43
RandomBestRecommend
54
RandomBestRecommend
55
RandomBestRecommend
32
RandomBestRecommend
63
RandomBestRecommend
16
RandomBestRecommend
65
RandomBestRecommend
39
RandomBestRecommend
63
RandomBestRecommend
42
RandomBestRecommend
35
RandomBestRecommend
0
RandomBestRecommend
44
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
15
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
65
RandomBestRecommend
54
RandomBestRecommend
16
RandomBestRecommen

RandomBestRecommend
24
RandomBestRecommend
63
RandomBestRecommend
65
RandomBestRecommend
36
RandomBestRecommend
48
RandomBestRecommend
64
RandomBestRecommend
65
RandomBestRecommend
26
RandomBestRecommend
53
RandomBestRecommend
63
RandomBestRecommend
0
RandomBestRecommend
51
RandomBestRecommend
34
RandomBestRecommend
58
RandomBestRecommend
56
RandomBestRecommend
59
RandomBestRecommend
0
RandomBestRecommend
64
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
0
RandomBestRecommend
32
RandomBestRecommend
61
RandomBestRecommend
86
RandomBestRecommend
54
RandomBestRecommend
48
RandomBestRecommend
47
RandomBestRecommend
65
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
64
RandomBestRecommend
44
RandomBestRecommend
27
RandomBestRecommend
37
RandomBestRecommend
46
RandomBestRecommend
65
RandomBestRecommend
33
RandomBestRecommend
65
RandomBestRecommend
0
RandomBestRecommend
51
RandomBestRecommend
52
RandomBestRecommend
31
RandomBestRecommend
59
RandomBestRecomme

RandomBestRecommend
64
RandomBestRecommend
8
RandomBestRecommend
0
RandomBestRecommend
1
RandomBestRecommend
15
RandomBestRecommend
63
RandomBestRecommend
60
RandomBestRecommend
46
RandomBestRecommend
62
RandomBestRecommend
2
RandomBestRecommend
57
RandomBestRecommend
59
RandomBestRecommend
42
RandomBestRecommend
10
RandomBestRecommend
61
RandomBestRecommend
0
RandomBestRecommend
62
RandomBestRecommend
59
RandomBestRecommend
50
RandomBestRecommend
18
RandomBestRecommend
0
RandomBestRecommend
21
RandomBestRecommend
15
RandomBestRecommend
65
RandomBestRecommend
39
RandomBestRecommend
59
RandomBestRecommend
51
RandomBestRecommend
65
RandomBestRecommend
42
RandomBestRecommend
46
RandomBestRecommend
63
RandomBestRecommend
64
RandomBestRecommend
39
RandomBestRecommend
65
RandomBestRecommend
57
RandomBestRecommend
46
RandomBestRecommend
23
RandomBestRecommend
65
RandomBestRecommend
35
RandomBestRecommend
61
RandomBestRecommend
12
RandomBestRecommend
64
RandomBestRecommend
85
RandomBestRecomme

RandomBestRecommend
0
RandomBestRecommend
24
RandomBestRecommend
64
RandomBestRecommend
65
RandomBestRecommend
13
RandomBestRecommend
29
RandomBestRecommend
0
RandomBestRecommend
52
RandomBestRecommend
31
RandomBestRecommend
0
RandomBestRecommend
62
RandomBestRecommend
0
RandomBestRecommend
74
RandomBestRecommend
65
RandomBestRecommend
0
RandomBestRecommend
34
RandomBestRecommend
56
RandomBestRecommend
63
RandomBestRecommend
61
RandomBestRecommend
64
RandomBestRecommend
61
RandomBestRecommend
0
RandomBestRecommend
15
RandomBestRecommend
65
RandomBestRecommend
63
RandomBestRecommend
64
RandomBestRecommend
63
RandomBestRecommend
0
RandomBestRecommend
61
RandomBestRecommend
23
RandomBestRecommend
62
RandomBestRecommend
34
RandomBestRecommend
40
RandomBestRecommend
0
RandomBestRecommend
45
RandomBestRecommend
50
RandomBestRecommend
64
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
46
RandomBestRecommend
64
RandomBestRecommend
50
RandomBestRecommend
15
RandomBestRecommend


RandomBestRecommend
65
RandomBestRecommend
59
RandomBestRecommend
37
RandomBestRecommend
52
RandomBestRecommend
40
RandomBestRecommend
89
RandomBestRecommend
65
RandomBestRecommend
57
RandomBestRecommend
0
RandomBestRecommend
58
RandomBestRecommend
65
RandomBestRecommend
63
RandomBestRecommend
64
RandomBestRecommend
59
RandomBestRecommend
38
RandomBestRecommend
47
RandomBestRecommend
63
RandomBestRecommend
65
RandomBestRecommend
9
RandomBestRecommend
43
RandomBestRecommend
63
RandomBestRecommend
12
RandomBestRecommend
65
RandomBestRecommend
59
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
5
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
31
RandomBestRecommend
65
RandomBestRecommend
63
RandomBestRecommend
9
RandomBestRecommend
60
RandomBestRecommend
5
RandomBestRecommend
63
RandomBestRecommend
23
RandomBestRecommend
65
RandomBestRecommend
50
RandomBestRecommend
60
RandomBestRecommend
61
RandomBestRecommend
79
RandomBestRecommend
0
RandomBestRecommend


RandomBestRecommend
46
RandomBestRecommend
0
RandomBestRecommend
44
RandomBestRecommend
48
RandomBestRecommend
0
RandomBestRecommend
63
RandomBestRecommend
55
RandomBestRecommend
65
RandomBestRecommend
62
RandomBestRecommend
24
RandomBestRecommend
64
RandomBestRecommend
0
RandomBestRecommend
55
RandomBestRecommend
41
RandomBestRecommend
65
RandomBestRecommend
59
RandomBestRecommend
22
RandomBestRecommend
56
RandomBestRecommend
39
RandomBestRecommend
59
RandomBestRecommend
62
RandomBestRecommend
52
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
23
RandomBestRecommend
65
RandomBestRecommend
30
RandomBestRecommend
19
RandomBestRecommend
64
RandomBestRecommend
11
RandomBestRecommend
60
RandomBestRecommend
60
RandomBestRecommend
42
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
48
RandomBestRecommend
3
RandomBestRecommend
58
RandomBestRecommend
2
RandomBestRecommend
0
RandomBestRecommend
55
RandomBestRecommend
47


RandomBestRecommend
48
RandomBestRecommend
63
RandomBestRecommend
16
RandomBestRecommend
61
RandomBestRecommend
52
RandomBestRecommend
63
RandomBestRecommend
43
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
65
RandomBestRecommend
65
RandomBestRecommend
17
RandomBestRecommend
7
RandomBestRecommend
16
RandomBestRecommend
58
RandomBestRecommend
42
RandomBestRecommend
17
RandomBestRecommend
50
RandomBestRecommend
59
RandomBestRecommend
62
RandomBestRecommend
65
RandomBestRecommend
65
RandomBestRecommend
53
RandomBestRecommend
0
RandomBestRecommend
0
RandomBestRecommend
65
RandomBestRecommend
42
RandomBestRecommend
0
RandomBestRecommend
41
RandomBestRecommend
42
RandomBestRecommend
59
RandomBestRecommend
59
RandomBestRecommend
63
RandomBestRecommend
61
RandomBestRecommend
64
RandomBestRecommend
53
RandomBestRecommend
63
RandomBestRecommend
65
RandomBestRecommend
65
RandomBestRecommend
35
RandomBestRecommend
0
RandomBestRecommend
59
RandomBestRecommend
0
RandomBestRecommen

In [22]:
# Build the recommender for the users listed in dontread_user_correction.
# It receives read_check_frame plus the all_read_set already held by
# brunch_recommend_read.
brunch_recommend_dontread = BrunchRecommend(dontread_user_correction['user_id'].unique(), 
                                            read_check_frame, 
                                            brunch_recommend_read.all_read_set)
# Cutoff model over variable_user with cutoff_recommend_count=9 and userbased_model=False.
variable_user_model = CutoffRecommend(variable_user, cutoff_recommend_count=9, userbased_model=False)
# Models handed to recommend(). Presumably they are consulted in list order
# until each user has 100 recommendations -- TODO confirm against
# BrunchRecommend.recommend, which is not visible in this part of the notebook.
read_model_list = [weekly_model, series_model, dont_series_model, dont_weekly_model, following_favor_many_read_model, 
                   following_favor_repeat_read_model, brunch_model, variable_user_model, regression_user_model, 
                   most_read_article_model]
brunch_recommend_dontread.recommend(read_model_list)

HBox(children=(IntProgress(value=0, max=128), HTML(value='')))

RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestRecommend
89
RandomBestR

In [23]:
# Collect the result frame from each of the two recommenders and stack them
# row-wise (axis=0) into a single frame.
data_frame_up = brunch_recommend_read.make_result_frame()
data_frame_down = brunch_recommend_dontread.make_result_frame()
data_frame = pd.concat([data_frame_up,data_frame_down],axis=0)
# Display the combined shape as a sanity check; the recorded run shows (3000, 101).
# Presumably 101 columns = user_id + 100 recommended articles -- TODO confirm.
data_frame.shape

(3000, 101)

In [24]:
# Rename the 'dev' column to 'user_id' so it can be used as the merge key.
dev_user_frame = dev_user.rename(columns={'dev':'user_id'})
# Left merge keeps every row of dev_user_frame; users without a match in
# data_frame would end up with NaN recommendation columns.
sub = dev_user_frame.merge(data_frame, on='user_id', how='left')
# Write the submission as space-separated values with no header and no index column.
sub.to_csv('./submission/recommend_2.txt',index=False,header=False,sep=' ')

### Test

In [None]:
# Rebuild the result frame of brunch_recommend_read (overwrites the earlier data_frame_up).
data_frame_up = brunch_recommend_read.make_result_frame()

In [None]:
# Load a submission file with the default separator and no header row.
# NOTE(review): if recommend.txt is space-separated like recommend_2.txt, each
# whole line lands in column 0 -- confirm against the file.
h_best = pd.read_csv('./submission/recommend.txt',header=None)
# Column 0 is renamed to 'user_id'; a later cell splits this column on spaces,
# so it is expected to hold more than just the id.
h_best = h_best.rename(columns={0:'user_id'})

In [None]:
# Show rows whose 'user_id' string does not split (on single spaces) into exactly
# 101 distinct tokens, i.e. rows with duplicated or missing/extra entries.
# Presumably 101 = user id + 100 article ids -- TODO confirm.
h_best[h_best['user_id'].apply(lambda x: len(set(x.split(' ')))) != 101]

In [2]:
# Snapshot the entry names in res/read/ (os.listdir gives no ordering guarantee).
a = os.listdir('res/read/')

In [31]:
# Second snapshot of res/read/, taken in a later cell run, for comparison with `a`.
b = os.listdir('res/read/')

In [3]:
# Display the first directory listing.
a

['2018112114_2018112115',
 '2019021003_2019021004',
 '2019011001_2019011002',
 '2019022219_2019022220',
 '2018110409_2018110410',
 '2019010610_2019010611',
 '2019010921_2019010922',
 '2018101217_2018101218',
 '2019021112_2019021113',
 '2018111807_2018111808',
 '2019011718_2019011719',
 '2018122417_2018122418',
 '2019022320_2019022321',
 '2019021619_2019021620',
 '2019021811_2019021812',
 '2019011813_2019011814',
 '2018122904_2018122905',
 '2019022403_2019022404',
 '2019012401_2019012402',
 '2018111514_2018111515',
 '2018100915_2018100916',
 '2018113009_2018113010',
 '2018110305_2018110306',
 '2018102617_2018102618',
 '2019022512_2019022513',
 '2019011902_2019011903',
 '2018121017_2018121018',
 '2019012318_2019012319',
 '2018120216_2018120217',
 '2019021720_2019021721',
 '2018110910_2018110911',
 '2019020415_2019020416',
 '2018102903_2018102904',
 '2018101511_2018101512',
 '2018100300_2018100301',
 '2018120323_2018120400',
 '2018122311_2018122312',
 '2018112313_2018112314',
 '2019021204

In [33]:
# Display the second directory listing.
b

['2018112114_2018112115',
 '2019021003_2019021004',
 '2019011001_2019011002',
 '2019022219_2019022220',
 '2018110409_2018110410',
 '2019010610_2019010611',
 '2019010921_2019010922',
 '2018101217_2018101218',
 '2019021112_2019021113',
 '2018111807_2018111808',
 '2019011718_2019011719',
 '2018122417_2018122418',
 '2019022320_2019022321',
 '2019021619_2019021620',
 '2019021811_2019021812',
 '2019011813_2019011814',
 '2018122904_2018122905',
 '2019022403_2019022404',
 '2019012401_2019012402',
 '2018111514_2018111515',
 '2018100915_2018100916',
 '2018113009_2018113010',
 '2018110305_2018110306',
 '2018102617_2018102618',
 '2019022512_2019022513',
 '2019011902_2019011903',
 '2018121017_2018121018',
 '2019012318_2019012319',
 '2018120216_2018120217',
 '2019021720_2019021721',
 '2018110910_2018110911',
 '2019020415_2019020416',
 '2018102903_2018102904',
 '2018101511_2018101512',
 '2018100300_2018100301',
 '2018120323_2018120400',
 '2018122311_2018122312',
 '2018112313_2018112314',
 '2019021204

In [34]:
# os.listdir() returns entries in arbitrary order, so compare the two listings
# order-insensitively: this checks that both snapshots contain the same names
# rather than that the OS happened to return them in the same sequence.
sorted(a) == sorted(b)

True