# 深度學習第一次競賽報告 - 第12組 工具人智慧
組員：
<br/>107024501 高瑀鍹
<br/>107024506 王子誠
<br/>107024511 羅揚
<br/>107024522 戴子翔

In [None]:
# Loading Package
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import string

import gc
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stop = stopwords.words('english')

%matplotlib inline

import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression, RidgeClassifier
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm
import lightgbm as lgb
from  xgboost import XGBClassifier
import os

In [None]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace after trimming both ends.

    An empty or all-whitespace string yields ``['']`` (one empty token), not
    an empty list, because ``re.split`` always returns at least one element.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.split(r'\s+', text.strip())

def tokenizer_stem(text):
    """Whitespace-tokenise ``text`` and Porter-stem the tokens that start with a letter.

    Tokens whose first character is not an ASCII letter (numbers, leftover
    punctuation, the empty token of an empty string) are dropped.
    """
    porter = PorterStemmer()

    # re.match anchors at the start of the token, so this keeps tokens that
    # BEGIN with an ASCII letter, not tokens that merely contain one.
    starts_with_letter = re.compile(r'[a-zA-Z]+').match
    return [porter.stem(word)
            for word in re.split(r'\s+', text.strip())
            if starts_with_letter(word)]

def tokenizer_stem_nostop(text):
    """Like ``tokenizer_stem`` but also drops tokens found in the module-level ``stop`` list.

    The stop-word check is applied to the raw token, before stemming.
    """
    porter = PorterStemmer()
    # Anchored at the token start: keeps tokens that begin with an ASCII letter.
    starts_with_letter = re.compile(r'[a-zA-Z]+').match
    return [porter.stem(w)
            for w in re.split(r'\s+', text.strip())
            if w not in stop and starts_with_letter(w)]

def preprocessor_ta(text):
    """Strip HTML, lower-case, and collapse non-word characters to single spaces.

    Emoticons matched by the pattern below are removed from the body and
    appended (without their '-' nose) after the cleaned text, so the result
    always contains at least the separating space.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Raw string: '\)' and '\(' are invalid escapes in a plain literal.
    emoticon_re = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_re, text)
    text = re.sub(emoticon_re, '', text)
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

In [None]:
# Load the raw train / test tables; the "Id" column is dropped right away
# so it never reaches the feature-engineering steps below.
df_train = pd.read_csv("./data/original/train.csv").drop(columns=["Id"])
df_test = pd.read_csv("./data/original/test.csv").drop(columns=["Id"])

In [None]:
# Second, untouched copy of the same CSV files: the raw HTML pages and the
# label are kept as plain Series for the cells that parse the pages directly.
df_temp = pd.read_csv('./data/original/train.csv')
df_test_temp = pd.read_csv('./data/original/test.csv')

X_train, y_train = df_temp['Page content'], df_temp['Popularity']
X_test = df_test_temp['Page content']

# Preprocessing

## Cleaning Text Data

首先，我們先將文字分成標題 (title)、內容 (content)、主題 (topics)，因為三者在文字中的重要性不同，因此先將三者分開。
<br/> 獲得文字後，我將文字從html格式取出、換成小寫、去除標點符號與stopword。此外，我發現有部分文章中會出現超連結，這些超連結轉為文字資料後意義不大，因此我也將這些連結移除。

根據觀察，部分文章的內容經過前處理後，會出現如script、js、id、src等無意義詞彙，這是因為文字內出現以&lt;script&gt;為開頭的javascript程式碼，但此程式碼並不會被BeautifulSoup刪除，因此必須額外處理。
<br/> 有趣的是，我發現開頭為&lt;script ...&gt;的javascript程式碼會被BeautifulSoup刪除，因此不須額外處理。

In [None]:
def getPoisition(text,pattern):
    """Return the start offsets of every non-overlapping match of ``pattern`` in ``text``.

    ``pattern`` is compiled as a regular expression, so callers passing a
    literal that contains metacharacters (e.g. '.') must escape it themselves.
    Returns an empty list when there is no match.
    """
    return [m.start() for m in re.finditer(pattern, text)]

def get_article_content(text):
    """Return the raw HTML between the first ``<section class="...">`` tag and the next ``</article>``.

    Raises IndexError when any of the three markers is missing.
    """
    section_open = '<section class="'
    after_open = text[getPoisition(text, section_open)[0] + len(section_open):]

    # Skip the rest of the opening tag: the quote closing the class
    # attribute plus the character right after it.
    body = after_open[getPoisition(after_open, '"')[0] + 2:]

    return body[:getPoisition(body, "</article>")[0]]

def get_title(text):
    """Return the text between the first ``<h1 class="title">`` and the following ``</h1>``.

    Raises IndexError when either marker is missing.
    """
    marker = '<h1 class="title">'
    remainder = text[getPoisition(text, marker)[0] + len(marker):]
    return remainder[:getPoisition(remainder, '</h1>')[0]]

def get_foot_topic(text, isN=False):
    """Extract the topic labels listed in the ``<footer class="article-topics">`` block.

    Parameters
    ----------
    text : str
        Raw HTML of the page.
    isN : bool
        When True return the number of topics, otherwise the topic labels
        joined by single spaces.

    A page without the topics footer has no topics ('' / 0).
    """
    opener = '<footer class="article-topics">'
    begin = text.find(opener)

    topics = []
    if begin != -1:
        begin += len(opener)
        # Look for the closing tag AFTER the opener; another <footer> earlier
        # in the page must not truncate the slice.
        end = text.find('</footer>', begin)
        footer_text = text[begin:] if end == -1 else text[begin:end]

        # Each topic is the anchor text between '/">' (end of the href) and '</a>'.
        link_starts = [m.start() for m in re.finditer('/">', footer_text)]
        link_ends = [m.start() for m in re.finditer('</a>', footer_text)]
        # zip() stops at the shorter list, so an unbalanced anchor cannot raise.
        topics = [footer_text[s + 3:e] for s, e in zip(link_starts, link_ends)]

    if isN:
        return len(topics)
    return " ".join(topics)

In [None]:
def remove_url(text):
    """Replace every ``http:`` / ``https:`` URL in ``text`` with a single space.

    A URL runs from the scheme up to (not including) the next whitespace
    character or the end of the text, so a URL that is the last token no
    longer raises. Text without URLs is returned unchanged.
    """
    return re.sub(r'https?:\S*', ' ', text)

def preprocessing(text,isbs4=False):
    """Normalise a piece of text into space-separated lower-case tokens.

    Parameters
    ----------
    text : str
        Raw text, or raw HTML when ``isbs4`` is True.
    isbs4 : bool
        When True, strip the HTML markup with BeautifulSoup and remove URLs
        before the text cleaning.

    Punctuation is replaced by spaces, then stop words (module-level
    ``stop``), purely numeric tokens and single ASCII letters are dropped.
    """
    if isbs4:
        text = BeautifulSoup(text, 'html.parser').get_text()
        text = remove_url(text)

    # Raw strings: '\w', '\s', '\)' and '\(' are invalid escapes in plain literals.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Applied after the punctuation strip and before lower-casing.
    text = re.sub(r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)', ' ', text)
    text = text.lower()

    # One filtering pass; str.split() already collapses whitespace runs.
    stop_words = set(stop)
    single_letters = set(string.ascii_lowercase)
    kept = [tok for tok in text.split()
            if tok not in stop_words
            and not tok.isdigit()
            and tok not in single_letters]

    return " ".join(kept)

In [None]:
def remove_js(text):
    """Replace every ``<script>...</script>`` span (attribute-less opening tag) with a space.

    Text without such a tag is returned unchanged. A ``<script>`` that is
    never closed is removed up to the end of the text instead of raising.
    """
    # Lazy match up to the nearest closing tag, or to end-of-text if none.
    return re.sub(r'<script>.*?(?:</script>|\Z)', ' ', text, flags=re.DOTALL)

## Feature Engineering

我接著在文字資料上定義了許多不同的Feature。

### data channel、article type、日期
我定義了data channel，因為這表示文章在網站中被分類的主題，能夠代表文章的方向。我也定義了article type，這能夠代表文章的類型。
<br/>我也有將時間抓下，用以進行之後的處理。

In [None]:
def get_data_channel(text):
    """Return the quoted value that follows the first ``data-channel`` occurrence.

    Raises IndexError when ``data-channel`` or its two quotes are missing.
    """
    tail = text[getPoisition(text, "data-channel")[0]:]

    # The value sits between the first two double quotes after the attribute name.
    quotes = getPoisition(tail, '"')
    return tail[quotes[0] + 1:quotes[1]]

def get_article_type(text):
    """Return the class attribute value of the first ``<section class="...">`` tag.

    Raises IndexError when the tag or its closing quote is missing.
    """
    marker = '<section class="'
    remainder = text[getPoisition(text, marker)[0] + len(marker):]
    return remainder[:getPoisition(remainder, '"')[0]]

def get_date(text):
    """Return the first two space-separated fields of the ``<time datetime=...>`` element text.

    For element text such as ``2013-06-19 15:04:30 UTC`` this is
    ``'2013-06-19 15:04:30'``. Returns ``np.nan`` when the page has no
    ``<time datetime=`` tag.
    """
    tag_start = text.find('<time datetime=')
    if tag_start == -1:
        return np.nan

    # Find the '>' that closes the opening tag rather than assuming the
    # datetime attribute always has the same width.
    tag_end = text.find('>', tag_start)
    if tag_end == -1:
        return np.nan

    body = text[tag_end + 1:]
    return ' '.join(body.split(' ', 2)[:2])

### html架構
我也有針對html檔的架構定義Feature，包含p、div、h1、h2、js，因為這些東西可能涵蓋不同的段落、小標等資訊。若一個版面段落沒有分好，會影響到讀者看這篇文章的心情，因此我們加入一些和文章結構有關的變數。

In [None]:
def n_p(text):
    """Count bare ``<p>`` opening tags in ``text`` (tags with attributes are not counted)."""
    return text.count("<p>")

def n_div(text):
    """Count occurrences of ``<div`` (opening div tags, with or without attributes)."""
    return text.count("<div")

def n_h1(text):
    """Count occurrences of ``<h1`` in ``text``."""
    return text.count("<h1")

def n_h2(text):
    """Count occurrences of ``<h2`` in ``text``."""
    return text.count("<h2")

def n_js(text):
    """Count script blocks by counting ``</script>`` closing tags."""
    return text.count('</script>')

### 文章內容中的資訊
我也定義了一些可能與文章內容有關的Feature，如圖片、youtube、twitter、社群網路、連結、see also個數。我認為這些Feature可能能夠包含一些文章的內容架構。
<br/>此外，我在處理主題文字時，我發現有部分主題可能包含不只一個字，因此我認為將主題個數納入考量也是重要的。

In [None]:
def n_link(text):
    """Count absolute links: ``href`` attributes starting with ``http:`` or ``https:``."""
    return text.count('href="https:') + text.count('href="http:')

def n_image(text,isClass=True):
    """Count images in ``text``.

    Parameters
    ----------
    text : str
        Raw HTML.
    isClass : bool
        True: count ``class="image"`` attributes.
        False: count literal ``.jpg`` / ``.png`` / ``.jpeg`` occurrences.
    """
    if isClass:
        return text.count('class="image"')

    # Literal matching: the dot must be a real dot, not "any character".
    return text.count('.jpg') + text.count('.png') + text.count('.jpeg')

def n_youtube(text):
    """Count literal occurrences of ``www.youtube.com`` (used as a video count)."""
    # Literal count so the dots are not treated as regex wildcards.
    return text.count('www.youtube.com')

def n_twitter(text):
    """Count embedded tweets: occurrences of ``class="twitter-tweet``."""
    return text.count('class="twitter-tweet')

def n_social_media(text):
    """Count literal mentions of youtube.com, twitter.com and facebook.com combined."""
    # Literal counts so the dots are not treated as regex wildcards.
    domains = ('youtube.com', 'twitter.com', 'facebook.com')
    return sum(text.count(domain) for domain in domains)

def n_see_also(text):
    """Count "see also" markers in the four capitalisations the pages are checked for."""
    variants = ("SEE ALSO", "see also", "See also", "See Also")
    return sum(text.count(variant) for variant in variants)

def out_url(text,rType):
    """Statistics about the bare ``<ul>`` lists in ``text``.

    Parameters
    ----------
    text : str
        Raw HTML.
    rType : str
        "n_ul"    -> number of ``<ul>`` opening tags.
        "n_link"  -> total absolute links (see ``n_link``) inside the lists,
                     pairing the i-th ``<ul>`` with the i-th ``</ul>``.
        "n_token" -> placeholder, always 0.

    Returns None for any other ``rType``.
    """
    if rType=="n_ul":
        return text.count("<ul>")

    if rType == "n_link":
        tpoi_start = getPoisition(text,"<ul>")
        tpoi_end = getPoisition(text,"</ul>")
        # zip() stops at the shorter list, so an unclosed <ul> cannot raise.
        return sum(n_link(text[start:end]) for start, end in zip(tpoi_start, tpoi_end))

    if rType=="n_token":
        return 0

    return None

In [None]:
# Derive the text columns and the HTML-structure counts from the raw page,
# then drop the raw page. Columns are created in a fixed order.
for df in [df_train,df_test]:
    page = df["Page content"]
    # Extract the article body (and its script-free version) once per row
    # instead of re-running the extraction for every feature that needs it.
    article = page.apply(get_article_content)
    article_no_js = article.apply(remove_js)

    df["content"] = article_no_js.apply(lambda x:preprocessing(x,isbs4=True))
    df["title"] = page.apply(lambda x:preprocessing(get_title(x),isbs4=False))
    df["topics"] = page.apply(lambda x:preprocessing(get_foot_topic(x,isN=False),isbs4=False))

    df["n_topics"] = page.apply(lambda x:get_foot_topic(x,isN=True))

    df["data_channel"] = page.apply(get_data_channel)
    df["article_type"] = page.apply(get_article_type)

    df["datetime"] = page.apply(get_date)
    df["n_link_all"] = page.apply(lambda x:n_link(remove_js(x)))
    df["n_link_content"] = article_no_js.apply(n_link)
    df["n_js_all"] = page.apply(n_js)
    df["n_js_content"] = article.apply(n_js)

    df["n_image"] = page.apply(lambda x:n_image(x,isClass=True))
    df["n_youtube"] = page.apply(n_youtube)
    df["n_twitter"] = page.apply(n_twitter)
    df["n_social_media"] = page.apply(n_social_media)

    df["n_p"] = article.apply(n_p)
    df["n_div"] = article.apply(n_div)
    df["n_h1"] = article.apply(n_h1)
    df["n_h2"] = article.apply(n_h2)

    df["n_see_also"] = page.apply(n_see_also)
    df["n_ul"] = article.apply(lambda x:out_url(x,rType="n_ul"))
    df["n_ul_link"] = article.apply(lambda x:out_url(x,rType="n_link"))

    df.drop(["Page content"],axis=1,inplace=True)

### 標題、內容、標題+內容的字數
我首先定義了標題、內容、標題+內容的長度、字數、每個字的長度，我認為前兩者可以代表文章本身的長短，每個字的長度則可測量作者的寫作能力。我接著定義了標題與標題+內容在長度、字數、每個字的長度上的比例，用以標示標題在整個頁面中的分量。
<br/>另外，我也定義了內容中是否有出現breaking news的字樣，因為breaking news通常會是重點新聞，可能會影響文章熱度。
<br/>我接著定義了標題字數、長度以及每個標題的平均字數與長度。這是用來標示標題的文字多寡，我認為這可能與作者是否花心思寫有關，進而影響到人氣。

In [None]:
def is_breaking_news(text):
    """Return 1 when ``text`` contains the exact phrase "breaking news", else 0."""
    return int("breaking news" in text)

# Length / token-count features for title, content and the whole page.
for df in [df_train, df_test]:
    df["title_len"] = df["title"].apply(len)
    df["title_n_item"] = df["title"].apply(lambda x: len(tokenizer(x)))
    df["title_avg_len"] = df["title_len"]/df["title_n_item"]

    df["content_len"] = df["content"].apply(len)
    df["content_n_item"] = df["content"].apply(lambda x: len(tokenizer(x)))
    df["content_avg_len"] = df["content_len"]/df["content_n_item"]

    df["page_len"] = df["title_len"] +df["content_len"]
    df["page_n_item"] = df["title_n_item"]+df["content_n_item"]
    df["page_avg_len"] = df["page_len"]/df["page_n_item"]

    # The "*_ratio" columns are page / title, so they are >= 1 and become
    # inf when the title is empty.
    df["title_len_ratio"] = df["page_len"]/df["title_len"]
    df["title_n_item_ratio"] = df["page_n_item"]/df["title_n_item"]
    df["title_avg_len_ratio"] = df["page_avg_len"]/df["title_avg_len"]

    df["topics_n_word"] = df["topics"].apply(lambda x: len(x.split()))
    df["topics_len"] = df["topics"].apply(lambda x: len(x.replace(" ","")))

    # 0/0 (no topics) gives NaN; fill it on the expression itself rather than
    # with a chained inplace call, which is deprecated and does nothing under
    # pandas copy-on-write.
    df["topics_avg_word"] = (df["topics_n_word"]/df["n_topics"]).fillna(0)
    df["topics_avg_len"] = (df["topics_len"]/df["n_topics"]).fillna(0)

    df["is_breaking_news"] = df["content"].apply(is_breaking_news)
    df["link_per_ul"] = (df["n_ul_link"]/df["n_ul"]).fillna(0)

In [None]:
## Page size, title length, link / paragraph / video counts and channel dummies
## (written by a different team member than the helper functions above)

def _scan_pages(pages):
    """Parse every HTML page once and collect its raw per-page statistics.

    Returns six parallel lists: size proxy, <h1> string, number of <a> and
    <p> tags inside <article>, number of <iframe> tags, and the article's
    data-channel attribute.
    """
    sizes, titles, n_links, n_paras, n_videos, channels = [], [], [], [], [], []
    for html in pages:
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): `prettify` is not called, so this is the length of the
        # bound method's string form rather than of the prettified markup.
        # Kept as-is so the feature values do not shift; confirm the intent.
        sizes.append(len(str(soup.prettify)))
        titles.append(soup.h1.string)
        n_links.append(len(soup.article.find_all(['a'])))
        n_paras.append(len(soup.article.find_all(['p'])))
        n_videos.append(len(soup.find_all("iframe")))
        channels.append(soup.article.attrs['data-channel'])
    return sizes, titles, n_links, n_paras, n_videos, channels

word, title, link, p, video, _channels = _scan_pages(X_train)
word_test, title_test, link_test, p_test, video_test, _channels_test = _scan_pages(X_test)

# Number of stemmed, non-stop-word tokens in each title.
title1 = [len(tokenizer_stem_nostop(preprocessor_ta(t))) for t in title]
title1_test = [len(tokenizer_stem_nostop(preprocessor_ta(t))) for t in title_test]

# One-hot encode the channel over train + test together so both halves share
# the same dummy columns, then split back by position.
X = pd.concat([X_train,X_test],axis = 0)
Article = pd.get_dummies(_channels + _channels_test)
article = Article.iloc[:len(X_train), :]
# Restart the test half's index at 0: the later element-wise products with the
# 0-based test dummy frames align on index labels, not on position.
article_test = Article.iloc[len(X_train):, :].reset_index(drop=True)

### 時間相關變數
對於時間方面，我找出文章的年、月、日、星期幾、是否為週末、小時、是否為早午晚凌晨、文章發布日至指定日期的天數。
<br/>因為文章所發布的熱度很有可能與時間背景或是不同時間的上網人數有關，因此時間變數可能影響人氣。
<br/>此外，因為testing data中存在一筆時間為missing，因此我將文章發布日至指定日期的天數使用平均值補值，並以此推算年、月、日、星期幾、是否為週末。小時等資訊則用平均值補值。

In [None]:
def cal_back_up_to_now(df,index_m):
    df.loc[index_m,"up_to_now"] = df.loc[~index_m,"up_to_now"].mean().round()
    df.loc[index_m,"hour"] = df.loc[~index_m,"hour"].mean().round()
    df.loc[index_m,"datetime"] =  datetime.datetime(2019,10,18)-datetime.timedelta(days=int(df.loc[index_m,"up_to_now"]))
    
    df.loc[index_m,"year"] = df.loc[index_m,"datetime"].dt.year
    df.loc[index_m,"month"] = df.loc[index_m,"datetime"].dt.month
    df.loc[index_m,"day"] = df.loc[index_m,"datetime"].dt.day
    df.loc[index_m,"dayofweek"] = df.loc[index_m,"datetime"].dt.dayofweek+1
    df.loc[index_m,"weekend"]=(df.loc[index_m,"dayofweek"]>=6).astype(int)
    
    df.loc[index_m,"is_night"] = (df.loc[index_m,'hour']<=6).astype(int)
    df.loc[index_m,"is_morning"] = ((df.loc[index_m,'hour']>6) & (df.loc[index_m,'hour']<=12)).astype(int)
    df.loc[index_m,"is_afternoon"] = ((df.loc[index_m,'hour']>12) & (df.loc[index_m,'hour']<=18)).astype(int)
    df.loc[index_m,"is_evening"] = (df.loc[index_m,'hour']>18).astype(int)
    
    return df

# Calendar and time-of-day features. Only the test frame goes through the
# missing-datetime handling: the missing mask is recorded, the gap is filled
# with a placeholder date so the .dt accessors work, and the affected rows
# are re-imputed by cal_back_up_to_now at the end.
for is_train, df in [(True, df_train), (False, df_test)]:
    if not is_train:
        dt_missing = df["datetime"].isna()

    df["datetime"] = pd.to_datetime(df["datetime"])

    if not is_train:
        df["datetime"] = df["datetime"].fillna(datetime.datetime(2018,12,31))

    stamp = df["datetime"].dt
    df["year"] = stamp.year
    df["month"] = stamp.month
    df["day"] = stamp.day
    df["dayofweek"] = stamp.dayofweek + 1      # 1 = Monday ... 7 = Sunday
    df["weekend"] = df["dayofweek"].ge(6).astype(int)

    df['hour'] = stamp.hour

    # Four time-of-day buckets: [0,6], (6,12], (12,18], (18,23]
    df["is_night"] = df['hour'].le(6).astype(int)
    df["is_morning"] = (df['hour'].gt(6) & df['hour'].le(12)).astype(int)
    df["is_afternoon"] = (df['hour'].gt(12) & df['hour'].le(18)).astype(int)
    df["is_evening"] = df['hour'].gt(18).astype(int)

    # Whole days between publication and the fixed reference date.
    df["up_to_now"] = (datetime.datetime(2019,10,18) - df["datetime"]).dt.days

    if not is_train:
        df = cal_back_up_to_now(df, dt_missing)

### 文章架構有關變數及交互作用項
我接著定義了許多跟文章架構有關的變數以及交互作用項。
<br/>如平均每段的字數、平均每段的社群媒體、h1標題的比例、h2標題的比例，我認為這些都可能影響到讀者對於文章架構的觀感，進而影響人氣。我也定義了文章長度與段落數、社群媒體數與段落數的交互作用，因為我認為不同的段落數下，文章的長短、社群媒體數的影響可能不同。
<br/>我額外定義一個時間變數為是否為工作時間，並計算是否為工作時間與星期的交互作用，小時與星期、是否為工作時間的交互作用，因為我認為不同的星期在不同的時間行為應該不同。

In [None]:
# Article-structure ratios and interaction terms.
for df in [df_train, df_test]:
    # Division by n_p == 0 yields inf/NaN here; it is left unfilled.
    df["content_per_p"] = df["content_len"]/df["n_p"]
    df["n_h"] = df["n_h1"]+df["n_h2"]

    # 0/0 (no headings) gives NaN; fill on the expression itself rather than
    # with a chained inplace call, which is deprecated and does nothing under
    # pandas copy-on-write.
    df["n_h1_ratio"] = (df["n_h1"]/df["n_h"]).fillna(0)
    df["n_h2_ratio"] = (df["n_h2"]/df["n_h"]).fillna(0)

    df["is_working"] = ((df['hour']>=10) & (df['hour']<=18)).astype(int)
    df["is_working_dayofweek"] = df["is_working"]*df["dayofweek"]
    df["hour_dayofweek"] = df["hour"]*df["dayofweek"]
    df["is_working_hour"] = df["hour"]*df["is_working"]

    df["content_p_inter"] = df["content_len"]*df["n_p"]

    df["p_per_social_media"] = df["n_social_media"]/df["n_p"]
    df["p_inter_social_media"] = df["n_social_media"]*df["n_p"]

    # +1 in the denominator avoids dividing by an empty content length.
    df["content_social_inter"] = df["n_social_media"]/(df["content_len"]+1)

### data channel的轉換
我發現有些data channel的數量十分稀少，這些data channel應該可以透過適當的轉換與其他合併。
<br/>有些data channel如bus、mob、howto、socmed可以從字面上判斷出其為某些data channel的縮寫，因此可以很簡單的找到其應該合併的對象。
<br/>至於其他data channel則是我透過一篇一篇慢慢檢視來找出其可能轉換的對象。

In [None]:
# Channels flagged in "data_channel_01" before being folded into a larger one.
small_set = {"comics", "memes", "sports", "jobs", "home", "viral",
             "travel-leisure", "conversations"}

# Abbreviation -> full name, and small channel -> the channel it is merged into.
_channel_merge_map = {
    "howto": "how-to",
    "mob": "mobile",
    "bus": "business",
    "socmed": "social-media",
    "comics": "lifestyle",
    "memes": "lifestyle",
    "sports": "lifestyle",
    "jobs": "business",
    "home": "lifestyle",
    "viral": "entertainment",
    "travel-leisure": "lifestyle",
    "conversations": "lifestyle",
}

for df in [df_train, df_test]:
    # The flag must be computed before the replace below erases the small channels.
    df["data_channel_01"] = df["data_channel"].apply(lambda channel: int(channel in small_set))
    df["data_channel"] = df["data_channel"].replace(_channel_merge_map)

###  內容與標題的關係
<br/>我額外嘗試了其他的內容與標題的關係，包含兩者在長度、字數、平均長度上的比例，試圖捕捉兩者在呈現上的關係。

In [None]:
# Feature-set variant "6": the base frames plus content-to-title ratios.
df_train_6, df_test_6 = df_train.copy(), df_test.copy()

for df in (df_train_6, df_test_6):
    df["content_title_len_ratio"] = df["content_len"].div(df["title_len"])
    df["content_title_n_item_ratio"] = df["content_n_item"].div(df["title_n_item"])
    df["content_title_avg_len_ratio"] = df["content_avg_len"].div(df["title_avg_len"])

### 節日
因為在節日的時候有可能會出現許多節日相關新聞，使其人氣較高，因此我定義出了Feature表示是否為美國的節日，其中再細分出帶薪休假。

In [None]:
# Holiday calendars for 2013-2015. The variable names double as column
# names, so their spelling is kept exactly.
_years = (2013, 2014, 2015)

# Fixed-date holidays
chirstmas = {datetime.date(yr, 12, 25) for yr in _years}
nyEve = {datetime.date(yr, 12, 31) for yr in _years}
ny = {datetime.date(yr, 1, 1) for yr in _years}
independence = {datetime.date(yr, 7, 4) for yr in _years}
howlloween = {datetime.date(yr, 10, 31) for yr in _years}
valentine = {datetime.date(yr, 2, 14) for yr in _years}
saintPatrick = {datetime.date(yr, 3, 17) for yr in _years}

# Floating holidays, listed per year
thanksgiving = {datetime.date(2013,11,28), datetime.date(2014,11,27), datetime.date(2015,11,26)}
motherDay = {datetime.date(2013,5,12), datetime.date(2014,5,11), datetime.date(2015,5,10)}
easter = {datetime.date(2013,3,31), datetime.date(2014,4,20), datetime.date(2015,4,5)}
fatherDay = {datetime.date(2013,6,16), datetime.date(2014,6,15), datetime.date(2015,6,21)}
memorial = {datetime.date(2013,5,27), datetime.date(2014,5,26), datetime.date(2015,5,25)}
laborDay = {datetime.date(2013,9,2), datetime.date(2014,9,1), datetime.date(2015,9,7)}

paid_day = chirstmas | ny | memorial | independence | laborDay | thanksgiving
all_holiday = (chirstmas | nyEve | thanksgiving | ny | motherDay | easter | independence
               | fatherDay | howlloween | valentine | saintPatrick | memorial | laborDay)

# (column name, set of dates) in the order the indicator columns are created.
_holiday_columns = [
    ("chirstmas", chirstmas),
    ("thanksgiving", thanksgiving),
    ("motherDay", motherDay),
    ("easter", easter),
    ("independence", independence),
    ("fatherDay", fatherDay),
    ("howlloween", howlloween),
    ("valentine", valentine),
    ("saintPatrick", saintPatrick),
    ("memorial", memorial),
    ("laborDay", laborDay),
    ("paid_day", paid_day),
    ("all_holiday", all_holiday),
]

for df in [df_train_6, df_test_6]:
    for _col, _days in _holiday_columns:
        # 1 when the publication date falls on one of the listed days.
        df[_col] = df["datetime"].dt.date.apply(lambda x: x in _days).astype(int)

### 文章時間至年底的間隔
最後，我再定義了一個變數表示文章的時間至年底的間隔，因為我觀察到文章人氣的高低很有可能跟文章發表的日期是在一年中的哪天有關，因此加此變數捕捉此效果。

In [None]:
# Feature-set variant "8": variant 6 plus the number of whole days left until
# 23:59 on 31 December of the publication year.
df_train_8, df_test_8 = df_train_6.copy(), df_test_6.copy()

for df in (df_train_8, df_test_8):
    year_end = df["year"].apply(lambda yr: datetime.datetime(yr, 12, 31, 23, 59))
    df["up_to_EoY"] = (year_end - df["datetime"]).dt.days

# The raw timestamp is not a model feature; remove it from every variant.
for df in (df_train, df_test, df_train_6, df_test_6, df_train_8, df_test_8):
    df.drop(["datetime"], axis=1, inplace=True)

### Label Encoding
data channel與article type為categorical data，必須對其進行轉換，因為我主要的方法為lightgbm，只需使用Label Encoding配上標示哪些資料為類別資料即可。

In [None]:
# Keep the target aside, then remove it from every training variant.
y = df_train["Popularity"]
for _frame in (df_train, df_train_6, df_train_8):
    _frame.drop(["Popularity"], axis=1, inplace=True)

In [None]:
n_tr = df_train.shape[0]
cate_col = ["data_channel","article_type"]


def _label_encode_together(train, test, columns, n_train):
    """Label-encode ``columns`` over train + test jointly and split back by position.

    Fitting on the concatenation gives both halves the same integer codes.
    The halves are returned as independent copies so that adding columns to
    them later does not write into a slice of the temporary concatenation.
    """
    combined = pd.concat([train, test], axis=0)
    for col in columns:
        combined[col] = LabelEncoder().fit_transform(combined[col])
    return combined.iloc[:n_train, :].copy(), combined.iloc[n_train:, :].copy()


## base feature set
df_train, df_test = _label_encode_together(df_train, df_test, cate_col, n_tr)

## variant 6
df_train_6, df_test_6 = _label_encode_together(df_train_6, df_test_6, cate_col, n_tr)

## variant 8
df_train_8, df_test_8 = _label_encode_together(df_train_8, df_test_8, cate_col, n_tr)

gc.collect()

### 將時間變數設為Dummy variable
我們另外嘗試過將時間變數使用dummy variable的方式進行處理，配上XGBoost可以獲得不錯的效果。

In [None]:
# Collect, per page, the weekday abbreviation (first three characters of the
# <time datetime=...> attribute) and the <time> element text, then slice that
# text into year / month / day / hour fields and one-hot encode each of them.
ti = []; ti_test = []; week = []; week_test = [] 
for i in range(len(X_train)) : 
    soup = BeautifulSoup(X_train.iloc[i],'html.parser')
    week.append(soup.time.attrs['datetime'][0:3])
    ti.append(soup.time.string)    
# Test row 1585 is skipped by the two loops below.
# NOTE(review): presumably this is the test page without a <time> element - confirm.
for i in range(1585) :
    soup_test = BeautifulSoup(X_test.iloc[i],'html.parser')
    week_test.append(soup_test.time.attrs['datetime'][0:3])
    ti_test.append(soup_test.time.string) 
for i in range(1586,len(X_test),1) :
    soup_test = BeautifulSoup(X_test.iloc[i],'html.parser')
    week_test.append(soup_test.time.attrs['datetime'][0:3])
    ti_test.append(soup_test.time.string) 
# Re-insert the skipped row at position 1585 with hard-coded values: save the
# tail, overwrite slot 1585, then write the saved tail back one position later.
# 11846 is a hard-coded upper bound for the tail slice.
temp = week_test[1585:11846]
temp_ti = ti_test[1585:11846]
week_test[1585] = 'Mon'
ti_test[1585] = '2013-06-19 15:04:30 UTC' 
week_test[1586:len(X_test)] =  temp
ti_test[1586:len(X_test)] = temp_ti

# Fixed-position slices of the cleaned time string.
# NOTE(review): assumes preprocessor_ta turns the text into the form
# 'yyyy mm dd hh mm ss utc' - confirm.
month = []; month_test = []; hour = []; hour_test = [];year = []; year_test = [];
for i in range(len(X_train)) : 
    year.append(preprocessor_ta(ti[i])[0:4])
    month.append(preprocessor_ta(ti[i])[5:7])
    hour.append(preprocessor_ta(ti[i])[11:13])

for i in range(len(X_test)) :  
    year_test.append(preprocessor_ta(ti_test[i])[0:4])
    month_test.append(preprocessor_ta(ti_test[i])[5:7])    
    hour_test.append(preprocessor_ta(ti_test[i])[11:13])
# Hard-coded override of the year of training row 27314.
year[27314] = '2013'

# NOTE(review): under the layout assumed above, [14:16] would be the minute
# field even though the lists are named `sec` - confirm.
day = [];day_test = []; sec = []; sec_test = []
for i in range(len(X_train)) : 
    day.append(preprocessor_ta(ti[i])[8:10])
    sec.append(preprocessor_ta(ti[i])[14:16])
for i in range(len(X_test)) :  
    day_test.append(preprocessor_ta(ti_test[i])[8:10])
    sec_test.append(preprocessor_ta(ti_test[i])[14:16])
    
# Train and test are one-hot encoded separately, so their dummy columns only
# line up when both contain the same set of values.
week = pd.get_dummies(week)
week_test = pd.get_dummies(week_test)
month = pd.get_dummies(month)
month_test = pd.get_dummies(month_test)
hour = pd.get_dummies(hour)
hour_test = pd.get_dummies(hour_test)
year = pd.get_dummies(year)
year_test = pd.get_dummies(year_test)
day1 = pd.get_dummies(day)
day1_test = pd.get_dummies(day_test)

### 時間dummy variable的交互作用項
這些變數間因為都是有或沒有(1 or 0)，故我們去對他們做相乘會得到兩個的交集會是1，那這樣也會是個很有幫助的變數，例如時間變數的相乘，就可以得到更詳細的時間變數。

In [None]:
# Pairwise interaction features: every column of one dummy frame multiplied by
# every column of another, appended to the result one column at a time.
# NOTE(review): Series products align on index labels, so each train/test pair
# of operands must share the same index - confirm for article_test.

# year x channel
YA = pd.DataFrame(); YA_test = pd.DataFrame()
for i in range(len(year.columns)):
    for j in range(len(article.columns)):
        YA = pd.concat([YA,year.iloc[:,i]*article.iloc[:,j]],axis = 1)
        YA_test = pd.concat([YA_test,year_test.iloc[:,i]*article_test.iloc[:,j]],axis = 1)
# month x year
MY = pd.DataFrame(); MY_test = pd.DataFrame()
for i in range(len(month.columns)):
    for j in range(len(year.columns)):
        MY = pd.concat([MY,month.iloc[:,i]*year.iloc[:,j]],axis = 1)
        MY_test = pd.concat([MY_test,month_test.iloc[:,i]*year_test.iloc[:,j]],axis = 1)
# weekday x year
WY = pd.DataFrame(); WY_test = pd.DataFrame()
for i in range(len(week.columns)):
    for j in range(len(year.columns)):
        WY = pd.concat([WY,week.iloc[:,i]*year.iloc[:,j]],axis = 1)
        WY_test = pd.concat([WY_test,week_test.iloc[:,i]*year_test.iloc[:,j]],axis = 1)
# channel x |pol|; `pol` and `pol_test` are not defined in this part of the notebook.
# NOTE(review): the loop bound is the number of YEAR columns while the index
# selects CHANNEL columns, so only the first len(year.columns) channels get
# an interaction term - confirm whether len(article.columns) was intended.
AP = pd.DataFrame(); AP_test = pd.DataFrame()
for i in range(len(year.columns)):
    AP = pd.concat([AP,article.iloc[:,i]*np.abs(pol)],axis = 1)
    AP_test = pd.concat([AP_test,article_test.iloc[:,i]*np.abs(pol_test)],axis = 1)
# day-of-month x year
DY = pd.DataFrame(); DY_test = pd.DataFrame()
for i in range(len(day1.columns)):
    for j in range(len(year.columns)):
        DY = pd.concat([DY,day1.iloc[:,i]*year.iloc[:,j]],axis = 1)
        DY_test = pd.concat([DY_test,day1_test.iloc[:,i]*year_test.iloc[:,j]],axis = 1)

## Text Feature Extraction
我使用數種方法進行Text Feature Extraction，包含TF-IDF、TFIDF+SVD、TF+主題模型。

### TFIDF+SVD (Method 1)
我將內容、標題、主題分別計算TF-IDF。因為維度過高，我使用SVD分解 (即LSA) 進行降維，維度個數根據變異解釋比例決定，分別選到1000、1000、500。

In [None]:
# TF-IDF matrices for method 1. Each vectorizer is fitted on the training
# column only and then applied to the matching test column.

## content: unigrams, dropping very rare and very common terms
content_tfidf_18 = TfidfVectorizer(tokenizer=tokenizer_stem,
                                   ngram_range=(1, 1),
                                   min_df=20,
                                   max_df=0.4)
content_tr_tf_18 = content_tfidf_18.fit_transform(df_train["content"])
content_te_tf_18 = content_tfidf_18.transform(df_test["content"])

## title: uni- to tri-grams
title_tfidf_18 = TfidfVectorizer(tokenizer=tokenizer_stem,
                                 ngram_range=(1, 3),
                                 min_df=2,
                                 max_df=0.95)
title_tr_tf_18 = title_tfidf_18.fit_transform(df_train["title"])
title_te_tf_18 = title_tfidf_18.transform(df_test["title"])

## topics: unigrams
topics_tfidf_18 = TfidfVectorizer(tokenizer=tokenizer_stem,
                                  ngram_range=(1, 1),
                                  min_df=2)
topics_tr_tf_18 = topics_tfidf_18.fit_transform(df_train["topics"])
topics_te_tf_18 = topics_tfidf_18.transform(df_test["topics"])

In [None]:
## content
svder = TruncatedSVD(n_components=1000, n_iter=10, random_state=4342)
svder.fit(content_tr_tf_18)

content_svd_tr_18 = svder.transform(content_tr_tf_18)
content_svd_te_18 = svder.transform(content_te_tf_18)

## title
svder = TruncatedSVD(n_components=1000, n_iter=10, random_state=2342)
svder.fit(title_tr_tf_18)

title_svd_tr_18 = svder.transform(title_tr_tf_18)
title_svd_te_18 = svder.transform(title_te_tf_18)

## topics
svder = TruncatedSVD(n_components=500, n_iter=10, random_state=4352)
svder.fit(topics_tr_tf_18)

topics_svd_tr_18 = svder.transform(topics_tr_tf_18)
topics_svd_te_18 = svder.transform(topics_te_tf_18)

### Title+Topics (Method 2)
因為內容資料眾多，因此我嘗試只使用標題與主題並各別計算TF-IDF，因為這兩個能夠有效率的代表文章本身，其相較文章內容更為精鍊。

In [None]:
def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms in column order.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; use whichever the installed version offers.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


## title
title_tfidf_22 = TfidfVectorizer(ngram_range=(1, 3),
                              min_df=3,
                              max_df=0.95,
                              tokenizer=tokenizer_stem)

title_tr_tf_22 = title_tfidf_22.fit_transform(df_train_6["title"])
title_te_tf_22 = title_tfidf_22.transform(df_test_6["title"])

# Prefix the terms so title and topic features stay distinguishable.
title_feature_name_22 = ["_title_"+i for i in _vocabulary_terms(title_tfidf_22)]

## topics
topics_tfidf_22 = TfidfVectorizer(ngram_range=(1,1),
                               min_df=2,
                               tokenizer=tokenizer_stem)

topics_tr_tf_22 = topics_tfidf_22.fit_transform(df_train_6["topics"])
topics_te_tf_22 = topics_tfidf_22.transform(df_test_6["topics"])

topics_feature_name_22 = ["_topics_"+i for i in _vocabulary_terms(topics_tfidf_22)]

### Latent Dirichlet Allocation1 (Method 3)
因為文章的主題很有可能影響文章的人氣，相較於單純使用文章的標籤主題，我嘗試使用Topics model來訓練出整個頁面文字形成的主題。
<br/>我先嘗試對標題、內容、主題各自使用LDA，主題個數皆為100。

In [None]:
# Raw term counts (not TF-IDF, despite the variable names) used as LDA input.
# Each vectorizer is fitted on the training column only.

## content
content_tfidf_26 = CountVectorizer(tokenizer=tokenizer_stem,
                                   ngram_range=(1, 1),
                                   min_df=20,
                                   max_df=0.4)
content_tr_tf_26 = content_tfidf_26.fit_transform(df_train_6["content"])
content_te_tf_26 = content_tfidf_26.transform(df_test_6["content"])

## title
title_tfidf_26 = CountVectorizer(tokenizer=tokenizer_stem,
                                 ngram_range=(1, 3),
                                 min_df=20)
title_tr_tf_26 = title_tfidf_26.fit_transform(df_train_6["title"])
title_te_tf_26 = title_tfidf_26.transform(df_test_6["title"])

## topics
topics_tfidf_26 = CountVectorizer(tokenizer=tokenizer_stem,
                                  ngram_range=(1, 2),
                                  min_df=20)
topics_tr_tf_26 = topics_tfidf_26.fit_transform(df_train_6["topics"])
topics_te_tf_26 = topics_tfidf_26.transform(df_test_6["topics"])

In [None]:
# One 100-topic LDA per text field, fitted on the training counts and then
# used to project train and test (in that order).

## content (more iterations than the estimator default)
ldaer_content_26 = LatentDirichletAllocation(n_components=100, n_jobs=2, max_iter=20)
ldaer_content_26 = ldaer_content_26.fit(content_tr_tf_26)
content_tr_lda_26 = ldaer_content_26.transform(content_tr_tf_26)
content_te_lda_26 = ldaer_content_26.transform(content_te_tf_26)

## title
ldaer_title_26 = LatentDirichletAllocation(n_components=100, n_jobs=2)
ldaer_title_26 = ldaer_title_26.fit(title_tr_tf_26)
title_tr_lda_26 = ldaer_title_26.transform(title_tr_tf_26)
title_te_lda_26 = ldaer_title_26.transform(title_te_tf_26)

## topics
ldaer_topic_26 = LatentDirichletAllocation(n_components=100, n_jobs=2)
ldaer_topic_26 = ldaer_topic_26.fit(topics_tr_tf_26)
topic_tr_lda_26 = ldaer_topic_26.transform(topics_tr_tf_26)
topic_te_lda_26 = ldaer_topic_26.transform(topics_te_tf_26)

### Latent Dirichlet Allocation2 (Method 4)
接著，我將標題、內容、主題合併，並使用TF轉為字頻並使用LDA找出文章可能的主題，經過CV，主題個數選為100個。

In [None]:
# Concatenate content, title and topics (separated by single spaces) into one
# "txt" column so a single LDA model can be fitted on the whole page text.
# NOTE(review): assumes the three columns hold strings without missing values;
# a NaN in any of them would make the combined value NaN — confirm upstream.
df_train_6["txt"] = df_train_6["content"] +" "+ df_train_6["title"]+ " "+df_train_6["topics"]
df_test_6["txt"] = df_test_6["content"] +" "+ df_test_6["title"]+ " "+df_test_6["topics"]

In [None]:
# Term-count matrix of the combined "txt" column (input of the merged LDA model).
# NOTE: the variable is named `txt_tfidf` but it is a CountVectorizer (raw counts).
# Unigrams only; terms must occur in >= 20 documents and in <= 40% of documents.
txt_tfidf = CountVectorizer(ngram_range=(1, 1),
                            min_df=20,
                            max_df=0.4,
                            tokenizer=tokenizer_stem)

# Vocabulary is learned on the training frame and reused for the test frame.
txt_tr_tf = txt_tfidf.fit_transform(df_train_6["txt"])
txt_te_tf = txt_tfidf.transform(df_test_6["txt"])

In [None]:
# Number of LDA topics for the combined-text model.
n_top = 100
# NOTE(review): no random_state is given, so the topics are presumably not
# reproducible between runs — confirm whether a fixed seed is wanted.
ldaer_txt = LatentDirichletAllocation(n_components=n_top, n_jobs=2,max_iter=20)

# Fit on the training counts only, then project train and test documents.
ldaer_txt.fit(txt_tr_tf)
txt_tr_lda = ldaer_txt.transform(txt_tr_tf)
txt_te_lda = ldaer_txt.transform(txt_te_tf)

# One feature name per topic. The prefix is "content_" even though the topics
# are learned from the combined content + title + topics text.
txt_names = ["content_"+str(i) for i in range(n_top)]

### TF-IDF+max_feature (Method 5)
在TF-IDF中挑選出最多100個字，避免太多字加進模型裡，效果不見得比較好，此外我們也計算這些字在文章出現的個數來當作重要變數。

In [None]:
from collections import Counter

# Select at most 100 stemmed, stop-word-free unigrams whose document frequency
# lies between 10% and 20% of the training documents.
tfidf = TfidfVectorizer(ngram_range=(1,1),min_df = 0.1,max_df = 0.2,max_features = 100,
                        preprocessor=preprocessor_ta,
                        tokenizer=tokenizer_stem_nostop)

tfidf.fit(X_train)
# Fetch the selected vocabulary once; it does not change after fitting, so
# there is no need to ask the vectorizer again for every document.
# NOTE(review): get_feature_names() is deprecated in recent scikit-learn in
# favour of get_feature_names_out(); kept because the notebook appears to
# target an older release — confirm before upgrading.
feature_idf = tfidf.get_feature_names()


def _tokenize_docs(docs):
    """Return the cleaned, stemmed, stop-word-free token list of every document."""
    return [tokenizer_stem_nostop(preprocessor_ta(docs[i])) for i in range(len(docs))]


def _total_occurrences(tokens, features):
    """Count how many tokens of one document are selected features (with repeats)."""
    token_counts = Counter(tokens)
    return sum(token_counts[feature] for feature in features)


def _distinct_hits(tokens, features):
    """Count how many selected features occur at least once in one document."""
    present = set(tokens)
    return sum(1 for feature in features if feature in present)


# Token lists produced by the same preprocessing/tokenizing pipeline the
# vectorizer uses, so tokens are directly comparable with the feature names.
pre_idf = _tokenize_docs(X_train)
pre_idf_test = _tokenize_docs(X_test)

# Total number of selected-feature occurrences per document.
idf = [_total_occurrences(tokens, feature_idf) for tokens in pre_idf]
idf_test = [_total_occurrences(tokens, feature_idf) for tokens in pre_idf_test]

# Number of distinct selected features present per document.
idf_num = [_distinct_hits(tokens, feature_idf) for tokens in pre_idf]
idf_num_test = [_distinct_hits(tokens, feature_idf) for tokens in pre_idf_test]

合併資料

In [None]:
# Remove the raw text columns in place so that only the engineered features
# remain in each frame before the feature matrices are assembled.
df_train.drop(["content","title","topics"],axis=1,inplace=True)
df_test.drop(["content","title","topics"],axis=1,inplace=True)

# The *_6 frames additionally carry the combined "txt" column created for the
# merged LDA model, so it is dropped as well.
df_train_6.drop(["content","title","topics","txt"],axis=1,inplace=True)
df_test_6.drop(["content","title","topics","txt"],axis=1,inplace=True)

df_train_8.drop(["content","title","topics"],axis=1,inplace=True)
df_test_8.drop(["content","title","topics"],axis=1,inplace=True)

In [None]:
# Assemble the train/test feature matrices and the matching feature-name lists
# for each base model.

## combine model 1
# SVD components of content, title and topics stacked column-wise (dense).
new_tr_18 = np.hstack((content_svd_tr_18, title_svd_tr_18, topics_svd_tr_18))
new_te_18 = np.hstack((content_svd_te_18, title_svd_te_18, topics_svd_te_18))

# NOTE(review): the widths 1000/1000/500 are hard-coded and presumably match the
# SVD n_components chosen in an earlier cell — confirm they equal the actual
# column counts of the matrices stacked above.
content_names = ["content_"+str(i) for i in range(1000)]
title_names = ["title_"+str(i) for i in range(1000)]
topics_names = ["topics_"+str(i) for i in range(500)]

feature_names_18 = content_names+title_names+topics_names

## combine model 2
# Engineered columns followed by the title and topics vocabulary names.
feature_names_22 = df_train_6.columns.to_list()
feature_names_22.extend(title_feature_name_22+topics_feature_name_22)

# `hstack` here is scipy.sparse.hstack (imported at the top of the notebook);
# the result is converted to CSR so rows can be sliced per CV fold.
new_tr_22 = hstack((df_train_6.values, title_tr_tf_22, topics_tr_tf_22)).tocsr()
new_te_22 = hstack((df_test_6.values, title_te_tf_22, topics_te_tf_22)).tocsr()

## combine model 3
# Per-field LDA topic distributions stacked column-wise (100 topics each).
new_tr_26 = np.hstack((content_tr_lda_26, title_tr_lda_26, topic_tr_lda_26))
new_te_26 = np.hstack((content_te_lda_26, title_te_lda_26, topic_te_lda_26))

# These reuse (and overwrite) the name lists built for model 1 above.
content_names = ["content_"+str(i) for i in range(100)]
title_names = ["title_"+str(i) for i in range(100)]
topics_names = ["topics_"+str(i) for i in range(100)]

feature_names_26 = content_names+title_names+topics_names

## combine model 4
# Engineered columns followed by the combined-text LDA topic names.
feature_names_28 = df_train_6.columns.to_list()
feature_names_28.extend(txt_names)

# NOTE(review): pd.concat(axis=1) aligns on the index; the LDA frame gets a fresh
# 0..n-1 index, so this assumes df_train_6/df_test_6 also have a default
# RangeIndex — confirm, otherwise rows would be misaligned and padded with NaN.
new_tr_28 = pd.concat([df_train_6,pd.DataFrame(txt_tr_lda)],axis=1)
new_te_28 = pd.concat([df_test_6,pd.DataFrame(txt_te_lda)],axis=1)

## combine model 5
# Same as model 4 but built on the *_8 engineered frames.
feature_names_37 = df_train_8.columns.to_list()
feature_names_37.extend(txt_names)

# NOTE(review): same index-alignment assumption as for model 4.
new_tr_37 = pd.concat([df_train_8,pd.DataFrame(txt_tr_lda)],axis=1)
new_te_37 = pd.concat([df_test_8,pd.DataFrame(txt_te_lda)],axis=1)

In [None]:
# Feature frame for the XGBoost model: the engineered features defined in
# earlier cells plus one interaction column, `idf * |pol|`.
# NOTE(review): `idf` / `idf_test` are plain Python lists; the product is
# element-wise only because np.abs(...) returns an array of the same length —
# confirm `pol` / `pol_test` are 1-D with one value per document.
# NOTE(review): pd.concat(axis=1) aligns on the index, so every piece is assumed
# to share the same default 0..n-1 index — confirm.
X_train_combined = pd.concat([month,year,week,hour,pd.DataFrame(article),
                             YA,MY,WY,AP,DY,pd.DataFrame(pol),
                             pd.DataFrame(idf*np.abs(pol))],axis = 1)
X_test_combined = pd.concat([month_test,year_test,week_test,hour_test,pd.DataFrame(article_test),
                            YA_test,MY_test,WY_test,AP_test,DY_test,pd.DataFrame(pol_test),
                            pd.DataFrame(idf_test*np.abs(pol_test))],axis = 1)

# Build Classifiers
我們使用的模型包含LightGBM與XGBoost，這兩種方法皆為Gradient Boosting Tree的演算法，經過我們測試許多演算法，包含Ridge、RandomForest、SVM、KNN之後，這類方法的效果最好。
<br/>我們透過使用Stacking的方式合併多個分類器，因此我們有許多不同的方法與Feature進行modeling的結果，合併的部分會在後面說明。

In [None]:
def quick_stack_lgb(param,tr,y,te,feature_names,cate_col=None,nfold=10,random_state=None):
    """Train LightGBM with stratified K-fold CV and return stacking features.

    For every fold a booster is trained on the training part, used to predict
    the held-out part (filling the out-of-fold vector) and the full test set
    (accumulated into a fold average).

    Parameters
    ----------
    param : dict
        LightGBM training parameters, passed unchanged to ``lgb.train``.
    tr : 2-D array or sparse matrix
        Training features; must support ``tr[rows, :]`` indexing.
    y : array-like
        Training labels; must support indexing with an integer array.
    te : 2-D array or sparse matrix
        Test features.
    feature_names : list of str
        One name per column of ``tr``.
    cate_col : list, optional
        Categorical features forwarded to ``lgb.Dataset``. When ``None`` the
        argument is omitted so LightGBM applies its own default.
    nfold : int, default 10
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous behaviour of
        drawing different folds on every call.

    Returns
    -------
    pred_tr : numpy.ndarray
        Out-of-fold predictions, one per training row.
    pred_te : numpy.ndarray
        Test predictions averaged over the ``nfold`` boosters.
    feature_importance_df : pandas.DataFrame
        Columns ``Feature``, ``importance`` and ``fold`` (1-based), with one
        row per feature and fold.
    """
    kf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=random_state)
    pred_tr = np.zeros(tr.shape[0])
    pred_te = np.zeros(te.shape[0])

    # Build the Dataset keyword arguments once instead of duplicating the
    # constructor call for the "with / without categorical columns" cases.
    dataset_kwargs = {"feature_name": feature_names}
    if cate_col is not None:
        dataset_kwargs["categorical_feature"] = cate_col

    # Collect the per-fold frames and concatenate once at the end; this avoids
    # repeatedly copying the growing frame inside the loop.
    fold_importances = []

    for fold_, (train_index, valid_index) in enumerate(kf.split(tr, y)):
        d_tr = lgb.Dataset(tr[train_index,:], y[train_index], **dataset_kwargs)

        clf = lgb.train(param, d_tr)
        pred_tr[valid_index] = clf.predict(tr[valid_index,:])
        pred_te += clf.predict(te)/nfold

        fold_importances.append(pd.DataFrame({
            "Feature": feature_names,
            "importance": clf.feature_importance(),
            "fold": fold_ + 1,
        }))

    feature_importance_df = pd.concat(fold_importances, axis=0)

    return pred_tr, pred_te, feature_importance_df

def quick_sklearn(model,tr,y,te,nfold=10):
    """Produce out-of-fold and fold-averaged test scores for a linear classifier.

    ``model`` is refitted on the training part of every stratified, shuffled
    fold. Scores are column 1 of the array returned by the model's
    ``_predict_proba_lr`` method, so the model must provide that method.

    Returns a tuple ``(out_of_fold, test_average)`` of 1-D arrays with one
    entry per row of ``tr`` and ``te`` respectively.
    """
    out_of_fold = np.zeros(tr.shape[0])
    test_average = np.zeros(te.shape[0])
    splitter = StratifiedKFold(n_splits=nfold, shuffle=True)

    for fit_rows, held_rows in splitter.split(tr, y):
        model.fit(tr[fit_rows,:], y[fit_rows])

        # Held-out rows get the score of the model that never saw them.
        out_of_fold[held_rows] = model._predict_proba_lr(tr[held_rows,:])[:,1]
        # Each fold contributes an equal share to the test prediction.
        test_average += model._predict_proba_lr(te)[:,1]/nfold

    return out_of_fold, test_average

def quick_stacking_xgb(train, y_train, test, n_fold=5):
    """Run seeded stratified K-fold XGBoost and return stacking predictions.

    ``train`` is sliced with ``.iloc`` and ``y_train`` with an integer index
    array. A fresh ``XGBClassifier`` with fixed hyperparameters is fitted per
    fold, and the 1-based fold number is printed as a progress indicator.

    Returns ``(out_of_fold, test_average)``: the class-1 probability of every
    training row from the fold that held it out, and the class-1 probability
    of every test row averaged over the ``n_fold`` models.
    """
    out_of_fold = np.zeros(train.shape[0])
    test_average = np.zeros(test.shape[0])
    splitter = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)

    for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(train,y_train), start=1):
        print(fold_no)

        booster = XGBClassifier(n_estimators=300,learning_rate=0.01,max_depth = 8,colsample_bytree=0.7)
        booster.fit(train.iloc[fit_rows,:], y_train[fit_rows])

        out_of_fold[held_rows] = booster.predict_proba(train.iloc[held_rows,:])[:,1]
        test_average += booster.predict_proba(test)[:,1]/n_fold

    return out_of_fold, test_average

## Hyperparameter Tuning

我們使用Grid Search的方式來調整Hyperparameter，因為LightGBM與XGBoost都有Scikit-learn API與Training API，若使用Training API則會無法使用Scikit-learn的GridSearchCV，此時便要自己寫方法來進行調參，因此我自己寫了方法進行Grid Search。

In [None]:
from itertools import product

def expand_grid(dictionary):
    """Return a DataFrame holding every combination of the given value lists.

    ``dictionary`` maps a column name to its candidate values. The result has
    one column per key (in key order) and one row per element of the Cartesian
    product, with the last key varying fastest.
    """
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns=dictionary.keys())

def convertType(param, typeDict):
    """Return a copy of *param* with values cast to native Python numbers.

    A grid row extracted from a DataFrame carries NumPy scalars (and integer
    settings may have been upcast to float); this restores plain ``int`` /
    ``float`` values according to the original column dtypes.

    Parameters
    ----------
    param : dict
        Parameter name -> value. The input dict is not modified.
    typeDict : dict
        Parameter name -> dtype of the column the value came from. Must
        contain every key of ``param`` (a missing key raises ``KeyError``).

    Returns
    -------
    dict
        Values of any integer dtype are converted with ``int``, values of any
        floating dtype with ``float``. Values of other dtypes are left
        unchanged and "Not yet" is printed for each of them.
    """
    inputParm = param.copy()

    for key, value in param.items():
        dtype = typeDict[key]
        # issubdtype accepts every integer/float width (e.g. int32, float32),
        # not only the int64/float64 dtypes an exact comparison would match.
        if np.issubdtype(dtype, np.integer):
            inputParm[key] = int(value)
        elif np.issubdtype(dtype, np.floating):
            inputParm[key] = float(value)
        else:
            print("Not yet")

    return inputParm

In [None]:
# Manual grid search for LightGBM using the Training API (lgb.cv).

# Base parameters shared by every grid point. The number of rounds is set high
# (10000) and early stopping in lgb.cv decides where boosting actually ends.
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 1,
    "bagging_freq":1,
    "learning_rate": 0.01,
    "num_boost_round": 10000,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2
}

# Candidate values; the grid is their Cartesian product (3 * 4 * 4 * 3 = 144 points).
param_cv = {
    'num_leaves': [20,32,40],
    "min_data_in_leaf": [40,50,60,70],
    "feature_fraction": [0.3,0.5,0.8,1],
    "bagging_fraction":[0.6,0.8,1],
}

param_cv_pd = expand_grid(param_cv)
# Column dtypes, used by convertType to restore native int/float values for
# each row of the grid.
theType = param_cv_pd.dtypes.to_dict()
# Best mean CV AUC per grid point, in grid-row order.
auc_list = []

for i in tqdm(range(param_cv_pd.shape[0])):
    # NOTE(review): the Dataset is rebuilt for every grid point; presumably this
    # is deliberate because some searched parameters (e.g. min_data_in_leaf)
    # are applied when the Dataset is constructed — confirm before hoisting.
    d_all = lgb.Dataset(new_tr,y,
                        feature_name=feature_names,
                       categorical_feature = cate_col)
    
    nowParam = param_cv_pd.loc[i, :].to_dict()
    nowParam = convertType(nowParam,theType)
    print("\n", nowParam,"\n")
    
    # Grid values override the base parameters for this run only.
    total_param = param.copy()
    total_param.update(nowParam)
    
    clf_cv = lgb.cv(total_param,
                    d_all,
                    nfold=5,
                    early_stopping_rounds=100,
                    shuffle=True,
                    metrics ="auc")
    
    auc_list.append(max(clf_cv["auc-mean"]))
    print("%.6f with %4d boost" % (max(clf_cv["auc-mean"]),len(clf_cv["auc-mean"])))
    gc.collect()
    
    # If every configured round was used, early stopping never triggered and
    # the round limit may be cutting training short.
    if len(clf_cv["auc-mean"])==total_param["num_boost_round"]:
        print("num_boost_round too low")

In [None]:
# Pick the grid point with the highest mean CV AUC (first one on ties, since
# list.index returns the first match).
top_index = auc_list.index(max(auc_list))
# NOTE(review): the row is taken without convertType, so integer settings may
# appear as floats here — convert before passing best_param to LightGBM.
best_param = param_cv_pd.loc[top_index, :].to_dict()

print(max(auc_list))
print(best_param)

# Attach the scores to the grid and display the full table.
param_cv_pd["AUC"] = auc_list
param_cv_pd

找到合適的Hyperparameter後，便可進行之後的建模。

## Modeling
### Model 1 (SVD+LightGBM)
針對只使用TF-IDF+SVD萃取出的資料建模。測試過後發現若與其他Feature一起建模效果會不好，因此不使用任何先前Feature engineering的資料。
<br/>這個方法的5-fold CV結果約為0.544628、Public Leaderboard約為0.54489。

In [None]:
# Model 1: LightGBM on the stacked SVD components only (new_tr_18 / new_te_18).
# The number of boosting rounds is fixed through the params dict.
# NOTE(review): no random_state in the params and quick_stack_lgb shuffles its
# folds without a seed, so the predictions presumably vary between runs.
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq":1,
    "learning_rate": 0.01,
    "num_boost_round": 109,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2
}

# Out-of-fold train predictions and fold-averaged test predictions, used later
# as stacking inputs; `featureImp` is overwritten by each model cell.
tr_pred_18, te_pred_18, featureImp = quick_stack_lgb(param=param,
                                           tr=new_tr_18,
                                           y=y,
                                           te=new_te_18,
                                           feature_names=feature_names_18,
                                           nfold=10)

### Model 2 (TFIDF+Features+LightGBM)
因為內容的資料十分龐大，因此我嘗試只使用標題與主題各別計算TF-IDF並配上Feature engineering的Feature進行建模，效果意外的好。
<br/>這個方法的5-fold CV結果約為0.607653、Public Leaderboard約為0.59039。

In [None]:
# Model 2: LightGBM on the engineered features plus the title/topics term
# matrices (new_tr_22 / new_te_22, sparse CSR).
# random_state seeds LightGBM itself; the fold split inside quick_stack_lgb is
# shuffled without a seed.
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 1,
    "bagging_freq":1,
    "learning_rate": 0.005,
    "num_boost_round": 461,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2,
    "random_state":192
}

# This is the only model cell that forwards the categorical columns (cate_col).
tr_pred_22, te_pred_22, featureImp = quick_stack_lgb(param=param,
                                           tr=new_tr_22,
                                           y=y,
                                           te=new_te_22,
                                           feature_names=feature_names_22,
                                           cate_col=cate_col,
                                           nfold=10)

### Model 3  (LDA1+LightGBM)
根據我的觀察，長文章容易被判定為有人氣，但有部分有人氣的文章卻因為是短文章而被給予很低的機率，因此我認為若要再提升成績，可能要以文章內容著手，而我覺得可能會造成影響的是文章內容本身的主題，不單單是文章的標籤主題。
<br/>因此我嘗試只使用LDA萃取出的主題資料，並根據先前SVD的經驗先不使用任何先前Feature engineering的資料來建模。
<br/>這個方法的5-fold CV結果約為0.545384，沒有將此結果上傳過，因此Public Leaderboard未知。

In [None]:
# Model 3: LightGBM on the per-field LDA topic distributions only
# (new_tr_26 / new_te_26: 100 content + 100 title + 100 topics columns).
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 1,
    "bagging_freq":1,
    "learning_rate": 0.01,
    "num_boost_round": 287,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2
}

# Out-of-fold / fold-averaged predictions kept as stacking inputs.
tr_pred_26, te_pred_26, featureImp = quick_stack_lgb(param=param,
                                           tr=new_tr_26,
                                           y=y,
                                           te=new_te_26,
                                           feature_names=feature_names_26,
                                           nfold=10)

### Model 4  (LDA2+Features+LightGBM)
接著，我再使用LDA萃取出的主題資料加上Feature engineering的Feature進行建模，AUC有明顯的提升，與我先前的猜測相符。
<br/>這個方法的5-fold CV結果約為0.608212、Public Leaderboard約為0.59314。

In [None]:
# Model 4: LightGBM on the engineered features (df_train_6) plus the
# combined-text LDA topics (new_tr_28 / new_te_28).
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 20,
    "feature_fraction": 0.5,
    "bagging_fraction": 1,
    "bagging_freq":1,
    "learning_rate": 0.005,
    "num_boost_round": 847,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2,
    "random_state":192
}

# The frames are converted to NumPy arrays because quick_stack_lgb slices rows
# with tr[index, :], which a DataFrame does not support.
# NOTE(review): unlike Model 2, cate_col is not passed here although the same
# engineered columns are used — confirm this is intentional.
tr_pred_28, te_pred_28, featureImp = quick_stack_lgb(param=param,
                                           tr=new_tr_28.to_numpy(),
                                           y=y,
                                           te=new_te_28.to_numpy(),
                                           feature_names=feature_names_28,
                                           nfold=10)

### Model 5 (LDA2+Features+LightGBM)
因為我發現文章時間至年底的間隔的Feature與文章人氣有明顯相關，所以我使用LDA萃取出的主題資料加上Feature engineering的Feature並額外再加上文章時間至年底的間隔的Feature，以進行建模，雖然Leaderboard的成績沒有上升，但Cross-validation卻有上升，因此我還是選擇將其保留。
<br/>這個方法的5-fold CV結果約為0.60853、Public Leaderboard約為0.58926。

In [None]:
# Model 5: same setup as Model 4 but built on the df_train_8 engineered
# features (new_tr_37 / new_te_37) and with fewer boosting rounds.
# No random_state is set for LightGBM in this cell.
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 20,
    "feature_fraction": 0.5,
    "bagging_fraction": 1,
    "bagging_freq":1,
    "learning_rate": 0.005,
    "num_boost_round": 537,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2
}

# Frames are converted to NumPy arrays so quick_stack_lgb can slice rows with
# tr[index, :].
tr_pred_37, te_pred_37, featureImp = quick_stack_lgb(param=param,
                                           tr=new_tr_37.to_numpy(),
                                           y=y,
                                           te=new_te_37.to_numpy(),
                                           feature_names=feature_names_37,
                                           nfold=10)

### Model 6 (max_feature_100+Features+XGBoost)
使用XGBoost對於使用max_feature為100的資料配上Feature engineering的Feature進行建模。
<br/>這個方法的5-fold CV結果約為0.589288。

In [None]:
# Model 6: XGBoost on the combined engineered-feature frame, using the default
# 5 stratified folds of quick_stacking_xgb. Returns out-of-fold train
# predictions and fold-averaged test predictions.
tr_pred_yang, te_pred_yang = quick_stacking_xgb(X_train_combined,y,X_test_combined)

## Feature Selection
我們有嘗試過進行Feature selection，但效果不盡理想，不只沒有提升AUC，反而還使AUC稍微降低，因此沒有將結果納入最後的結果中。

In [None]:
# Train one LightGBM model on the full training matrix (no cross-validation);
# per the surrounding notes this model serves the feature-selection experiment.
param = {
    "num_leaves": 32,
    'min_data_in_leaf': 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq":1,
    "learning_rate": 0.01,
    "num_boost_round": 162,
    "objective": "binary",
    "boosting": "gbdt",
    "metric": "auc",
    "verbosity": -1,
    "nthread": 2,
    "random_state":192
}

# new_tr, feature_names and cate_col are defined in earlier cells.
d_all = lgb.Dataset(new_tr,y,
                    feature_name=feature_names,
                    categorical_feature = cate_col)

lgb_clf = lgb.train(param, d_all)

In [None]:
def select_feature(important, all_feature):
    """Locate the important features inside the full feature list.

    Returns ``(positions, names)``: the indices in ``all_feature`` whose name
    appears in ``important``, and the corresponding names, both in the order
    of ``all_feature``.
    """
    wanted = set(important)
    matches = [
        (position, all_feature[position])
        for position in range(len(all_feature))
        if all_feature[position] in wanted
    ]

    positions = [position for position, _ in matches]
    names = [name for _, name in matches]
    return positions, names

In [None]:
# Keep only the columns whose names are listed in temp["Feature"].
# NOTE(review): `temp` is not defined in the visible part of the notebook;
# presumably it is a feature-importance table filtered in a removed cell —
# confirm before re-running.
imp_index, imp_name = select_feature(temp["Feature"].tolist(),feature_names)

# new_tr / new_te are sliced by column and densified (they must be sparse
# matrices for .todense() to exist).
output_tr = new_tr[:,imp_index].todense()
output_te = new_te[:,imp_index].todense()

output_tr = pd.DataFrame(output_tr)
output_te = pd.DataFrame(output_te)

# Label the columns with the selected feature names.
output_tr.columns = imp_name
output_te.columns = imp_name

## Stacking
我們透過使用Stacking的方式合併多個分類器，其原理類似於voting，但權重是使用Cross-validation算出的資料再使用其他模型訓練出來，因此其權重會相較單純取平均要更好許多。
<br/>因為Stacking的模型不需要太複雜即可有很好的結果，因此我們使用Ridge Regression進行stacking。

In [None]:
# Build the stacking inputs: one column per base model, named "0".."4", holding
# the out-of-fold train predictions (tr_selected) and the fold-averaged test
# predictions (te_selected) of Models 1-5.
# NOTE(review): the XGBoost predictions of Model 6 (tr_pred_yang / te_pred_yang)
# are not included here — confirm whether that omission is intentional.
tr_selected = pd.DataFrame()
te_selected = pd.DataFrame()

for ind, (tr,te) in enumerate(zip([tr_pred_18,tr_pred_22,tr_pred_26,tr_pred_28,tr_pred_37],
                                  [te_pred_18,te_pred_22,te_pred_26,te_pred_28,te_pred_37])):
    tr_selected[str(ind)] = tr
    te_selected[str(ind)] = te

In [None]:
# Level-2 model: choose the ridge penalty by 10-fold CV on AUC over a grid of
# alphas from 0.001 up to (but excluding) 2 in steps of 0.005.
# NOTE(review): the `normalize` argument is deprecated in recent scikit-learn
# releases; this cell presumably targets an older version — confirm before
# upgrading.
clf = RidgeClassifierCV(alphas=np.arange(0.001, 2, 0.005),
                        cv=10,
                        scoring="roc_auc",
                        normalize=True,
                        fit_intercept=True)
clf.fit(tr_selected, y)

# Re-create a plain ridge classifier with the selected alpha so it can be
# refitted on each fold by quick_sklearn.
clf_ridge = RidgeClassifier(alpha=clf.alpha_,
                            normalize=True,
                            fit_intercept=True)

# Out-of-fold and fold-averaged test scores of the stacked model
# (the final positional argument 10 is nfold).
tr_pred, te_pred = quick_sklearn(clf_ridge,
                                tr_selected.to_numpy(),
                                y,
                                te_selected.to_numpy(),
                                10)

In [None]:
# Fill the sample submission's "Popularity" column with the stacked test scores
# and write the result without the index column.
# NOTE(review): assumes the sample submission rows are in the same order as the
# test set used for te_pred — confirm.
output = pd.read_csv("./data/original/sample_submission.csv")
output["Popularity"]=te_pred
output.to_csv("./result.csv",index=False)

此方法的10-Fold CV AUC為0.61140，比先前的幾個模型都要來得好，因此使用其為最終結果。

# Conclusion
在這次的競賽中，我們認為有趣的點、學習與觀察到的陷阱如以下：
1. 原始文字資料中可能會出現前述的javascript的陷阱，必須要靠一篇一篇看的方式才能夠看出，表示我們就算有這些厲害的演算法，仍然需要多加觀察資料本身。
2. 即便原始分數沒有很高的方法，透過stacking或voting的方式仍然可以對最後的結果有幫助，這顯現了ensemble learning方法的精髓。
3. Feature engineering的過程可以提高AUC許多，讓我了解到其重要性。
4. SVD若直接與數值Feature一起建模效果很差，但若只用SVD的資料建模，最後再與其他資料合併，效果卻會很好。讓我了解到不要一個方法看起來沒有效果就直接捨棄，將其用不同方式利用可能有意想不到的效果。 
5. 最後的Kaggle排名有十分明顯的波動，表示可能有些人掉入了overfitting的陷阱中，讓我更了解到以後必須警惕。