### Module Import

In [1]:
import csv
import numpy as np
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

### Load data

In [2]:
def load_train_label(df, train_dt, label_dt):
    train_dt = ["dt%d" % i for i in train_dt]
    label_dt = ["dt%d" % (label_dt - 1), "dt%d" % (label_dt)]
    label = df[label_dt].values.tolist()
    idx = df['chid'].tolist()
    tag_feat = df[train_dt].values.tolist()
    for i in range(len(tag_feat)):
        tag_feat[i] = np.array([ast.literal_eval(x) for x in tag_feat[i]])
    for j in range(len(label)):
        label[j][1] = np.array(ast.literal_eval(label[j][1]))
    return idx, [i[0] for i in tag_feat], [i[1] for i in label]

In [None]:
# Using only dt=10~22 & dt=all case to train
# dt=22 as input feature, dt=23 as label
df_all = pd.read_csv("./data/dt_all.csv")
train1_idx, train1_tag, train1_label = load_train_label(df_all, [22,23], 23)
train1_idx, valid1_idx, train1_tag, valid1_tag, train1_label, valid1_label = train_test_split(train1_idx, train1_tag, train1_label, test_size=0.15)
df_10to22 = pd.read_csv("./data/dt_10to22_post.csv")
train2_idx, train2_tag, train2_label = load_train_label(df_10to22, [22,23], 23)

### Model and Train

In [5]:
clf = MultiOutputRegressor(GradientBoostingRegressor(random_state=0, verbose=1))
clf.fit(train1_tag, train1_label)
clf.fit(train2_tag, train2_label)

      Iter       Train Loss   Remaining Time 
         1           0.0225           21.43s
         2           0.0219           20.41s
         3           0.0215           20.10s
         4           0.0211           19.54s
         5           0.0207           18.80s
         6           0.0205           18.15s
         7           0.0203           18.60s
         8           0.0201           17.99s
         9           0.0199           17.56s
        10           0.0198           17.10s
        20           0.0193           14.66s
        30           0.0191           12.54s
        40           0.0190           10.61s
        50           0.0190            8.76s
        60           0.0189            7.00s
        70           0.0189            5.23s
        80           0.0189            3.48s
        90           0.0188            1.76s
       100           0.0188            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.0140           17.19s
        

MultiOutputRegressor(estimator=GradientBoostingRegressor(random_state=0,
                                                         verbose=1))

In [6]:
clf.score(valid1_tag, valid1_label)

0.24562976523042407

### Create result

In [7]:
def create_result(case, output, idx):
    with open("./result/result_%s.csv" % case, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['chid', 'top1', 'top2', 'top3'])
        pred1 = []
        pred2 = []
        pred3 = []
        tag_list = ["2","6","10","12","13","15","18","19","21","22","25","26","36","37","39","48"]
        for i in range(len(output)):
            #print(soft_predicted[i])
            res = np.argpartition(output[i], -3)[-3:]
            #print(res)
            #break
            pred1.append(tag_list[res[0]])
            pred2.append(tag_list[res[1]])
            pred3.append(tag_list[res[2]])
        for i, p1, p2, p3 in zip(idx, pred1, pred2, pred3):
            writer.writerow([str(i), str(p3), str(p2), str(p1)])

In [13]:
def load_test_label(df, label_dt):
    label_dt = ["dt%d" % (label_dt - 1), "dt%d" % (label_dt)]
    label = df[label_dt].values.tolist()
    idx = df['chid'].tolist()
    for j in range(len(label)):
        label[j][1] = np.array(ast.literal_eval(label[j][1]))
    return idx, [i[1] for i in label]

In [15]:
df_only1 = pd.read_csv("./data/dt_only1_post.csv")
df_less10 = pd.read_csv("./data/dt_less10_post.csv")
idx_only1, label_only1 = load_test_label(df_only1, 23)
idx_less10, label_less10 = load_test_label(df_less10, 23)
output_only1 = clf.predict(label_only1)
create_result("only1", output_only1, idx_only1)
output_less10 = clf.predict(label_less10)
create_result("less10", output_less10, idx_less10)

  interactivity=interactivity, compiler=compiler, result=result)
