## データとライブラリの読み込み

In [2]:
import pandas as pd
import pandas_profiling
import numpy as np

#google drive読み込み
# from google.colab import drive, files
# drive.mount('/content/drive')
# dataPath = "/content/drive/My Drive/01 Lab/novel_bookmark/datasets/"
# test = pd.read_csv(dataPath + 'test.csv')
# train = pd.read_csv(dataPath +'train.csv')
# sub = pd.read_csv(dataPath +'sample_submission.csv')

#local 読み込み
dataPath = './datasets/'
titlePath = './data/titledata/'

test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

## データの可視化、確認

In [12]:
#データの可視化
# !pip install pandas-profiling
# !pip install --upgrade pandas_profiling
# train.profile_report()

In [13]:
# df = train
# print(len(df[train['fav_novel_cnt_bin']==4]))

# sub = df[train['fav_novel_cnt_bin']==4]
# sub.to_csv('pre_file.csv', index=False)
# files.download('pre_file.csv')

## 特徴量エンジニアリング

In [None]:
test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

data = pd.concat([train, test], sort=False)#データの統合

#必要のないカラムを削除
#ncode:小説コード, writer, userid, pc_or_k:小説の投稿機種 
delete_columns = ['ncode', 'writer', 'userid', 'pc_or_k']
data.drop(delete_columns, axis=1, inplace=True)

#novel_type (1, 2)を(0, 1)に変更
data['novel_type'] = data['novel_type'].map({1: 0, 2: 1})

#'general_firstup'(初回投稿日)を今日までの日数に変換
import datetime
def calc_diff_days(x):#return days(int)
  y_m_d = x.split(" ")[0]
  y, m, d = map(int, y_m_d.split("-"))
  ymd = datetime.datetime(y, m, d)
  diff = datetime.datetime.today() - ymd
  return diff.days
data['general_firstup'] = data['general_firstup'].map(calc_diff_days)

#一時的にテキストデータを削除
text_columns = ['title','story', 'keyword']
data.drop(text_columns, axis=1, inplace=True)

#titleデータをベクトルに変換したものを追加
title_data = pd.read_csv(titlePath+'title_vec_learnedModel.csv', index_col=0)
data = pd.merge(data, title_data, left_index=True, right_index=True)

#dataをtest, trainに分割
train = data[:len(train)]
test = data[len(train):]

train.to_csv('./tmp/tmp_train.csv')
data.head()

### 訓練データと答えに分割

In [18]:
#一時的に欠損値を含むものを削除
train = train.dropna(how='any')

#data分割
x_train = train.drop('fav_novel_cnt_bin', axis=1)#訓練データ
y_train = train['fav_novel_cnt_bin']#訓練データの答え

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

## 勾配ブースティング


In [19]:
import lightgbm as lgb

categorical_features = ['biggenre', 'genre']
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)

params = {
    'objective': 'binary'
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)
pred = model.predict(x_test, num_iteration=model.best_iteration)



[LightGBM] [Info] Number of positive: 11080, number of negative: 9045
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51297
[LightGBM] [Info] Number of data points in the train set: 20125, number of used features: 210
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550559 -> initscore=0.202930
[LightGBM] [Info] Start training from score 0.202930
Training until validation scores don't improve for 10 rounds




[10]	training's binary_logloss: 0.555533	valid_1's binary_logloss: 0.568331
[20]	training's binary_logloss: 0.516254	valid_1's binary_logloss: 0.543378
[30]	training's binary_logloss: 0.491135	valid_1's binary_logloss: 0.534317
[40]	training's binary_logloss: 0.471817	valid_1's binary_logloss: 0.52932
[50]	training's binary_logloss: 0.454242	valid_1's binary_logloss: 0.527133
[60]	training's binary_logloss: 0.438676	valid_1's binary_logloss: 0.526636
[70]	training's binary_logloss: 0.424587	valid_1's binary_logloss: 0.525682
[80]	training's binary_logloss: 0.410665	valid_1's binary_logloss: 0.525257
[90]	training's binary_logloss: 0.39827	valid_1's binary_logloss: 0.525917
Early stopping, best iteration is:
[80]	training's binary_logloss: 0.410665	valid_1's binary_logloss: 0.525257


In [20]:

pred_to_fav = lambda x: x*10//2

# y_pred = y_test.copy()
# y_pred['fav_novel_cnt_bin'] = list(map(pred_to_fav, pred))
# y_pred.drop(index='fav_novel_cnt_bin', axis=0)
# y_pred['fav_novel_cnt_bin']=pd.to_numeric(y_pred['fav_novel_cnt_bin'])

y_pred = pred_to_fav(pred)
y_test_array = y_test.values


#正答率の確認
from sklearn.metrics import accuracy_score
print('accuracy = ', accuracy_score(y_true=y_test_array, y_pred=y_pred))


accuracy =  0.2064
