## データとライブラリの読み込み

In [2]:
import pandas as pd
import pandas_profiling
import numpy as np

#google drive読み込み
# from google.colab import drive, files
# drive.mount('/content/drive')
# dataPath = "/content/drive/My Drive/01 Lab/novel_bookmark/datasets/"
# test = pd.read_csv(dataPath + 'test.csv')
# train = pd.read_csv(dataPath +'train.csv')
# sub = pd.read_csv(dataPath +'sample_submission.csv')

#local 読み込み
dataPath = './datasets/'
titlePath = './data/titledata/'

test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

## データの可視化、確認

In [12]:
#データの可視化
# !pip install pandas-profiling
# !pip install --upgrade pandas_profiling
# train.profile_report()

In [13]:
# df = train
# print(len(df[train['fav_novel_cnt_bin']==4]))

# sub = df[train['fav_novel_cnt_bin']==4]
# sub.to_csv('pre_file.csv', index=False)
# files.download('pre_file.csv')

## 特徴量エンジニアリング

In [97]:
test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

data = pd.concat([train, test], sort=False)#データの統合
data = data.reset_index()#indexが重複するためindexを振り直し

#必要のないカラムを削除
#ncode:小説コード, writer, userid, pc_or_k:小説の投稿機種
delete_columns = ['ncode', 'writer', 'userid', 'pc_or_k']
data.drop(delete_columns, axis=1, inplace=True)

#novel_type (1, 2)を(0, 1)に変更
data['novel_type'] = data['novel_type'].map({1: 0, 2: 1})

#'general_firstup'(初回投稿日)を今日までの日数に変換
import datetime
def calc_diff_days(x):#return days(int)
  y_m_d = x.split(" ")[0]
  y, m, d = map(int, y_m_d.split("-"))
  ymd = datetime.datetime(y, m, d)
  diff = datetime.datetime.today() - ymd
  return diff.days
data['general_firstup'] = data['general_firstup'].map(calc_diff_days)

#一時的にテキストデータを削除
text_columns = ['title','story', 'keyword']
data.drop(text_columns, axis=1, inplace=True)

#titleデータをベクトルに変換したものを追加
title_data = pd.read_csv(titlePath+'title_vec_learnedModel.csv', index_col=0)
data = pd.merge(data, title_data, left_index=True, right_index=True, sort=False, how='inner')

#dataをtest, trainに分割
train = data[:len(train)]
test = data[len(train):]


### アンダーサンプリング

In [None]:
#一時的に欠損値を含むものを削除
train = train.dropna(how='any')

train0 = train[train['fav_novel_cnt_bin']==0]
train1 = train[train['fav_novel_cnt_bin']==1]
train2 = train[train['fav_novel_cnt_bin']==2]
train3 = train[train['fav_novel_cnt_bin']==3]
train4 = train[train['fav_novel_cnt_bin']==4]

num = len(train4)
train = pd.concat([train0[:num], train1[:num], train2[:num], train3[:num], train4[:num]], sort=False)#データの統合
print(len(train))

### 訓練データと答えに分割

In [107]:
#一時的に欠損値を含むものを削除
train = train.dropna(how='any')

#data分割
x_train = train.drop('fav_novel_cnt_bin', axis=1)#訓練データ
y_train = train['fav_novel_cnt_bin']#訓練データの答え

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

## 勾配ブースティング


In [108]:
# モデルの学習
model = lgb.LGBMClassifier() # モデルのインスタンスの作成
model.fit(x_train, y_train) # モデルの学習

# テストデータの予測クラス (予測クラス(0 or 1 or 2)を返す)
y_pred = model.predict(x_valid)

# テストデータのクラス予測確率 (各クラスの予測確率 [クラス0の予測確率,クラス1の予測確率,クラス2の予測確率] を返す)
y_pred_prob = model.predict_proba(x_valid)

#正答率の確認
from sklearn.metrics import accuracy_score
print('accuracy = ', accuracy_score(y_true=y_valid.values, y_pred=y_pred))

In [123]:

# test = test.drop('fav_novel_cnt_bin', axis=1)
y_pred = model.predict(test)
y_pred_prob = model.predict_proba(test)

predData = pd.DataFrame(y_pred_prob)
sub = pd.read_csv(dataPath +'sample_submission.csv')
sub['proba_0'] = predData[0]
sub['proba_1'] = predData[1]
sub['proba_2'] = predData[2]
sub['proba_3'] = predData[3]
sub['proba_4'] = predData[4]
sub.to_csv('./data/sub/2021-10-15.csv', index=False)

In [101]:
import lightgbm as lgb

categorical_features = ['biggenre', 'genre']
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)

params = {
    'objective': 'binary'
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

# pred = model.predict(x_test, num_iteration=model.best_iteration)

[LightGBM] [Info] Number of positive: 2169, number of negative: 543
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51550
[LightGBM] [Info] Number of data points in the train set: 2712, number of used features: 211
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799779 -> initscore=1.384912
[LightGBM] [Info] Start training from score 1.384912
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.23615	valid_1's binary_logloss: 0.328257
[20]	training's binary_logloss: 0.146423	valid_1's binary_logloss: 0.296546
[30]	training's binary_logloss: 0.0995096	valid_1's binary_logloss: 0.290697
Early stopping, best iteration is:
[27]	training's binary_logloss: 0.111168	valid_1's binary_logloss: 0.290334


In [43]:

#0~4にマッピング
pred_to_fav = lambda x: x*10//2

pred = model.predict(x_test, num_iteration=model.best_iteration)

# y_pred = y_test.copy()
# y_pred['fav_novel_cnt_bin'] = list(map(pred_to_fav, pred))
# y_pred.drop(index='fav_novel_cnt_bin', axis=0)
# y_pred['fav_novel_cnt_bin']=pd.to_numeric(y_pred['fav_novel_cnt_bin'])

y_pred = pred_to_fav(pred)
y_test_array = y_test.values

#正答率の確認
from sklearn.metrics import accuracy_score
print('accuracy = ', accuracy_score(y_true=y_test_array, y_pred=y_pred))

# pred = model.predict(test, num_iteration=model.best_iteration)
# pred
test.to_csv('./tmp/a.csv')
# x_train.head()

accuracy =  0.2064
