## データとライブラリの読み込み

In [5]:
import pandas as pd
import pandas_profiling
import numpy as np

#google drive読み込み
# from google.colab import drive, files
# drive.mount('/content/drive')
# dataPath = "/content/drive/My Drive/01 Lab/novel_bookmark/datasets/"
# test = pd.read_csv(dataPath + 'test.csv')
# train = pd.read_csv(dataPath +'train.csv')
# sub = pd.read_csv(dataPath +'sample_submission.csv')

#local 読み込み
dataPath = './datasets/'
titlePath = './data/titledata/'
keyPath = './data/keyworddata/'
storyPath = './data/storydata/'
allPath = './data/all/'

test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

## 特徴量エンジニアリング

In [22]:
test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

data = pd.concat([train, test], sort=False)#データの統合
data = data.reset_index()#indexが重複するためindexを振り直し

print("len(data):", len(data))

#必要のないカラムを削除
#ncode:小説コード, writer, userid, pc_or_k:小説の投稿機種
delete_columns = ['ncode', 'writer', 'userid', 'pc_or_k']
data.drop(delete_columns, axis=1, inplace=True)

#novel_type (1, 2)を(0, 1)に変更
data['novel_type'] = data['novel_type'].map({1: 0, 2: 1})

#'general_firstup'(初回投稿日)を今日までの日数に変換
import datetime
def calc_diff_days(x):#return days(int)
  y_m_d = x.split(" ")[0]
  y, m, d = map(int, y_m_d.split("-"))
  ymd = datetime.datetime(y, m, d)
  diff = datetime.datetime.today() - ymd
  return diff.days
data['general_firstup'] = data['general_firstup'].map(calc_diff_days)

#テキストデータを削除
text_columns = ['title','story', 'keyword']
data.drop(text_columns, axis=1, inplace=True)

#titleデータをベクトルに変換したものを追加
# title_data = pd.read_csv(titlePath+'title_vec_learnedModel.csv', index_col=0)#学習済みモデル
# title_data = pd.read_csv(titlePath+'title_vec_originalLearned_300.csv', index_col=0)#自作辞書モデル
title_data = pd.read_csv(allPath+'title_vec_originalLearned_200.csv', index_col=0)#自作辞書モデル
data = pd.merge(data, title_data, left_index=True, right_index=True, sort=False, how='inner')

#keywordデータをベクトルに変換したものを追加
# key_data = pd.read_csv(keyPath+'key_vec_learnedModel.csv', index_col=0)#学習済みモデル
# key_data = pd.read_csv(keyPath+'key_vec_originalModel.csv', index_col=0)#自作辞書モデル
key_data = pd.read_csv(allPath+'key_vec_originalLearned_200.csv', index_col=0)#自作辞書モデル
data = pd.merge(data, key_data, left_index=True, right_index=True, sort=False, how='inner')

#storyデータをベクトルに変換したものを追加
# story_data = pd.read_csv(storyPath+'vec_learnedModel.csv', index_col=0)#学習済みモデル
story_data = pd.read_csv(allPath+'story_vec_originalLearned_200.csv', index_col=0)#自作辞書モデル
data = pd.merge(data, story_data, left_index=True, right_index=True, sort=False, how='inner')

#dataをtest, trainに分割
train = data[:len(train)]
test = data[len(train):]

print("len(data):", len(data))


len(data): 48522
len(data): 48522


### アンダーサンプリング

In [41]:
#一時的に欠損値を含むものを削除
# train = train.dropna(how='any')

train0 = train[train['fav_novel_cnt_bin']==0]
train1 = train[train['fav_novel_cnt_bin']==1]
train2 = train[train['fav_novel_cnt_bin']==2]
train3 = train[train['fav_novel_cnt_bin']==3]
train4 = train[train['fav_novel_cnt_bin']==4]


#todo train list 等に突っ込んでアンダーサンプリングでloggingする
num = len(train4)
train = pd.concat([train0[:num], train1[:num], train2[:num], train3[:num], train4[:num]], sort=False)#データの統合
print(len(train))

4070


### オーバーサンプリング

#### SMOTE

In [23]:
#data分割
train = train.dropna(how='any')
x_train = train.drop('fav_novel_cnt_bin', axis=1)#訓練データ
y_train = train['fav_novel_cnt_bin']#訓練データの答え

from imblearn.over_sampling import SMOTE
sm = SMOTE()
x_resampled, y_resampled = sm.fit_resample(x_train, y_train)
x_train = x_resampled
y_train = y_resampled

In [8]:
train0 = train[train['fav_novel_cnt_bin']==0]
train1 = train[train['fav_novel_cnt_bin']==1]
train2 = train[train['fav_novel_cnt_bin']==2]
train3 = train[train['fav_novel_cnt_bin']==3]
train4 = train[train['fav_novel_cnt_bin']==4]

num_train0 = len(train0)
train = pd.concat(
  [
    train0.sample(n=num_train0, replace=True),
    train1.sample(n=num_train0, replace=True),
    train2.sample(n=num_train0, replace=True),
    train3.sample(n=num_train0, replace=True),
    train4.sample(n=num_train0, replace=True),
  ],
  sort=False
)

### 訓練データと答えに分割

In [22]:
#一時的に欠損値を含むものを削除
# train = train.dropna(how='any')

#data分割
x_train = train.drop('fav_novel_cnt_bin', axis=1)#訓練データ
y_train = train['fav_novel_cnt_bin']#訓練データの答え

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

### 訓練データ

In [9]:
#一時的に欠損値を含むものを削除
# train = train.dropna(how='any')

#data分割
x_train = train.drop('fav_novel_cnt_bin', axis=1)#訓練データ
y_train = train['fav_novel_cnt_bin']#訓練データの答え

## 機械学習


### 勾配ブースティング

In [None]:
import lightgbm as lgb

# モデルの学習
model = lgb.LGBMClassifier() # モデルのインスタンスの作成
model.fit(x_train, y_train) # モデルの学習

# テストデータの予測クラス (予測クラス(0 or 1 or 2)を返す)
y_pred = model.predict(x_valid)

# テストデータのクラス予測確率 (各クラスの予測確率 [クラス0の予測確率,クラス1の予測確率,クラス2の予測確率] を返す)
y_pred_prob = model.predict_proba(x_valid)

### ランダムフォレスト

In [24]:
from sklearn.ensemble import RandomForestClassifier

randomforest = RandomForestClassifier()
randomforest.fit(x_train, y_train)
# y_pred = randomforest.predict(x_valid)
# y_pred_prob = randomforest.predict_proba(x_valid)

RandomForestClassifier()

### ロジスティック回帰

In [26]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
pred = logreg.predict(x_valid)
y_pred = logreg.predict(x_valid)
y_pred_prob = logreg.predict_proba(x_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


NameError: name 'x_valid' is not defined

In [24]:
#正答率の確認
from sklearn.metrics import accuracy_score
def calc_score(y_true, y_pred):
  tmp = []
  for i in range(len(y_true)):
    tmp.append([0,0,0,0,0])
    tmp[i][int(y_true[i])] = 1
  tmp = np.array(tmp)
  #確率が極端な場合クリップ（損失が大きくなりすぎるため）
  y_pred = np.where(y_pred<=0.001, 0.001, y_pred)
  y_pred = np.where(y_pred>=0.999, 0.999, y_pred)
  return -np.sum(tmp * np.log(y_pred)) / len(y_true)

print('accuracy = ', accuracy_score(y_true=y_valid.values, y_pred=y_pred))
print('score = ', calc_score(y_true=y_valid.values, y_pred=y_pred_prob))

accuracy =  0.9363223568951339
score =  0.3472765180111649


### submission用に変換

In [28]:

model = logreg

# test = test.drop('fav_novel_cnt_bin', axis=1)
y_pred = model.predict(test)
y_pred_prob = model.predict_proba(test)

#確率が極端な場合クリップ（損失が大きくなりすぎるため）
y_pred_prob = np.where(y_pred_prob<=0.001, 0.001, y_pred_prob)
y_pred_prob = np.where(y_pred_prob>=0.999, 0.999, y_pred_prob)

predData = pd.DataFrame(y_pred_prob)
sub = pd.read_csv(dataPath +'sample_submission.csv')
sub['proba_0'] = predData[0]
sub['proba_1'] = predData[1]
sub['proba_2'] = predData[2]
sub['proba_3'] = predData[3]
sub['proba_4'] = predData[4]

import datetime
now = datetime.datetime.now()
filename = str(now.year)+'-'+str(now.month)+'-'+ str(now.day) +'-'+ str(now.hour)+'-'+str(now.minute)
sub.to_csv('./data/sub/'+ filename +'.csv', index=False)


Feature names unseen at fit time:
- fav_novel_cnt_bin
Feature names must be in the same order as they were in fit.



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').