In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import spacy

# Library used for Doc2Vec document embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Load the competition data. The dataset directory is defined once so the
# three reads below cannot drift apart.
DATA_DIR = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')


# Inspect the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Raw essay text of the first two training rows.
train_df['full_text'].head(2).values

In [None]:
# Scores of the first two training rows.
train_df['score'].head(2).values

In [None]:
# Try a simple Doc2Vec approach.
# Preprocessing setup: the preprocessing below only reads each token's lemma
# and stop-word flag, so the dependency parser and named-entity recogniser are
# disabled to avoid running them on every essay.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Text preprocessing
def preprocess_text(text):
    """Return the lemmas of all non-stop-word tokens in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words are skipped; the lemma of every other token is
    collected in document order. No other filtering is applied.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas


In [None]:
# Preprocess the training essays into lists of lemmas, stored in a new column.

train_df['full_text_'] = train_df['full_text'].map(preprocess_text)

In [None]:
# Show the last two rows, including the newly added 'full_text_' column.
train_df.tail(2)

In [None]:
# Tokenised training essays, one list of lemmas per row.
full_text_series = train_df['full_text_']

full_text_series.iloc[:2]

In [None]:
# Wrap each tokenised essay in a TaggedDocument tagged with its position.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(full_text_series)
]

In [None]:
# Sanity check: there is one tagged document per training row.
len(train_df) == len(tagged_data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Set up the Doc2Vec model.
doc2vec_kwargs = {
    'vector_size': 300,
    'window': 5,
    'min_count': 1,
    'workers': 4,
}
model = Doc2Vec(**doc2vec_kwargs)


In [None]:

# Build the Doc2Vec vocabulary from the tagged training documents.
model.build_vocab(tagged_data)


In [None]:

# Train Doc2Vec on the tagged training documents for 100 epochs.
model.train(tagged_data, total_examples=model.corpus_count, epochs=100)

In [None]:
# Infer a vector for every tokenised training essay.

vectors_ = list(map(model.infer_vector, full_text_series))

vectors_[:2]

In [None]:

# Stack the per-essay vectors into a single array.
vectors = np.asarray(vectors_)
vectors[:2]

In [None]:

# Check the dimensions of the stacked training vectors.
vectors.shape

In [None]:
# Features (document vectors) and target (essay score).
X, y = vectors, train_df['score']


In [None]:
# The scores in y run from 1 to 6; re-encode them as 0 to 5 to match the class
# indices LightGBM works with.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(y)

y_ = le.transform(y)

In [None]:
# Smallest and largest encoded label, one per line.
print(y_.min(), y_.max(), sep='\n')

In [None]:
# Shapes of the feature matrix and the encoded target.
for array in (X, y_):
    print(array.shape)

In [None]:
# Check the dtype of the encoded labels.
y_.dtype

In [None]:

# Hold out 20% of the rows for validation (fixed random_state).
split_parts = train_test_split(X, y_, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

# y_ has six classes, 0 to 5.

In [None]:
# Train the LightGBM multiclass model.

lgb_train = lgb.Dataset(X_train, y_train)

# The validation Dataset is created with the training Dataset as its reference.
lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'objective': 'multiclass',  # multi-class classification
    'num_class': 6,  # number of classes (encoded labels 0 to 5)
    'metric': 'multi_logloss'  # evaluate with multi-class log loss
}

# Stop once the validation multi_logloss has not improved for 10 rounds,
# rather than always running all 1000 rounds (which can overfit the training
# split). The callback form is used because recent LightGBM releases no longer
# accept `early_stopping_rounds` as a keyword argument of lgb.train. When a
# best iteration is recorded, Booster.predict uses it by default.
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)


In [None]:
# Model evaluation

# Predict on the held-out validation features with the trained LightGBM model.
y_pred = lgb_model.predict(X_valid)

y_pred


In [None]:
# Convert the predicted class probabilities to class indices.
# A separate name is used so `y_pred` keeps the probability matrix and this
# cell can be re-run: rebinding `y_pred` to the 1-D class vector made a second
# run fail at argmax(axis=1).
y_pred_class = np.argmax(y_pred, axis=1)

# Mean squared error between the true and predicted class indices.
mean_squared_error(y_valid, y_pred_class)

In [None]:
# Tokenise the test essays with the same preprocessing as the training essays.
# The column is overwritten in place, so rows that already hold a token list
# are left untouched; this keeps the cell safe to re-run.
test_df['full_text'] = test_df['full_text'].apply(
    lambda text: text if isinstance(text, list) else preprocess_text(text)
)

In [None]:
# Tokenised test essays, one list of lemmas per row.
test_full_text_series = test_df['full_text']

test_full_text_series.iloc[:3]

In [None]:
# Wrap each tokenised test essay in a TaggedDocument tagged with its position.
tagged_data_test = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(test_full_text_series)
]

In [None]:
# NOTE: the Doc2Vec model is deliberately NOT trained further on the test essays.
# The LightGBM model was fitted on vectors inferred from the Doc2Vec model as
# trained on the training essays. Training it again here would change the
# embedding, so vectors inferred for the test essays would no longer be
# comparable with the features the classifier was trained on. The removed call
# also passed the training corpus count as `total_examples` for the test
# corpus, and the test documents reuse tags 0..n-1 that already belong to
# training documents. Test vectors are obtained with `model.infer_vector` only.

In [None]:
# Infer a vector for every tokenised test essay.

vectors_test_ = list(map(model.infer_vector, test_full_text_series))


In [None]:
# Stack the per-essay test vectors into a single array.
vectors_test = np.asarray(vectors_test_)

In [None]:
# Check the dimensions of the stacked test vectors.
vectors_test.shape

In [None]:
# Predict on the test vectors with the trained LightGBM model.
prediction = lgb_model.predict(vectors_test)

In [None]:
# Largest and smallest value in the prediction output.
prediction_max, prediction_min = prediction.max(), prediction.min()
print(prediction_max, prediction_min)

In [None]:
# Take the index of the highest-scoring class for each row.
predict = prediction.argmax(axis=1)
predict

In [None]:
# Map the class indices back to the original score labels via the fitted
# label encoder.
Y_pre = le.inverse_transform(
    predict
)
Y_pre


In [None]:
# Put the predicted scores into the submission frame's 'score' column.
submission_df = submission_df.assign(score=Y_pre)
submission_df

In [None]:
# Write the submission file without the index column.
submission_df.to_csv('/kaggle/working/submission.csv', index=False)