## データの読み込み

In [1]:
import pandas as pd
import numpy as np

#local 読み込み
dataPath = "datasets/"
keyPath = 'data/keyworddata/'

## keywordを抽出

In [2]:
test = pd.read_csv(dataPath + 'test.csv')
train = pd.read_csv(dataPath +'train.csv')
sub = pd.read_csv(dataPath +'sample_submission.csv')

data = pd.concat([train, test], sort=False)#データの統合

data['keyword'].to_csv(keyPath + 'origin.csv')

## 分散表現に変換

### 学習済みモデルの読み込み

In [2]:
import gensim
from pprint import pprint

# chiVeデータのPATH（kv:KeyedVectors）
model_path = './jalang/entity_vector.model.txt'
# モデルの読み込み
model = gensim.models.KeyedVectors.load_word2vec_format(model_path)

### 自作学習モデル

In [5]:
from gensim.models import word2vec

data = pd.read_csv(keyPath+'origin.csv', index_col=0)

#[[文章の単語], [文章の単語]]を作成
title_dic = []
def make_title_dic(text):
  if text is np.nan:
    return
  title_dic.append(text.split(' '))
data['keyword'].map(make_title_dic)

#学習
model = word2vec.Word2Vec(title_dic,vector_size=200, min_count=5, window=5, epochs=20)

#作成した辞書をcsvに保存して可視化(処理には関係ない)
from gensim import corpora#辞書を作るためのもの
dictionary = corpora.Dictionary(title_dic)#textsをもとに辞書を作成します
dictionary.filter_extremes(no_below=5)#出現文書数が5回以下のものはさようなら
dictionary.save_as_text(keyPath + 'dictionary.csv')#辞書を保存
model = model.wv
# In[4]:
print('語彙の数:', len(model.index_to_key))  # 語彙の数
print(model.index_to_key[:10])

語彙の数: 3789
['日常', 'R15', '残酷な描写あり', '男主人公', '現代', '青春', 'シリアス', '女主人公', 'ほのぼの', '近未来']


### 分散表現に変換

In [6]:
#文章の分散表現を求める(単語ベクトルの平均)
def getSentenceVector(text):
  if text is np.nan:
    return np.array([np.nan for _ in range(200)])
  L = []
  nonL = []
  for w in text.split(' '):
    if w in model.key_to_index:
      L.append(model.get_vector(key=w))
    else:
      nonL.append(w)#辞書に場合
  #辞書にないものを出力
  with open('./data/keyworddata/nonDic.txt', 'w') as f:
    for d in nonL:
      try:
        f.write("%s\n" % d)
      except Exception as e:
        print(d, e)
  if len(L)==0:
    return np.array([np.nan for _ in range(200)])
  return np.array(L).mean(axis=0)

#分散表現をnp.arrayの形に変換
def makeVectorArrayList(data):
  vec_list = []
  for index, row in data.iterrows():
    vec_list.append(getSentenceVector(row['keyword']))
  return np.array(vec_list)

data = pd.read_csv(keyPath+'origin.csv', index_col=0)
vec_list_array = makeVectorArrayList(data)
vec_data = pd.DataFrame(data=vec_list_array, dtype='float')
vec_data.to_csv(keyPath+'key_vec_originalModel.csv')


Jóvenes 'cp932' codec can't encode character '\xf3' in position 1: illegal multibyte sequence
OVL大賞7M‬ 'cp932' codec can't encode character '\u202c' in position 7: illegal multibyte sequence
集英社小説大賞２‬ 'cp932' codec can't encode character '\u202c' in position 8: illegal multibyte sequence
HJ2021‬ 'cp932' codec can't encode character '\u202c' in position 6: illegal multibyte sequence
李大根イ・デグン 'cp932' codec can't encode character '\uf9e1' in position 0: illegal multibyte sequence
李鍾彬イ・ジョンビン 'cp932' codec can't encode character '\uf9e1' in position 0: illegal multibyte sequence
OVL大賞7M‬ 'cp932' codec can't encode character '\u202c' in position 7: illegal multibyte sequence
集英社小説大賞２‬ 'cp932' codec can't encode character '\u202c' in position 8: illegal multibyte sequence
HJ2021‬ 'cp932' codec can't encode character '\u202c' in position 6: illegal multibyte sequence
龐統 'cp932' codec can't encode character '\u9f90' in position 0: illegal multibyte sequence
異世界⇆現実世界 'cp932' codec can't encode c

In [7]:
len(vec_data)

48522