In [None]:
# Load the word vectors
from gensim.models import KeyedVectors

# binary=True tells the loader that the file is in the binary word2vec format
# rather than the text format.
file = 'GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(file, binary=True)

# Load the data
import csv
import pandas as pd
import re
import numpy as np

# Read the tab-separated corpus file (no header row; column names are given
# explicitly).
# quoting=csv.QUOTE_NONE makes the parser treat '"' as an ordinary character.
# With the default quoting, a field that starts with an unbalanced double quote
# opens a "quoted field" that swallows the following tabs and newlines, which
# silently merges several records into one.
file = 'newsCorpora.csv'
data = pd.read_csv(
    file,
    encoding='utf-8',
    header=None,
    sep='\t',
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
    quoting=csv.QUOTE_NONE,
)
# Turn double quotes inside the text into single quotes. regex=True is required
# for substring replacement: without it, DataFrame.replace only matches cells
# whose entire value equals '"'.
data = data.replace('"', "'", regex=True)
# Keep only the articles of the selected publishers, and only the two columns
# used downstream.
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
data = data.loc[data['PUBLISHER'].isin(publishers), ['TITLE', 'CATEGORY']].reset_index(drop=True)

# Split into training, validation and evaluation sets
from sklearn.model_selection import train_test_split

# First hold out 20% of the rows, stratified by category ...
train, valid_test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=64,
    stratify=data['CATEGORY'],
)
# ... then cut the held-out part in half, again stratified by category.
valid, test = train_test_split(
    valid_test,
    test_size=0.5,
    shuffle=True,
    random_state=64,
    stratify=valid_test['CATEGORY'],
)

# Renumber the index of every split from 0.
train, valid, test = (part.reset_index(drop=True) for part in (train, valid, test))

# Check the number of rows per category in each split.
for split_name, split_df in (('学習データ', train), ('検証データ', valid), ('評価データ', test)):
    print(split_name)
    print(split_df['CATEGORY'].value_counts())

import re
from nltk import stem

# Concatenate the splits (train rows first, then valid, then test) into a
# single frame with a fresh 0..n-1 index.
df = pd.concat([train, valid, test], ignore_index=True)

# Preprocessing
def preprocessing(text):
    """Clean a piece of text before it is split into words.

    Three substitutions are applied in order:

    1. every character listed in the first pattern (quotes, punctuation,
       brackets and assorted symbols) is deleted;
    2. every run of ASCII digits is collapsed to a single ``0``;
    3. a hyphen with one whitespace character on each side is replaced by a
       single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text_clean = re.sub(r'[\"\'.,:;\(\)#\|\*\+\!\?#$%&/\]\[\{\}]', '', text)
    text_clean = re.sub(r'[0-9]+', '0', text_clean)
    # Raw string on purpose: '\s' in a plain literal is an invalid escape
    # sequence, which Python 3.12+ reports as a SyntaxWarning.
    text_clean = re.sub(r'\s-\s', ' ', text_clean)
    return text_clean

# Run the preprocessing over every title and write the result back in place.
cleaned_titles = df['TITLE'].apply(preprocessing)
df['TITLE'] = cleaned_titles


In [None]:
import numpy as np
# Get the mean word vector
def w2v(text):
    """Return the mean vector of the whitespace-separated words of *text*.

    Words that are not in the module-level ``model`` are skipped.

    Args:
        text: The string to embed.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of the known
        words. If no word is known (including an empty string), a zero vector
        of length ``model.vector_size`` is returned, so the caller still gets
        exactly one vector per text.
    """
    words = text.split()
    vec = [model[word] for word in words if word in model]
    if not vec:
        # sum([]) / len([]) would divide by zero; fall back to a zero vector
        # with the same dtype as the model so stacking does not upcast.
        return np.zeros(model.vector_size, dtype=model.vectors.dtype)
    return np.array(sum(vec) / len(vec))

# Build the feature matrix with one row per title, in a single stacking step.
# Calling np.vstack inside the loop copied the whole matrix again for every
# title (quadratic time), and a single title left `vecs` one-dimensional.
# np.vstack raises ValueError when there are no titles at all.
vecs = np.vstack([w2v(text) for text in df['TITLE']])

# Convert the feature vectors to tensors
import torch

# Set the random seeds
torch.manual_seed(1234)
np.random.seed(1234)

# Row boundaries of the three splits inside `vecs`: the first len(train) rows,
# the next len(valid) rows, and everything after that.
train_end = len(train)
valid_end = train_end + len(valid)

X_train = torch.from_numpy(vecs[:train_end, :])
X_valid = torch.from_numpy(vecs[train_end:valid_end, :])
X_test = torch.from_numpy(vecs[valid_end:, :])
print(X_train.size())
print(X_train)