In [11]:
from gensim.models import KeyedVectors

# Absolute path (under /content/drive) to the GoogleNews-vectors-negative300.bin file.
file = '/content/drive/MyDrive/Colab Notebooks/chapter07/GoogleNews-vectors-negative300.bin'
# Load the vectors through gensim's word2vec-format reader; binary=True selects the
# binary variant of that format. The result is kept in the module-level name `model`.
model = KeyedVectors.load_word2vec_format(file, binary=True)

In [12]:
# Load the train / test / valid splits
import pandas as pd


def _load_split(path):
    """Read one tab-separated split file into a two-column frame.

    The first line of the file is consumed as the header row (header=0) and the
    columns are renamed to TITLE and CATEGORY. The index is then renumbered from
    zero, discarding whatever index the reader produced.
    """
    frame = pd.read_csv(path, sep="\t", header=0, names=["TITLE","CATEGORY"])
    return frame.reset_index(drop=True)


train = _load_split('/content/drive/MyDrive/Colab Notebooks/chapter08/train.txt')
test = _load_split('/content/drive/MyDrive/Colab Notebooks/chapter08/test.txt')
valid = _load_split('/content/drive/MyDrive/Colab Notebooks/chapter08/valid.txt')

# Show how many examples each split holds per category
for heading, split in (('学習データ', train), ('検証データ', valid), ('評価データ', test)):
    print(heading)
    print(split['CATEGORY'].value_counts())


学習データ
CATEGORY
b    4502
e    4223
t    1219
m     728
Name: count, dtype: int64
検証データ
CATEGORY
b    562
e    528
t    153
m     91
Name: count, dtype: int64
評価データ
CATEGORY
b    563
e    528
t    152
m     91
Name: count, dtype: int64


In [13]:
import re
from nltk import stem

# Stack the three splits (train, then valid, then test) into one frame and
# renumber the rows from zero so positions in `df` are contiguous.
stacked = pd.concat([train, valid, test], axis=0)
df = stacked.reset_index(drop=True)

# Preprocessing (normalise the raw headline text)
def preprocessing(text):
    """Clean a headline before it is split into words.

    Three substitutions are applied in order:
      1. punctuation / symbol characters in the character class below are deleted;
      2. every run of ASCII digits is replaced by a single '0';
      3. a hyphen with a whitespace character on each side is replaced by one space.

    Args:
        text (str): The raw headline.

    Returns:
        str: The cleaned headline.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re.sub``).
    """
    # Deleted outright (empty replacement), so "Fed's" becomes "Feds".
    text_clean = re.sub(r'[\"\'.,:;\(\)#\|\*\+\!\?#$%&/\]\[\{\}]', '', text)
    # Collapse each number to '0' so that different numbers map to the same token.
    text_clean = re.sub(r'[0-9]+', '0', text_clean)
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    # Hyphens inside a word (no surrounding whitespace) are left alone.
    text_clean = re.sub(r'\s-\s', ' ', text_clean)
    return text_clean

# Run the preprocessing function over every title and write the result back
# into the same column of `df`.
cleaned_titles = df['TITLE'].apply(preprocessing)
df['TITLE'] = cleaned_titles

In [14]:
import numpy as np
# Average word vector of a text
def w2v(text, vectors=None):
    """Return the mean of the vectors of the words in ``text``.

    The text is split on whitespace; words that are not present in the lookup
    table are skipped.

    Args:
        text (str): Whitespace-separated words.
        vectors: Optional lookup table supporting ``word in vectors``,
            ``vectors[word]`` and a ``vector_size`` attribute. When omitted,
            the module-level ``model`` is used.

    Returns:
        numpy.ndarray: The element-wise mean of the found word vectors, or an
        all-zero float32 vector of length ``vector_size`` when no word of
        ``text`` is found (including when ``text`` is empty).
    """
    kv = model if vectors is None else vectors
    words = text.rstrip().split()
    vec = [kv[word] for word in words if word in kv]
    if not vec:
        # Without this guard the division below is 0 / 0 and raises
        # ZeroDivisionError for titles made only of unknown words.
        return np.zeros(kv.vector_size, dtype=np.float32)
    return np.array(sum(vec) / len(vec))

# Build the feature matrix: one row per title, in the row order of `df`.
# The rows are collected first and stacked once. Calling np.vstack inside the
# loop re-copies the whole matrix on every iteration, and leaves a 1-D array
# when there is exactly one title, which the 2-D slicing that follows cannot index.
# Note: an empty `df` makes np.vstack raise ValueError here.
vecs = np.vstack([w2v(text) for text in df['TITLE']])

# Convert the feature matrix into tensors
import torch

# Fix the random seeds of torch and numpy
torch.manual_seed(1234)
np.random.seed(1234)

# Row offsets at which `vecs` is cut: the first len(train) rows go to the
# training tensor, the next len(valid) rows to validation, the rest to test.
train_end = len(train)
valid_end = train_end + len(valid)

X_train = torch.from_numpy(vecs[:train_end, :])
X_valid = torch.from_numpy(vecs[train_end:valid_end, :])
X_test = torch.from_numpy(vecs[valid_end:, :])
print(X_train.size())
print(X_train)

torch.Size([10672, 300])
tensor([[ 0.0368,  0.0300, -0.0738,  ..., -0.1523,  0.0419, -0.0774],
        [ 0.0002, -0.0056, -0.0824,  ..., -0.0544,  0.0776, -0.0214],
        [ 0.0266, -0.0166, -0.0877,  ..., -0.0522,  0.0517,  0.0093],
        ...,
        [-0.0291,  0.0529, -0.1453,  ...,  0.0494,  0.1548, -0.0910],
        [-0.0269,  0.1204, -0.0289,  ..., -0.0062,  0.0739, -0.0327],
        [ 0.0361,  0.1236,  0.0260,  ..., -0.0099, -0.0193,  0.0262]])


In [16]:
# Convert the targets into tensors
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}


def _labels_to_tensor(frame):
    """Map the CATEGORY column of ``frame`` through ``category_dict`` and wrap it as a tensor."""
    encoded = frame['CATEGORY'].map(category_dict)
    return torch.from_numpy(encoded.values)


Y_train = _labels_to_tensor(train)
Y_valid = _labels_to_tensor(valid)
Y_test = _labels_to_tensor(test)

# Save every tensor to its own file (features first, then labels)
for tensor, destination in (
    (X_train, '/content/drive/MyDrive/Colab Notebooks/chapter08/X_train.pt'),
    (X_valid, '/content/drive/MyDrive/Colab Notebooks/chapter08/X_valid.pt'),
    (X_test, '/content/drive/MyDrive/Colab Notebooks/chapter08/X_test.pt'),
    (Y_train, '/content/drive/MyDrive/Colab Notebooks/chapter08/y_train.pt'),
    (Y_valid, '/content/drive/MyDrive/Colab Notebooks/chapter08/y_valid.pt'),
    (Y_test, '/content/drive/MyDrive/Colab Notebooks/chapter08/y_test.pt'),
):
    torch.save(tensor, destination)