In [1]:
import pandas as pd

data_dir = "../chapter06"


def _load_split(path):
    """Read one headerless, tab-separated split file into a CATEGORY/TITLE frame."""
    return pd.read_csv(path, sep="\t", header=None, names=['CATEGORY', 'TITLE'])


# The three splits share one on-disk layout, so they go through the same loader.
train = _load_split(f"{data_dir}/train.txt")
valid = _load_split(f"{data_dir}/valid.txt")
test = _load_split(f"{data_dir}/test.txt")

In [2]:
import os
from dotenv import load_dotenv
from gensim.models import KeyedVectors

# Merge variables from a local .env file (if any) into the process environment.
load_dotenv()
FILE_DIR = os.getenv('FILE_DIR')
if not FILE_DIR:
    # os.getenv returns None for a missing variable; without this guard the
    # f-string below would build the path "None/GoogleNews-..." and the load
    # would fail with a misleading file-not-found error.
    raise RuntimeError(
        "FILE_DIR is not set; define it in the environment or in a .env file "
        "so the word2vec binary can be located"
    )

# Word vectors stored in word2vec binary format; used by later cells to embed tokens.
model = KeyedVectors.load_word2vec_format(f"{FILE_DIR}/GoogleNews-vectors-negative300.bin", binary=True)

In [3]:
def category_to_label(category: str):
  """Map a category string to an integer class label.

  The markers 'b', 't', 'e', 'm' are checked in that order with a substring
  test; the position of the first marker found in ``category`` is the label
  (0, 1, 2 or 3). If none of them occurs, -1 is returned.
  """
  # Order matters: an input containing several markers gets the earliest one.
  ordered_markers = ('b', 't', 'e', 'm')
  for label, marker in enumerate(ordered_markers):
    if marker in category:
      return label
  return -1

In [4]:
import re
import spacy
import numpy as np

# Load the "en_core_web_sm" spaCy pipeline with the "ner" and "parser" components
# disabled; get_feature below only reads token.text from the processed document.
nlp = spacy.load("en_core_web_sm", disable=("ner", "parser"))

# Punctuation / symbol characters stripped from every token before lookup.
# Compiled once here instead of once per token.
_CODE_REGEX = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')


def get_feature(text: str):
    """Return the mean word vector of ``text`` as a plain list of floats.

    The text is tokenized with the module-level ``nlp`` pipeline. Each token
    has newlines and the characters matched by ``_CODE_REGEX`` removed; tokens
    that end up empty (or only spaces) are skipped, as are tokens the
    module-level ``model`` has no vector for. The remaining vectors are
    averaged element-wise.

    Args:
        text: Raw input string.

    Returns:
        A list with one float per embedding dimension. When no token yields a
        vector (empty text, punctuation only, all out-of-vocabulary) a zero
        vector of length ``model.vector_size`` is returned, so callers that
        stack results into a tensor always receive rows of equal length.
    """
    word_vectors = []

    for token in nlp(text):
        word = _CODE_REGEX.sub('', token.text.replace("\n", ""))
        if not word.replace(' ', ''):
            continue

        try:
            word_vectors.append(model.get_vector(word))
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is swallowed so
            # that real failures (and Ctrl-C) still surface.
            continue

    if not word_vectors:
        # Previously this path hit an unbound local at `return`.
        return np.zeros(model.vector_size, dtype=np.float32).tolist()

    # Average once, after all tokens are collected (not once per token).
    return np.average(np.array(word_vectors), axis=0).tolist()

In [5]:
import torch

def _titles_to_tensor(frame):
    """Run get_feature over every entry of frame["TITLE"] and stack the results in one tensor."""
    rows = []
    for title in frame["TITLE"]:
        rows.append(get_feature(title))
    return torch.tensor(rows)


train_X = _titles_to_tensor(train)
valid_X = _titles_to_tensor(valid)
test_X = _titles_to_tensor(test)

# Persist each feature tensor next to the notebook, after all three are built.
for _tensor, _out_path in (
    (train_X, "./train_X.pt"),
    (valid_X, "./valid_X.pt"),
    (test_X, "./test_X.pt"),
):
    torch.save(_tensor, _out_path)

In [6]:
def _categories_to_tensor(frame):
    """Map frame["CATEGORY"] through category_to_label and wrap the resulting values in a tensor."""
    labels = frame["CATEGORY"].map(category_to_label)
    return torch.tensor(labels.values)


train_Y = _categories_to_tensor(train)
valid_Y = _categories_to_tensor(valid)
test_Y = _categories_to_tensor(test)

# Persist each label tensor next to the notebook, after all three are built.
for _tensor, _out_path in (
    (train_Y, "./train_Y.pt"),
    (valid_Y, "./valid_Y.pt"),
    (test_Y, "./test_Y.pt"),
):
    torch.save(_tensor, _out_path)