In [1]:
import pickle

import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [5]:
# Load the word2vec embeddings stored in binary word2vec format as a
# gensim KeyedVectors object (word -> vector lookup table).
word2vec_path = 'data/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [6]:
# The corpus file is tab-separated and has no header row, so the column
# names are attached by hand after loading.
# NOTE(review): read_csv's default quote handling can merge rows when a field
# contains an unbalanced double quote; if the loaded row count is lower than
# the dataset's documented size, consider quoting=csv.QUOTE_NONE — confirm.
columns = ['id', 'title', 'url', 'publisher', 'category', 'story', 'hostname', 'timestamp']
df = pd.read_csv('data/newsCorpora.csv', sep='\t', header=None)
df.columns = columns

In [19]:
# Turn every title into one feature vector: the mean of the word2vec
# embeddings of its in-vocabulary tokens.

# Tokenise: drop '.', ',', '!' and '?' and split on whitespace.
strip_punct = str.maketrans('', '', '.,!?')
wordset = df.title.apply(lambda x: x.translate(strip_punct).split())

# `model` is already a KeyedVectors instance, so it is queried directly; the
# `.wv` indirection only exists on full Word2Vec models (it was a deprecated
# alias on KeyedVectors in gensim 3.x and is gone in gensim 4).
dim = model.vector_size

# Preallocate the result. Rows stay all-zero for titles without a single
# in-vocabulary token, instead of becoming NaN through a division by zero.
vectors = np.zeros((len(wordset), dim))
for row, words in enumerate(tqdm(wordset)):
    total = np.zeros(dim)
    n = 0
    for word in words:
        if word in model:
            total += model[word]
            n += 1
    if n:
        vectors[row] = total / n
vectors.shape

100%|██████████| 422419/422419 [01:19<00:00, 5283.83it/s]


(422419, 300)

In [20]:
# Integer class ids for the single-letter category codes.
# NOTE(review): presumably b=business, t=science/technology, e=entertainment,
# m=health, per the dataset's documentation — confirm.
label_map = {
    'b': 0,
    't': 1,
    'e': 2,
    'm': 3,
}
# A category code missing from label_map raises KeyError rather than
# producing a silent NaN label.
y = df['category'].apply(label_map.__getitem__).values

In [21]:
# 80/10/10 split: take 80% for training, then cut the remaining 20% in half
# for validation and test. Both calls shuffle with the same fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    vectors, y, train_size=0.8, shuffle=True, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, train_size=0.5, shuffle=True, random_state=42
)
# The intermediate 20% slice is no longer needed once it has been divided.
del X_holdout, y_holdout

In [22]:
# Sanity check: report the shape of each split (train, test, valid).
for split_features in (X_train, X_test, X_valid):
    print(split_features.shape)

(337935, 300)
(42242, 300)
(42242, 300)


In [23]:
def save_vectors(filename: str, data: tuple):
    """Serialise ``data`` to ``filename`` with pickle.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        filename: Destination path of the pickle file.
        data: Object to store; the callers in this notebook pass a
            ``(features, labels)`` tuple.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

In [24]:
# Persist each split as a (features, labels) tuple in its own pickle file.
split_outputs = {
    'data/train.pkl': (X_train, y_train),
    'data/valid.pkl': (X_valid, y_valid),
    'data/test.pkl': (X_test, y_test),
}
for pkl_path, xy_pair in split_outputs.items():
    save_vectors(pkl_path, xy_pair)