In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from unidecode import unidecode
import re
import csv 
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from tqdm import tqdm 
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the saved Word2Vec model from disk and keep only its keyed vectors
# (the `.wv` attribute); the rest of the notebook uses them as the
# token -> embedding lookup table.
_w2v_model_path = "SG_300_25_20/SG_300_25_20.model"
word2vec = Word2Vec.load(_w2v_model_path).wv

In [3]:
# Regular expressions consumed by preprocess(). Each one targets a kind of
# token that is either blanked out or replaced by a placeholder word.

# An HTML tag: "<", optional "/", a letter, then everything up to the closing
# ">". Excluding "<" and ">" from the tag body keeps the match from running
# across the text between two tags (e.g. "<b>text</b>" loses only the tags),
# and allowing spaces lets tags with attributes match as well.
html_tag = r"</?[a-zA-Z][^<>]*>"
# Anything that is not an ASCII letter, a digit or a space.
invalid_characters = r"[^a-zA-Z0-9 ]"
# A run of digits immediately followed by "%".
percentage = r"[0-9]+%"
# "#" plus any non-whitespace characters that follow it.
hashtag = r"#\S*"
# A run of digits.
numeric = r"[0-9]+"
# "@" plus any non-whitespace characters that follow it.
at = r"@\S*"
# An e-mail-like token: something@something.something, without whitespace.
address = r"\S+@\S+\.\S+"
# A URL-like token starting with "http://", "https://" or a literal "www.".
# The dot is escaped so that only a real "www." prefix qualifies.
link = r"(https?://|www\.)\S+"

def preprocess(s):
    """Normalise raw text into plain ASCII words and placeholder tokens.

    HTML tags are blanked out; links, e-mail addresses, percentages,
    hashtags, @-mentions and remaining digit runs are replaced by the
    placeholder words "link", "adresa", "procent", "hashtag", "entitate" and
    "numar"; accents and diacritics are transliterated with unidecode; finally
    every character matched by `invalid_characters` is turned into a space.

    The substitutions run from the most specific pattern to the most general
    one, because the general patterns would otherwise destroy the text the
    specific ones need to see:
      * links go first, since a URL may contain digits, "#", "%" or "@";
      * addresses go before @-mentions, since the mention pattern consumes
        every "@" and would leave nothing for the address pattern to match;
      * percentages go before plain numbers, since the number pattern would
        strip the digits in front of the "%".

    Args:
        s: Text to clean.

    Returns:
        The cleaned string. Case is preserved and consecutive spaces are not
        collapsed.
    """
    # Remove html tags.
    s = re.sub(html_tag, " ", s)

    # Substitute links while the URL is still intact.
    s = re.sub(link, " link ", s)

    # Substitute e-mail addresses before the "@" they rely on is consumed.
    s = re.sub(address, " adresa ", s)

    # Substitute percents before their digits are replaced.
    s = re.sub(percentage, " procent ", s)

    # Substitute hashtags.
    s = re.sub(hashtag, " hashtag ", s)

    # Substitute the remaining "@" mentions.
    s = re.sub(at, " entitate ", s)

    # Substitute the remaining numbers.
    s = re.sub(numeric, " numar ", s)

    # Remove accents, diacritics.
    s = unidecode(s)

    # Keep only letters, digits and spaces.
    s = re.sub(invalid_characters, " ", s)
    return s

In [4]:
# Romanian stop words dropped by tokenize(). The notebook feeds tokenize() with
# the output of preprocess(), which transliterates text to ASCII via unidecode,
# so a stop word spelled with diacritics could never equal a token. Each word
# is therefore stored both as listed and in its ASCII-folded form.
romainian_stopwords = {
    form
    for word in stopwords.words("romanian")
    for form in (word, unidecode(word))
}

def tokenize(s):
    """Split text on whitespace, lowercase it and drop Romanian stop words.

    Args:
        s: Text to tokenize (typically the output of preprocess()).

    Returns:
        A list of lowercase tokens, in their original order, with every token
        found in `romainian_stopwords` removed.
    """
    # Lowercase before filtering: the membership test is case-sensitive, so
    # checking the original spelling would let capitalised stop words (for
    # example at the start of a sentence) through. The stop-word list is
    # presumably all lowercase -- TODO confirm.
    lowered = (token.lower() for token in s.split())
    return [token for token in lowered if token not in romainian_stopwords]

In [5]:
def vectorize(tokens, num_tokens=2, embeddings=None):
    """Convert the leading tokens of a document into one flat embedding vector.

    Only the first `num_tokens` tokens are looked at. Every position
    contributes `vector_size` values: the token's embedding when the token is
    known, or zeros when the token is out of vocabulary or the document is
    shorter than `num_tokens` (padding).

    Args:
        tokens: Sequence of string tokens.
        num_tokens: Number of leading positions to encode.
        embeddings: Lookup table exposing `vector_size`, `token in table` and
            `table[token]`. Defaults to the notebook-level `word2vec` vectors.

    Returns:
        A 1-D tensor of length `num_tokens * vector_size`; an empty 1-D tensor
        when `num_tokens` is zero or negative.
    """
    # Resolved at call time so the notebook-level model remains the default.
    table = word2vec if embeddings is None else embeddings
    dim = table.vector_size

    # torch.cat rejects an empty list, so answer the degenerate case directly.
    if num_tokens <= 0:
        return torch.zeros(0)

    pieces = []
    for position in range(num_tokens):
        if position < len(tokens) and tokens[position] in table:
            pieces.append(torch.tensor(table[tokens[position]]))
        else:
            # Padding slot or out-of-vocabulary token.
            pieces.append(torch.zeros(dim))
    return torch.cat(pieces, dim=0)

In [39]:
def load_train(path='train.csv', content_tokens=10, title_tokens=20):
    """Read the training CSV and turn every row into a feature vector.

    The file must provide the columns "content", "title" and "class". Each
    text column goes through preprocess() -> tokenize() -> vectorize(), and
    the content vector is concatenated with the title vector (in that order).

    Args:
        path: Location of the CSV file.
        content_tokens: Number of leading content tokens to encode.
        title_tokens: Number of leading title tokens to encode.

    Returns:
        A tuple `(x, labels)`: `x` is a list with one 1-D tensor per row and
        `labels` is the list of values from the "class" column.
    """
    df = pd.read_csv(path)
    # Missing cells become empty strings so they vectorize to pure padding.
    df = df.fillna("")

    def embed(column, num_tokens):
        # Cast to str: a cell parsed as a number would make re.sub raise.
        texts = df[column].astype(str)
        return [
            vectorize(tokenize(preprocess(text)), num_tokens=num_tokens)
            for text in tqdm(texts)
        ]

    contents = embed("content", content_tokens)
    titles = embed("title", title_tokens)
    labels = list(df["class"])

    x = [
        torch.cat([content_vec, title_vec], dim=0)
        for content_vec, title_vec in zip(contents, titles)
    ]
    return x, labels

In [40]:
# Build the feature vectors and labels for the whole training file.
x, labels = load_train()

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffled split identical on every run, so results can be reproduced after a
# kernel restart.
train_x, test_x, train_labels, test_labels = train_test_split(
    x, labels, test_size=0.2, shuffle=True, random_state=42
)

100%|██████████| 70575/70575 [01:25<00:00, 828.52it/s] 
100%|██████████| 70575/70575 [00:15<00:00, 4587.85it/s]


In [42]:
class MyDataset(Dataset):
    """Pairs each feature entry with the label stored at the same index."""

    def __init__(self, x, labels):
        """Keep references to the features `x` and to their `labels`."""
        self.x, self.labels = x, labels

    def __len__(self):
        """Return the number of feature entries."""
        sample_count = len(self.x)
        return sample_count

    def __getitem__(self, index):
        """Return the `(features, label)` pair found at `index`."""
        features = self.x[index]
        target = self.labels[index]
        return features, target

# Number of samples per mini-batch, shared by both loaders.
BATCH_SIZE = 256

train_dataset = MyDataset(train_x, train_labels)
test_dataset = MyDataset(test_x, test_labels)

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation keeps the dataset order, so outputs line up with test_x and
# test_labels and are the same from one run to the next.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)