In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from unidecode import unidecode
import re
import csv 
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from tqdm import tqdm 
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

In [2]:
word2vec = Word2Vec.load("SG_300_25_20/SG_300_25_20.model").wv

In [3]:
html_tag = r"<\S*>"
invalid_characters = r"[^a-zA-Z0-9 ]"
percentage = r"[0-9]+%"
hashtag = r"#\S*"
numeric = r"[0-9]+"
at = r"@\S*"
address = r"\S+@\S+\.\S+"
link = r"(https?:\/\/|www.)\S+"

def preprocess(s):
    # Remove html tags.
    s = re.sub(html_tag, " ", s)

    # Substitute percents.
    s = re.sub(percentage, " procent ", s)

    # Substitute hashtags.
    s = re.sub(hashtag, " hashtag ", s)

    # Substitute at.
    s = re.sub(at, " entitate ", s)

    # Substitute numbers. 
    s = re.sub(numeric, " numar ", s)

    # Substitute addresses.
    s = re.sub(address, " adresa ", s)

    # Substitute links.
    s = re.sub(link, " link ", s)

    # Remove accents, diacritics.
    s = unidecode(s)

    # Keep only these characters
    s = re.sub(invalid_characters, " ", s)
    return s

In [4]:
romainian_stopwords = set(stopwords.words("romanian"))

def tokenize(s):
    tokens = s.split()
    tokens = list(filter(lambda x: x not in romainian_stopwords, tokens))
    tokens = [s.lower() for s in tokens]
    return tokens

In [5]:
# Converts a list of tokens to a vector.
# Only look at the first num_tokens tokens.
def vectorize(tokens, num_tokens=2):
    vectors = []
    for i in range(num_tokens):
        if i >= len(tokens):
            vectors.append(torch.zeros(word2vec.vector_size))
        else:
            if tokens[i] in word2vec:
                vectors.append(torch.tensor(word2vec[tokens[i]])) 
            else:
                vectors.append(torch.zeros(word2vec.vector_size))
    return torch.cat(vectors, dim=0)

In [39]:
def load_train():
    df = pd.read_csv('train.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=10) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=20) for x in tqdm(df["title"])]
    labels = list(df["class"])

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x , labels 

In [40]:
x, labels = load_train() 
train_x, test_x, train_labels, test_labels = train_test_split(x, labels, test_size=0.2, shuffle=True)

100%|██████████| 70575/70575 [01:25<00:00, 828.52it/s] 
100%|██████████| 70575/70575 [00:15<00:00, 4587.85it/s]


In [70]:
class MyDataset(Dataset):
    def __init__(self, x, labels):
        self.x = x
        self.labels = labels
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index], 1 if self.labels[index] == True else 0

NUM_FEATURES = train_x[0].shape[0]
train_dataset = MyDataset(train_x, train_labels)
test_dataset = MyDataset(test_x, test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [108]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(NUM_FEATURES, 1000)
        self.layer2 = nn.Linear(1000, 500)
        self.layer3 = nn.Linear(500, 2)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self,x):
        x = self.layer1(x)
        x = self.relu(x)

        x = self.layer2(x)
        x = self.relu(x)

        x = self.layer3(x)
        x = self.softmax(x)

        return x

    def predict(self, x):
        x = self.forward(x)
        return torch.argmax(x, dim=1)

In [109]:
loss_fn = torch.nn.CrossEntropyLoss()
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [110]:
def train_one_epoch():
    running_loss = 0

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(train_dataloader)):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        last_loss = loss.item()
        running_loss += last_loss

        if i % 40 == 39:
            print(f"Running loss: {running_loss/(i+1)}") 

    return last_loss

In [112]:
def test_model():
    model.eval()

    running_vloss = 0.0

    correct = 0
    total = 0

    with torch.no_grad():
        for i, vdata in enumerate(test_dataloader):
            x , labels = vdata
            outputs = model(x)

            vloss = loss_fn(outputs, labels)
            running_vloss += vloss

            predictions = model.predict(x) 
            correct += torch.sum(predictions == labels)
            total += len(labels)

    avg_vloss = running_vloss / (i + 1)

    return avg_vloss, correct/total

In [113]:
EPOCHS = 5

for epoch in range(EPOCHS):
    model.train(True)
    train_loss = train_one_epoch()
    test_loss, test_accuracy = test_model()

    print(f"Train loss: {train_loss}, Test loss: {test_loss}, Test accuracy: {test_accuracy}")


41it [00:06,  6.00it/s]

Running loss: 0.41797221079468727


80it [00:13,  6.12it/s]

Running loss: 0.40581887178123


121it [00:20,  5.72it/s]

Running loss: 0.3994247309863567


161it [00:28,  5.53it/s]

Running loss: 0.39453000742942096


201it [00:35,  5.66it/s]

Running loss: 0.39151701793074606


221it [00:38,  5.71it/s]


Train loss: 0.3729833960533142, Test loss: 0.3874928653240204, Test accuracy: 0.9241940975189209


41it [00:07,  5.34it/s]

Running loss: 0.369513414055109


81it [00:14,  5.66it/s]

Running loss: 0.3679558251053095


121it [00:21,  5.84it/s]

Running loss: 0.36746755813558896


161it [00:28,  6.07it/s]

Running loss: 0.36616242192685605


201it [00:35,  5.66it/s]

Running loss: 0.36599480122327804


221it [00:39,  5.64it/s]


Train loss: 0.3616737425327301, Test loss: 0.3788818418979645, Test accuracy: 0.9325540065765381


41it [00:07,  5.67it/s]

Running loss: 0.3527777068316936


81it [00:14,  6.03it/s]

Running loss: 0.35112944059073925


121it [00:21,  5.70it/s]

Running loss: 0.35110909764965376


161it [00:28,  5.37it/s]

Running loss: 0.3521344779059291


201it [00:35,  5.43it/s]

Running loss: 0.3532946555316448


221it [00:38,  5.68it/s]


Train loss: 0.3522753119468689, Test loss: 0.36761027574539185, Test accuracy: 0.942472517490387


41it [00:07,  5.56it/s]

Running loss: 0.3459581099450588


80it [00:14,  5.56it/s]

Running loss: 0.34636778235435484


120it [00:22,  5.67it/s]

Running loss: 0.34733425204952556


161it [00:30,  5.58it/s]

Running loss: 0.3471878547221422


201it [00:37,  5.65it/s]

Running loss: 0.3472215610742569


221it [00:41,  5.38it/s]


Train loss: 0.36410456895828247, Test loss: 0.3654780685901642, Test accuracy: 0.945448100566864


41it [00:07,  5.32it/s]

Running loss: 0.3402584508061409


80it [00:14,  5.42it/s]

Running loss: 0.34148963876068594


121it [00:22,  5.38it/s]

Running loss: 0.34245903690656027


161it [00:29,  5.55it/s]

Running loss: 0.3419307816773653


201it [00:37,  5.13it/s]

Running loss: 0.3430040641129017


221it [00:40,  5.43it/s]


Train loss: 0.32656827569007874, Test loss: 0.36484000086784363, Test accuracy: 0.9471484422683716


In [114]:
def load_test():
    df = pd.read_csv('test.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=10) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=20) for x in tqdm(df["title"])]

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x 

In [115]:
x_test = load_test() 

100%|██████████| 36669/36669 [00:29<00:00, 1244.31it/s]
100%|██████████| 36669/36669 [00:11<00:00, 3303.63it/s]


In [122]:

ids = []
predictions = []

for i in tqdm(range(len(x_test))):
    id = i
    prediction = model.predict(torch.unsqueeze(x_test[i], 0)).item()

    ids.append(id)
    predictions.append(prediction)


df = pd.DataFrame(data={"id":ids, "class": predictions})
print(df)

100%|██████████| 36669/36669 [02:26<00:00, 249.71it/s]


          id  class
0          0      0
1          1      0
2          2      0
3          3      1
4          4      1
...      ...    ...
36664  36664      0
36665  36665      0
36666  36666      0
36667  36667      0
36668  36668      0

[36669 rows x 2 columns]


In [123]:
df.to_csv("1.csv", index=False)