In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from unidecode import unidecode
import re
import csv 
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from tqdm import tqdm 
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
torch.set_flush_denormal(True)

True

In [2]:
word2vec = Word2Vec.load("SG_300_25_20/SG_300_25_20.model").wv

In [3]:
html_tag = r"<\S*>"
invalid_characters = r"[^a-zA-Z0-9 ]"
percentage = r"[0-9]+%"
hashtag = r"#\S*"
numeric = r"[0-9]+"
at = r"@\S*"
address = r"\S+@\S+\.\S+"
link = r"(https?:\/\/|www.)\S+"

def preprocess(s):
    # Remove html tags.
    s = re.sub(html_tag, " ", s)

    # Substitute percents.
    s = re.sub(percentage, " procent ", s)

    # Substitute hashtags.
    s = re.sub(hashtag, " hashtag ", s)

    # Substitute at.
    s = re.sub(at, " entitate ", s)

    # Substitute numbers. 
    s = re.sub(numeric, " numar ", s)

    # Substitute addresses.
    s = re.sub(address, " adresa ", s)

    # Substitute links.
    s = re.sub(link, " link ", s)

    # Remove accents, diacritics.
    s = unidecode(s)

    # Keep only these characters
    s = re.sub(invalid_characters, " ", s)
    return s

In [4]:
romainian_stopwords = set(stopwords.words("romanian"))

def tokenize(s):
    tokens = s.split()
    tokens = list(filter(lambda x: x not in romainian_stopwords, tokens))
    tokens = [s.lower() for s in tokens]
    return tokens

In [5]:
# Converts a list of tokens to a vector.
# Only look at the first num_tokens tokens.
def vectorize(tokens, num_tokens=2):
    vectors = []
    for i in range(num_tokens):
        if i >= len(tokens):
            vectors.append(torch.zeros(word2vec.vector_size))
        else:
            if tokens[i] in word2vec:
                vectors.append(torch.tensor(word2vec[tokens[i]])) 
            else:
                vectors.append(torch.zeros(word2vec.vector_size))
    return torch.cat(vectors, dim=0)

In [6]:
def load_train():
    df = pd.read_csv('train.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=15) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=35) for x in tqdm(df["title"])]
    labels = list(df["class"])

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x , labels 

In [7]:
x, labels = load_train() 
train_x, test_x, train_labels, test_labels = train_test_split(x, labels, test_size=0.2, shuffle=True)

100%|██████████| 70575/70575 [01:07<00:00, 1053.07it/s]
100%|██████████| 70575/70575 [00:15<00:00, 4425.15it/s]


In [8]:
class MyDataset(Dataset):
    def __init__(self, x, labels):
        self.x = x
        self.labels = labels
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index], 1 if self.labels[index] == True else 0

NUM_FEATURES = train_x[0].shape[0]
train_dataset = MyDataset(train_x, train_labels)
test_dataset = MyDataset(test_x, test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [37]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(NUM_FEATURES, 1000)
        self.layer2 = nn.Linear(1000, 500)
        self.layer3 = nn.Linear(500, 250)
        self.layer4 = nn.Linear(250, 2)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self,x):
        x = self.layer1(x)
        x = self.relu(x)

        x = self.layer2(x)
        x = self.relu(x)

        x = self.layer3(x)
        x = self.relu(x)

        x = self.layer4(x)
        x = self.softmax(x)

        return x

    def predict(self, x):
        x = self.forward(x)
        return torch.argmax(x, dim=1)

In [38]:
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor([3, 1.5]))
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

In [39]:
def train_one_epoch():
    model.train(True)
    running_loss = 0

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(train_dataloader)):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()

        if i % 40 == 39:
            print(f"Running loss: {running_loss/(i+1)}") 

    return running_loss/(i+1)

In [40]:
def test_model():
    model.eval()

    predictions = []
    targets = []

    with torch.no_grad():
        running_vloss = 0.0

        for i, vdata in enumerate(test_dataloader):
            x , labels = vdata
            outputs = model(x)

            vloss = loss_fn(outputs, labels)
            running_vloss += vloss.item()

            current_predictions = model.predict(x) 
            predictions+=current_predictions
            targets += labels

        avg_vloss = running_vloss / (i + 1)

        return avg_vloss, balanced_accuracy_score(targets, predictions) 

In [41]:
EPOCHS = 30 

for epoch in range(EPOCHS):
    train_loss = train_one_epoch()
    test_loss, test_accuracy = test_model()

    print(f"Train loss: {train_loss}, Test loss: {test_loss}, Test accuracy: {test_accuracy}")


41it [00:09,  4.71it/s]

Running loss: 0.45243761539459226


80it [00:18,  4.39it/s]

Running loss: 0.42866955809295176


120it [00:27,  4.43it/s]

Running loss: 0.42097761953870455


160it [00:37,  4.59it/s]

Running loss: 0.4144982848316431


200it [00:47,  3.95it/s]

Running loss: 0.4105743809044361


221it [00:52,  4.25it/s]


Train loss: 0.4085908064205722, Test loss: 0.38788789404290064, Test accuracy: 0.927612157523148


40it [00:09,  3.90it/s]

Running loss: 0.39092663675546646


80it [00:18,  4.22it/s]

Running loss: 0.39131847210228443


120it [00:27,  4.11it/s]

Running loss: 0.39025087282061577


160it [00:36,  4.00it/s]

Running loss: 0.38941310383379457


200it [00:45,  4.18it/s]

Running loss: 0.38871734604239466


221it [00:50,  4.36it/s]


Train loss: 0.38903804696523225, Test loss: 0.38458689408642904, Test accuracy: 0.9331278678452894


40it [00:08,  4.16it/s]

Running loss: 0.38120540976524353


80it [00:16,  4.61it/s]

Running loss: 0.38063376806676386


121it [00:25,  4.99it/s]

Running loss: 0.3827128134667873


160it [00:34,  4.08it/s]

Running loss: 0.38354755751788616


201it [00:42,  4.85it/s]

Running loss: 0.3835513505339623


221it [00:47,  4.67it/s]


Train loss: 0.38354611423759977, Test loss: 0.38274249487689566, Test accuracy: 0.9324382035521837


40it [00:09,  4.32it/s]

Running loss: 0.38102841973304746


80it [00:19,  4.24it/s]

Running loss: 0.3804218631237745


120it [00:28,  4.68it/s]

Running loss: 0.3814279409746329


160it [00:38,  4.26it/s]

Running loss: 0.38134764414280653


200it [00:47,  4.01it/s]

Running loss: 0.3815464535355568


221it [00:51,  4.25it/s]


Train loss: 0.3813404830602499, Test loss: 0.37953078746795654, Test accuracy: 0.9389273592456981


40it [00:09,  4.11it/s]

Running loss: 0.3743742898106575


80it [00:19,  3.16it/s]

Running loss: 0.37763759717345236


120it [00:30,  3.72it/s]

Running loss: 0.38035697489976883


160it [00:41,  4.09it/s]

Running loss: 0.3799383658915758


200it [00:52,  3.21it/s]

Running loss: 0.3800319238007069


221it [00:58,  3.79it/s]


Train loss: 0.37975167513433083, Test loss: 0.3769672619444983, Test accuracy: 0.9399086937899704


40it [00:09,  4.58it/s]

Running loss: 0.3751106470823288


80it [00:19,  3.59it/s]

Running loss: 0.37733807824552057


120it [00:29,  4.83it/s]

Running loss: 0.3777344216903051


160it [00:39,  4.51it/s]

Running loss: 0.37852915097028017


200it [00:49,  5.15it/s]

Running loss: 0.37927173525094987


221it [00:54,  4.06it/s]


Train loss: 0.37903577102794905, Test loss: 0.3771505834800856, Test accuracy: 0.9408191567187797


41it [00:09,  5.13it/s]

Running loss: 0.37116425260901453


81it [00:18,  5.71it/s]

Running loss: 0.37049820721149446


121it [00:26,  5.37it/s]

Running loss: 0.3715819130341212


160it [00:34,  4.38it/s]

Running loss: 0.3734838733449578


200it [00:44,  4.60it/s]

Running loss: 0.3746686793863773


221it [00:49,  4.44it/s]


Train loss: 0.3752621063010185, Test loss: 0.37541179625051363, Test accuracy: 0.9430357851573298


40it [00:10,  3.70it/s]

Running loss: 0.3704753123223782


60it [00:15,  3.95it/s]


KeyboardInterrupt: 

In [14]:
print(test_model())

(0.3623074335711343, 0.9516841518235644)


In [18]:
def load_test():
    df = pd.read_csv('test.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=15) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=35) for x in tqdm(df["title"])]

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x 

In [19]:
x_test = load_test() 

100%|██████████| 36669/36669 [00:24<00:00, 1492.63it/s]
100%|██████████| 36669/36669 [00:09<00:00, 3774.50it/s]


In [20]:

ids = []
predictions = []

for i in tqdm(range(len(x_test))):
    id = i
    prediction = model.predict(torch.unsqueeze(x_test[i], 0)).item()

    ids.append(id)
    predictions.append(prediction)


df = pd.DataFrame(data={"id":ids, "class": predictions})
print(df)

100%|██████████| 36669/36669 [02:55<00:00, 209.24it/s]


          id  class
0          0      0
1          1      1
2          2      0
3          3      0
4          4      1
...      ...    ...
36664  36664      0
36665  36665      0
36666  36666      0
36667  36667      1
36668  36668      0

[36669 rows x 2 columns]


In [21]:
df.to_csv("3.csv", index=False)