In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from unidecode import unidecode
import re
import csv 
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from tqdm import tqdm 
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

  "class": algorithms.Blowfish,


In [2]:
word2vec = Word2Vec.load("SG_300_25_20/SG_300_25_20.model").wv

In [4]:
html_tag = r"<\S*>"
invalid_characters = r"[^a-zA-Z0-9 ]"
percentage = r"[0-9]+%"
hashtag = r"#\S*"
numeric = r"[0-9]+"
at = r"@\S*"
address = r"\S+@\S+\.\S+"
link = r"(https?:\/\/|www.)\S+"

def preprocess(s):
    # Remove html tags.
    s = re.sub(html_tag, " ", s)

    # Substitute percents.
    s = re.sub(percentage, " procent ", s)

    # Substitute hashtags.
    s = re.sub(hashtag, " hashtag ", s)

    # Substitute at.
    s = re.sub(at, " entitate ", s)

    # Substitute numbers. 
    s = re.sub(numeric, " numar ", s)

    # Substitute addresses.
    s = re.sub(address, " adresa ", s)

    # Substitute links.
    s = re.sub(link, " link ", s)

    # Remove accents, diacritics.
    s = unidecode(s)

    # Keep only these characters
    s = re.sub(invalid_characters, " ", s)
    return s

In [5]:
romainian_stopwords = set(stopwords.words("romanian"))

def tokenize(s):
    tokens = s.split()
    tokens = list(filter(lambda x: x not in romainian_stopwords, tokens))
    tokens = [s.lower() for s in tokens]
    return tokens

In [6]:
# Converts a list of tokens to a vector.
# Only look at the first num_tokens tokens.
def vectorize(tokens, num_tokens=2):
    vectors = []
    for i in range(num_tokens):
        if i >= len(tokens):
            vectors.append(torch.zeros(word2vec.vector_size))
        else:
            if tokens[i] in word2vec:
                vectors.append(torch.tensor(word2vec[tokens[i]])) 
            else:
                vectors.append(torch.zeros(word2vec.vector_size))
    return torch.cat(vectors, dim=0)

In [7]:
def load_train():
    df = pd.read_csv('train.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=10) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=20) for x in tqdm(df["title"])]
    labels = list(df["class"])

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x , labels 

In [8]:
x, labels = load_train() 
train_x, test_x, train_labels, test_labels = train_test_split(x, labels, test_size=0.2, shuffle=True)

100%|██████████| 70575/70575 [01:29<00:00, 786.21it/s] 
100%|██████████| 70575/70575 [00:19<00:00, 3613.16it/s]


In [9]:
class MyDataset(Dataset):
    def __init__(self, x, labels):
        self.x = x
        self.labels = labels
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index], 1 if self.labels[index] == True else 0

NUM_FEATURES = train_x[0].shape[0]
train_dataset = MyDataset(train_x, train_labels)
test_dataset = MyDataset(test_x, test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [10]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(NUM_FEATURES, 1000)
        self.layer2 = nn.Linear(1000, 500)
        self.layer3 = nn.Linear(500, 2)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self,x):
        x = self.layer1(x)
        x = self.relu(x)

        x = self.layer2(x)
        x = self.relu(x)

        x = self.layer3(x)
        x = self.softmax(x)

        return x

    def predict(self, x):
        x = self.forward(x)
        return torch.argmax(x, dim=1)

In [49]:
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor([3, 1.5]))
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [50]:
def train_one_epoch():
    model.train(True)
    running_loss = 0

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(train_dataloader)):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()

        if i % 40 == 39:
            print(f"Running loss: {running_loss/(i+1)}") 

    return running_loss/(i+1)

In [51]:
def test_model():
    model.eval()

    with torch.no_grad():
        running_vloss = 0.0
        correct = 0
        total = 0

        for i, vdata in enumerate(test_dataloader):
            x , labels = vdata
            outputs = model(x)

            vloss = loss_fn(outputs, labels)
            running_vloss += vloss.item()

            predictions = model.predict(x) 
            correct += torch.sum(predictions == labels).item()
            total += len(labels)

        avg_vloss = running_vloss / (i + 1)

        return avg_vloss, correct/total

In [52]:
EPOCHS = 30 

for epoch in range(EPOCHS):
    train_loss = train_one_epoch()
    test_loss, test_accuracy = test_model()

    print(f"Train loss: {train_loss}, Test loss: {test_loss}, Test accuracy: {test_accuracy}")


41it [00:06,  6.05it/s]

Running loss: 0.442437706142664


81it [00:12,  5.88it/s]

Running loss: 0.4254419207572937


121it [00:19,  5.64it/s]

Running loss: 0.4193021170794964


161it [00:26,  5.74it/s]

Running loss: 0.4138103902339935


201it [00:33,  5.74it/s]

Running loss: 0.4107078315317631


221it [00:36,  6.03it/s]


Train loss: 0.40928648580792804, Test loss: 0.398739042026656, Test accuracy: 0.9057739992915338


41it [00:06,  5.80it/s]

Running loss: 0.3950245432555676


81it [00:13,  5.69it/s]

Running loss: 0.39597914442420007


121it [00:20,  6.84it/s]

Running loss: 0.3947863439718882


161it [00:26,  6.08it/s]

Running loss: 0.39380279537290336


201it [00:33,  5.75it/s]

Running loss: 0.3934845411777496


221it [00:37,  5.94it/s]


Train loss: 0.39291488247759204, Test loss: 0.3899833395012787, Test accuracy: 0.9402763018065887


41it [00:06,  6.65it/s]

Running loss: 0.38918133452534676


81it [00:11,  6.87it/s]

Running loss: 0.38819612823426725


121it [00:18,  6.07it/s]

Running loss: 0.3871065557003021


161it [00:24,  6.03it/s]

Running loss: 0.3878294860944152


201it [00:30,  7.33it/s]

Running loss: 0.3883435048162937


221it [00:33,  6.56it/s]


Train loss: 0.38866747544901403, Test loss: 0.38565234893134664, Test accuracy: 0.9364505844845908


41it [00:06,  6.17it/s]

Running loss: 0.3855260290205479


81it [00:12,  5.93it/s]

Running loss: 0.385485402867198


121it [00:19,  6.36it/s]

Running loss: 0.3861936169366042


161it [00:25,  5.79it/s]

Running loss: 0.38598664868623017


201it [00:32,  5.59it/s]

Running loss: 0.386503369063139


221it [00:36,  6.10it/s]


Train loss: 0.3865070751619555, Test loss: 0.3837799461824553, Test accuracy: 0.9271696776478923


41it [00:06,  6.08it/s]

Running loss: 0.38300078064203263


81it [00:13,  6.43it/s]

Running loss: 0.3845754534006119


121it [00:19,  6.34it/s]

Running loss: 0.38450175200899445


161it [00:26,  6.75it/s]

Running loss: 0.38462545704096557


201it [00:32,  6.67it/s]

Running loss: 0.3847883348166943


221it [00:35,  6.17it/s]


Train loss: 0.38447884979291197, Test loss: 0.3879996080483709, Test accuracy: 0.9180304640453418


41it [00:07,  5.63it/s]

Running loss: 0.3797962382435799


81it [00:13,  6.68it/s]

Running loss: 0.3816814560443163


121it [00:20,  5.49it/s]

Running loss: 0.38321536804238954


161it [00:27,  7.06it/s]

Running loss: 0.3834897933527827


201it [00:33,  5.85it/s]

Running loss: 0.3834537762403488


221it [00:36,  6.00it/s]


Train loss: 0.38325372243898487, Test loss: 0.3894762567111424, Test accuracy: 0.9148423662770103


41it [00:07,  5.65it/s]

Running loss: 0.3750345379114151


81it [00:14,  5.43it/s]

Running loss: 0.37661272697150705


121it [00:21,  5.54it/s]

Running loss: 0.37997425223390263


161it [00:28,  6.48it/s]

Running loss: 0.38040483836084604


201it [00:35,  5.47it/s]

Running loss: 0.381878632158041


221it [00:39,  5.65it/s]


Train loss: 0.3823936663871437, Test loss: 0.39676295114415033, Test accuracy: 0.9365922777187389


41it [00:07,  5.24it/s]

Running loss: 0.38095813915133475


81it [00:14,  5.00it/s]

Running loss: 0.38011951372027397


120it [00:21,  5.33it/s]

Running loss: 0.38005553459127744


161it [00:29,  5.90it/s]

Running loss: 0.38039728701114656


201it [00:37,  5.34it/s]

Running loss: 0.38113179966807365


221it [00:41,  5.38it/s]


Train loss: 0.3812784723836372, Test loss: 0.38153374195098877, Test accuracy: 0.9377258235919235


40it [00:08,  4.56it/s]

Running loss: 0.37740418687462807


80it [00:16,  5.03it/s]

Running loss: 0.37835634462535384


120it [00:25,  4.71it/s]

Running loss: 0.37857528030872345


160it [00:36,  3.14it/s]

Running loss: 0.3794817356392741


200it [00:47,  3.64it/s]

Running loss: 0.3798303383588791


221it [00:53,  4.16it/s]


Train loss: 0.3810104932720305, Test loss: 0.38498877148543087, Test accuracy: 0.9215727948990435


40it [00:12,  3.63it/s]

Running loss: 0.3773673504590988


80it [00:24,  3.52it/s]

Running loss: 0.3779719527810812


120it [00:36,  3.42it/s]

Running loss: 0.3794906072318554


150it [00:46,  3.25it/s]


KeyboardInterrupt: 

In [48]:
print(test_model())

(0.36253542612705913, 0.9489904357066951)


In [40]:
def load_test():
    df = pd.read_csv('test.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=10) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=20) for x in tqdm(df["title"])]

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x 

In [41]:
x_test = load_test() 

100%|██████████| 36669/36669 [00:24<00:00, 1495.78it/s]
100%|██████████| 36669/36669 [00:11<00:00, 3088.49it/s]


In [42]:

ids = []
predictions = []

for i in tqdm(range(len(x_test))):
    id = i
    prediction = model.predict(torch.unsqueeze(x_test[i], 0)).item()

    ids.append(id)
    predictions.append(prediction)


df = pd.DataFrame(data={"id":ids, "class": predictions})
print(df)

100%|██████████| 36669/36669 [02:30<00:00, 243.12it/s]


          id  class
0          0      0
1          1      0
2          2      0
3          3      0
4          4      0
...      ...    ...
36664  36664      0
36665  36665      0
36666  36666      0
36667  36667      1
36668  36668      0

[36669 rows x 2 columns]


In [43]:
df.to_csv("1.csv", index=False)