In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score
from unidecode import unidecode
import re
import csv 
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from tqdm import tqdm 
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
torch.set_flush_denormal(True)
import random_seed_setter

random_seed_setter.set_random_seeds()

  "class": algorithms.Blowfish,
2024-03-31 09:31:08.901711: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


NumPy random seed set with value: 42


2024-03-31 09:31:08.953011: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow random seed set with value: 42
PyTorch random seed set with value: 42


In [2]:
word2vec = Word2Vec.load("SG_300_25_20/SG_300_25_20.model").wv

In [3]:
print(word2vec.vector_size)

300


In [4]:
html_tag = r"<\S*>"
invalid_characters = r"[^a-zA-Z0-9 ]"
percentage = r"[0-9]+%"
hashtag = r"#\S*"
numeric = r"[0-9]+"
at = r"@\S*"
address = r"\S+@\S+\.\S+"
link = r"(https?:\/\/|www.)\S+"

def preprocess(s):
    # Remove html tags.
    s = re.sub(html_tag, " ", s)

    # Substitute percents.
    s = re.sub(percentage, " procent ", s)

    # Substitute hashtags.
    s = re.sub(hashtag, " hashtag ", s)

    # Substitute at.
    s = re.sub(at, " entitate ", s)

    # Substitute numbers. 
    s = re.sub(numeric, " numar ", s)

    # Substitute addresses.
    s = re.sub(address, " adresa ", s)

    # Substitute links.
    s = re.sub(link, " link ", s)

    # Remove accents, diacritics.
    s = unidecode(s)

    # Keep only these characters
    s = re.sub(invalid_characters, " ", s)
    return s

In [5]:
romainian_stopwords = set(stopwords.words("romanian"))

def tokenize(s):
    tokens = s.split()
    tokens = list(filter(lambda x: x not in romainian_stopwords, tokens))
    tokens = [s.lower() for s in tokens]
    return tokens

In [6]:
# Converts a list of tokens to a vector.
# Only look at the first num_tokens tokens.
def vectorize(tokens, num_tokens=2):
    vectors = []
    for i in range(num_tokens):
        if i >= len(tokens):
            vectors.append(torch.zeros(word2vec.vector_size))
        else:
            if tokens[i] in word2vec:
                vectors.append(torch.tensor(word2vec[tokens[i]])) 
            else:
                vectors.append(torch.zeros(word2vec.vector_size))
    return torch.cat(vectors, dim=0)

In [7]:
def load_train():
    df = pd.read_csv('train.csv')
    df = df.fillna("")

    positive = sum(df["class"] == 1)
    negative = sum(df["class"] == 0)
    print("Positive to negative ratio: ", positive/negative)

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=15) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=35) for x in tqdm(df["title"])]

    labels = list(df["class"])

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x , labels 

In [8]:
x, labels = load_train() 
train_x, test_x, train_labels, test_labels = train_test_split(x, labels, test_size=0.2, shuffle=True)

Positive to negative ratio:  1.7887540996562217


100%|██████████| 70575/70575 [01:08<00:00, 1034.53it/s]
100%|██████████| 70575/70575 [00:17<00:00, 4146.99it/s]


In [9]:
class MyDataset(Dataset):
    def __init__(self, x, labels):
        self.x = x
        self.labels = labels
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index], 1 if self.labels[index] == True else 0

NUM_FEATURES = train_x[0].shape[0]
train_dataset = MyDataset(train_x, train_labels)
test_dataset = MyDataset(test_x, test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [10]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=20,kernel_size=1500, stride=300)
        self.conv2 = nn.Conv1d(in_channels=20, out_channels=40,kernel_size=3)
        self.layer1 = nn.Linear(1760, 400)
        self.layer2 = nn.Linear(400, 2)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.75)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1)

    def forward(self,x):
        x = torch.unsqueeze(x, dim=1)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.dropout(x)


        x = self.conv2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = torch.flatten(x, start_dim=1)

        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.layer2(x)
        x = self.softmax(x)

        return x

    def predict(self, x):
        x = self.forward(x)
        return torch.argmax(x, dim=1)

In [15]:
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.7, 1]))
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [16]:
def train_one_epoch():
    model.train(True)
    running_loss = 0

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(train_dataloader)):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()

        if i % 40 == 39:
            print(f"Running loss: {running_loss/(i+1)}") 

    return running_loss/(i+1)

In [17]:
def test_model():
    model.eval()

    predictions = []
    targets = []

    with torch.no_grad():
        running_vloss = 0.0

        for i, vdata in enumerate(test_dataloader):
            x , labels = vdata
            outputs = model(x)

            vloss = loss_fn(outputs, labels)
            running_vloss += vloss.item()

            current_predictions = model.predict(x) 
            predictions+=current_predictions
            targets += labels

        avg_vloss = running_vloss / (i + 1)

        precision = precision_score(targets, predictions)
        recall = recall_score(targets, predictions)

        return avg_vloss, balanced_accuracy_score(targets, predictions), (precision, recall) 

In [18]:
EPOCHS = 80 

best_accuracy = 0

for epoch in range(EPOCHS):
    train_loss = train_one_epoch()
    test_loss, test_accuracy, test_pr = test_model()

    print(f"{epoch}: Train loss: {train_loss}, Test loss: {test_loss}, Test accuracy: {test_accuracy}, Test pr: {test_pr}")

    if test_accuracy > best_accuracy:
        print("Current best!")
        torch.save(model, "best_model.pt")
        best_accuracy = test_accuracy



202it [00:14, 13.10it/s]

Running loss: 0.3671245801448822


221it [00:15, 13.88it/s]


38: Train loss: 0.3668378765496733, Test loss: 0.3658761850425175, Test accuracy: 0.944423692833017, Test pr: (0.9639388188009378, 0.9527698079894064)


42it [00:02, 14.09it/s]

Running loss: 0.36801323518157003


82it [00:05, 14.26it/s]

Running loss: 0.3678541205823421


122it [00:08, 14.62it/s]

Running loss: 0.36572831372419995


162it [00:11, 15.07it/s]

Running loss: 0.3664442190900445


202it [00:13, 14.42it/s]

Running loss: 0.36658600822091103


221it [00:15, 14.54it/s]


39: Train loss: 0.3663875865181107, Test loss: 0.3653536891298635, Test accuracy: 0.9455267433795893, Test pr: (0.961139324354162, 0.9607150739351137)


42it [00:02, 14.42it/s]

Running loss: 0.36913229823112487


82it [00:05, 14.68it/s]

Running loss: 0.3674187812954187


122it [00:08, 13.90it/s]

Running loss: 0.3663134244581064


162it [00:11, 14.09it/s]

Running loss: 0.36608009599149227


202it [00:14, 14.13it/s]

Running loss: 0.36640569776296616


221it [00:15, 14.39it/s]


40: Train loss: 0.3662678038372713, Test loss: 0.3642613483326776, Test accuracy: 0.9462563413558938, Test pr: (0.9668985637342908, 0.9508938424188921)
Current best!


42it [00:02, 14.89it/s]

Running loss: 0.3663144886493683


82it [00:05, 15.40it/s]

Running loss: 0.3647403262555599


122it [00:08, 14.55it/s]

Running loss: 0.3645711526274681


162it [00:11, 13.56it/s]

Running loss: 0.36466853935271504


202it [00:13, 15.01it/s]

Running loss: 0.36602588161826133


221it [00:15, 14.60it/s]


41: Train loss: 0.36644303394119127, Test loss: 0.3643715786082404, Test accuracy: 0.945700022489486, Test pr: (0.962040725984949, 0.9592805120282498)


42it [00:02, 16.38it/s]

Running loss: 0.36388506889343264


82it [00:05, 13.80it/s]

Running loss: 0.3640477418899536


122it [00:08, 14.27it/s]

Running loss: 0.3645896228651206


162it [00:11, 16.17it/s]

Running loss: 0.36480686757713554


202it [00:13, 14.99it/s]

Running loss: 0.3657141228020191


221it [00:14, 14.87it/s]


42: Train loss: 0.3660958759924945, Test loss: 0.36299709337098257, Test accuracy: 0.9468486831464648, Test pr: (0.9710589036431733, 0.9441624365482234)
Current best!


42it [00:02, 14.46it/s]

Running loss: 0.3623146750032902


82it [00:05, 13.59it/s]

Running loss: 0.36198327243328093


122it [00:08, 14.22it/s]

Running loss: 0.36343302751580875


162it [00:11, 14.37it/s]

Running loss: 0.36480903159826994


202it [00:14, 14.71it/s]

Running loss: 0.3651149807870388


221it [00:15, 14.32it/s]


43: Train loss: 0.36544139892267424, Test loss: 0.3645530625113419, Test accuracy: 0.9446339011728383, Test pr: (0.9578497919859864, 0.9654601633193556)


42it [00:03, 13.34it/s]

Running loss: 0.36483409255743027


82it [00:05, 14.35it/s]

Running loss: 0.3654954768717289


122it [00:08, 14.67it/s]

Running loss: 0.36539293204744655


162it [00:11, 14.16it/s]

Running loss: 0.36569860856980085


202it [00:14, 14.87it/s]

Running loss: 0.36604640424251556


221it [00:15, 14.13it/s]


44: Train loss: 0.3663387898675996, Test loss: 0.36376979627779554, Test accuracy: 0.9470443032393376, Test pr: (0.968792248760703, 0.9489075259324652)
Current best!


42it [00:02, 15.02it/s]

Running loss: 0.3656455554068089


82it [00:05, 15.65it/s]

Running loss: 0.3641392536461353


122it [00:08, 13.74it/s]

Running loss: 0.364577629417181


162it [00:11, 14.65it/s]

Running loss: 0.3642921131104231


202it [00:14, 13.94it/s]

Running loss: 0.36461193427443506


221it [00:15, 14.29it/s]


45: Train loss: 0.36470715195884534, Test loss: 0.3635023704596928, Test accuracy: 0.9467428855980502, Test pr: (0.9636081216021303, 0.958397704700949)


42it [00:02, 15.05it/s]

Running loss: 0.36260834634304046


82it [00:05, 15.93it/s]

Running loss: 0.36546548567712306


122it [00:08, 14.06it/s]

Running loss: 0.3660911393662294


162it [00:10, 14.95it/s]

Running loss: 0.3658384716138244


202it [00:13, 14.82it/s]

Running loss: 0.3654175215959549


221it [00:14, 14.91it/s]


46: Train loss: 0.3650251616180213, Test loss: 0.36369384452700615, Test accuracy: 0.9473064985879319, Test pr: (0.9664617104527669, 0.9539836680644449)
Current best!


42it [00:02, 14.93it/s]

Running loss: 0.3660546995699406


82it [00:05, 13.40it/s]

Running loss: 0.362937181442976


122it [00:08, 14.70it/s]

Running loss: 0.36255701233943305


162it [00:11, 15.48it/s]

Running loss: 0.36371791288256644


202it [00:14, 14.20it/s]

Running loss: 0.36404986649751664


221it [00:15, 14.08it/s]


47: Train loss: 0.36437278176864346, Test loss: 0.3630030554320131, Test accuracy: 0.9468751298037317, Test pr: (0.9698134539287733, 0.9465901566983006)


42it [00:02, 14.43it/s]

Running loss: 0.3629554770886898


82it [00:05, 14.88it/s]

Running loss: 0.36405958943068983


122it [00:08, 13.55it/s]

Running loss: 0.36358606393138565


162it [00:11, 14.79it/s]

Running loss: 0.363226311840117


202it [00:14, 13.78it/s]

Running loss: 0.36323856577277186


221it [00:15, 14.15it/s]


48: Train loss: 0.36348567033245555, Test loss: 0.3621705525687763, Test accuracy: 0.947726926186921, Test pr: (0.9661868095078674, 0.9554182299713088)
Current best!


42it [00:02, 14.86it/s]

Running loss: 0.36405414044857026


82it [00:05, 14.65it/s]

Running loss: 0.36435453705489634


122it [00:08, 14.95it/s]

Running loss: 0.364347218722105


162it [00:11, 14.56it/s]

Running loss: 0.36462846491485834


202it [00:14, 12.32it/s]

Running loss: 0.36396681889891624


221it [00:15, 14.02it/s]


49: Train loss: 0.36403073741300074, Test loss: 0.36371585939611706, Test accuracy: 0.9466521458284842, Test pr: (0.9673327346205658, 0.9508938424188921)


42it [00:02, 14.95it/s]

Running loss: 0.3632418505847454


82it [00:05, 14.48it/s]

Running loss: 0.36360433921217916


122it [00:08, 14.04it/s]

Running loss: 0.364128969113032


162it [00:11, 14.19it/s]

Running loss: 0.36356229372322557


202it [00:13, 14.92it/s]

Running loss: 0.3640508730709553


221it [00:15, 14.43it/s]


50: Train loss: 0.36363619633389815, Test loss: 0.3628521407289164, Test accuracy: 0.9478906290299214, Test pr: (0.9667039106145251, 0.9547561244758331)
Current best!


42it [00:02, 14.97it/s]

Running loss: 0.3595005951821804


82it [00:05, 14.38it/s]

Running loss: 0.36162086203694344


122it [00:08, 14.69it/s]

Running loss: 0.36367462774117787


162it [00:11, 13.28it/s]

Running loss: 0.3627863174304366


202it [00:14, 14.40it/s]

Running loss: 0.3626763243973255


221it [00:15, 14.16it/s]


51: Train loss: 0.36311492027200726, Test loss: 0.36309050396084785, Test accuracy: 0.9474318963633466, Test pr: (0.9653597683225663, 0.9564113882145222)


42it [00:02, 14.92it/s]

Running loss: 0.36449732556939124


82it [00:06, 13.39it/s]

Running loss: 0.36446305327117445


122it [00:09, 14.47it/s]

Running loss: 0.3638657306631406


162it [00:11, 14.99it/s]

Running loss: 0.3639094289392233


202it [00:14, 14.92it/s]

Running loss: 0.36378364622592924


221it [00:15, 13.90it/s]


52: Train loss: 0.36361771093774164, Test loss: 0.3625666765230043, Test accuracy: 0.9478541474932041, Test pr: (0.9645870337477798, 0.9588391083645994)


42it [00:02, 12.79it/s]

Running loss: 0.36277820318937304


82it [00:05, 14.65it/s]

Running loss: 0.3627469953149557


122it [00:08, 14.49it/s]

Running loss: 0.36266699333985647


162it [00:11, 14.67it/s]

Running loss: 0.36390418764203786


202it [00:14, 13.67it/s]

Running loss: 0.36390162378549573


221it [00:15, 14.16it/s]


53: Train loss: 0.3635022513467262, Test loss: 0.3632882961205074, Test accuracy: 0.9478577945549411, Test pr: (0.9635900841080124, 0.9608254248510263)


42it [00:03, 13.87it/s]

Running loss: 0.3652364112436771


82it [00:05, 15.64it/s]

Running loss: 0.36496509946882727


122it [00:08, 14.35it/s]

Running loss: 0.3642127086718877


162it [00:11, 13.87it/s]

Running loss: 0.36337241157889366


202it [00:14, 14.48it/s]

Running loss: 0.36300669819116593


221it [00:16, 13.78it/s]


54: Train loss: 0.363126572170948, Test loss: 0.3630347517984254, Test accuracy: 0.9475080478859643, Test pr: (0.9656671497046038, 0.9559699845508718)


42it [00:02, 14.70it/s]

Running loss: 0.36570002138614655


57it [00:03, 14.65it/s]


KeyboardInterrupt: 

In [19]:
model = Model()
model = torch.load("best_model.pt")
print(test_model())

(0.3633933344057628, 0.9478906290299214, (0.9667039106145251, 0.9547561244758331))


In [21]:
def load_test():
    df = pd.read_csv('test.csv')
    df = df.fillna("")

    contents =  [vectorize(tokenize(preprocess(x)), num_tokens=15) for x in tqdm(df["content"])]
    titles = [vectorize(tokenize(preprocess(x)), num_tokens=35) for x in tqdm(df["title"])]

    x = []
    for i in range(len(contents)):
        vec = torch.concat([contents[i], titles[i]], dim=0)
        x.append(vec)

    return x 

In [22]:
x_test = load_test() 

100%|██████████| 36669/36669 [00:35<00:00, 1040.86it/s]
100%|██████████| 36669/36669 [00:16<00:00, 2202.34it/s]


In [23]:

ids = []
predictions = []
model.eval()
with torch.no_grad():
    for i in tqdm(range(len(x_test))):
        id = i
        prediction = model.predict(torch.unsqueeze(x_test[i], 0)).item()

        ids.append(id)
        predictions.append(prediction)


df = pd.DataFrame(data={"id":ids, "class": predictions})
print(df)

100%|██████████| 36669/36669 [00:23<00:00, 1536.57it/s]

          id  class
0          0      0
1          1      0
2          2      0
3          3      0
4          4      0
...      ...    ...
36664  36664      0
36665  36665      0
36666  36666      0
36667  36667      1
36668  36668      0

[36669 rows x 2 columns]





In [99]:
df.to_csv("29.csv", index=False)