In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score
from unidecode import unidecode
import re
import csv 
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from tqdm import tqdm 
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
torch.set_flush_denormal(True)
import random_seed_setter
from transformers import BertTokenizer, BertModel, AutoTokenizer, TFAutoModel, AutoModel

random_seed_setter.set_random_seeds()

NumPy random seed set with value: 42
TensorFlow random seed set with value: 42
PyTorch random seed set with value: 42


In [18]:
tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-small")
bert = AutoModel.from_pretrained("readerbench/RoBERT-small")
hidden_size = 256 

In [19]:
outputs = tokenizer(["exemplu de propoziție   asdlfsjldfjsl djflsdj fs", "alta propozitie", "aaaa", "bbb"], return_tensors="pt", max_length=100, padding=True, truncation=True)
outputs = bert(**outputs)
print(outputs[1].shape)

torch.Size([4, 256])


In [20]:
html_tag = r"<\S*>"
invalid_characters = r"[^a-zA-Z0-9 \.\,]"
percentage = r"[0-9]+%"
hashtag = r"#\S*"
numeric = r"[0-9]+"
at = r"@\S*"
address = r"\S+@\S+\.\S+"
link = r"(https?:\/\/|www.)\S+"

to_sub = f"({html_tag})|({percentage})|({hashtag})|({at})|({address})|({link})"

def preprocess(s):
    s = re.sub(to_sub, " ", s)
    # Remove accents, diacritics.
    s = unidecode(s)
    return s

In [21]:
def tokenize(s, length=50):
    outputs = tokenizer(s, return_tensors="pt", max_length=length, padding='max_length', truncation=True)
    return outputs

In [22]:
def load_train():
    df = pd.read_csv('train.csv')
    df = df.fillna("")

    positive = sum(df["class"] == 1)
    negative = sum(df["class"] == 0)
    print("Positive to negative ratio: ", positive/negative)


    print("Preprocessing")
    titles = [preprocess(x) for x in tqdm(df["title"])]
    print("Tokenizing")

    tokens = tokenize(titles, length=35)

    labels = list(df["class"])

    return tokens["input_ids"], tokens["token_type_ids"], tokens["attention_mask"], labels 

In [23]:
ids, ttids, masks, labels = load_train() 
train_ids, test_ids, train_ttids , test_ttids, train_masks, test_masks, train_labels, test_labels = train_test_split(ids, ttids, masks, labels, test_size=0.2, shuffle=True)

Positive to negative ratio:  1.7887540996562217
Preprocessing


100%|██████████| 70575/70575 [00:03<00:00, 21470.74it/s]


Tokenizing


In [24]:
class MyDataset(Dataset):
    def __init__(self, ids, ttids, masks, labels):
        self.ids = ids 
        self.ttids = ttids 
        self.masks = masks
        self.labels = labels 
    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        return self.ids[index], self.ttids[index], self.masks[index],  1 if self.labels[index] == True else 0

train_dataset = MyDataset(train_ids, train_ttids, train_masks, train_labels)
test_dataset = MyDataset(test_ids, test_ttids, test_masks, test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True)

In [25]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = bert  
        self.layer1 = nn.Linear(hidden_size, 2)

    def forward(self,ids, ttids, masks):
        x = self.model(ids, ttids, masks)[1]
        x = self.layer1(x)
        return x

    def predict(self, ids, ttids, masks):
        x = self.forward(ids, ttids, masks)
        return torch.argmax(x, dim=1)

In [26]:
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.7, 1]))
model = Model()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.1)

In [27]:
def train_one_epoch():
    model.train(True)
    running_loss = 0

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(train_dataloader)):
        # Every data instance is an input + label pair
        ids , ttids, masks, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(ids, ttids, masks)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()

        if i % 40 == 39:
            print(f"Running loss: {running_loss/(i+1)}") 

    return running_loss/(i+1)

In [28]:
def test_model():
    model.eval()

    predictions = []
    targets = []

    with torch.no_grad():
        running_vloss = 0.0

        for i, vdata in enumerate(test_dataloader):
            ids ,ttids, masks,  labels = vdata
            outputs = model(ids, ttids, masks)

            vloss = loss_fn(outputs, labels)
            running_vloss += vloss.item()

            current_predictions = model.predict(ids, ttids, masks) 
            predictions+=current_predictions
            targets += labels

        avg_vloss = running_vloss / (i + 1)

        precision = precision_score(targets, predictions)
        recall = recall_score(targets, predictions)

        return avg_vloss, balanced_accuracy_score(targets, predictions), (precision, recall) 

In [29]:
EPOCHS = 80 

best_accuracy = 0

for epoch in range(EPOCHS):
    train_loss = train_one_epoch()
    test_loss, test_accuracy, test_pr = test_model()

    print(f"{epoch}: Train loss: {train_loss}, Test loss: {test_loss}, Test accuracy: {test_accuracy}, Test pr: {test_pr}")

    if test_accuracy > best_accuracy:
        print("Current best!")
        torch.save(model, "best_model.pt")
        best_accuracy = test_accuracy



1it [00:07,  7.02s/it]

Running loss: 0.6714516282081604


2it [00:14,  7.22s/it]

Running loss: 0.7444787621498108


3it [00:21,  7.07s/it]

Running loss: 0.6861100594202677


4it [00:28,  7.08s/it]

Running loss: 0.6712859272956848


5it [00:35,  7.12s/it]

Running loss: 0.6349619090557098


6it [00:42,  7.14s/it]

Running loss: 0.6118891338507334


7it [00:49,  7.16s/it]

Running loss: 0.5943820263658252


8it [00:56,  7.05s/it]

Running loss: 0.5778361447155476


8it [00:58,  7.26s/it]


KeyboardInterrupt: 

In [None]:
model = Model()
model = torch.load("best_model.pt")
print(test_model())

(0.15772280376404524, 0.9413051016735747, (0.9624119028974158, 0.9505027068832174))


In [None]:
def load_test():
    df = pd.read_csv('test.csv')
    df = df.fillna("")

    print("Preprocessing")
    titles = [preprocess(x) for x in tqdm(df["title"])]
    print("Tokenizing")
    titles = tokenize(titles, length=30) 
    print("Vectorizing")
    titles = vectorize(titles) 

    labels = list(df["class"])

    return titles, labels 

In [None]:
x_test = load_test() 

In [None]:

ids = []
predictions = []
model.eval()
with torch.no_grad():
    for i in tqdm(range(len(x_test))):
        id = i
        prediction = model.predict(torch.unsqueeze(x_test[i], 0)).item()

        ids.append(id)
        predictions.append(prediction)


df = pd.DataFrame(data={"id":ids, "class": predictions})
print(df)

In [None]:
df.to_csv("30.csv", index=False)