Collecting torchvision
  Downloading torchvision-0.9.1-cp39-cp39-manylinux1_x86_64.whl (17.3 MB)
[K     |████████████████████████████████| 17.3 MB 27.5 MB/s eta 0:00:01
Installing collected packages: torchvision
Successfully installed torchvision-0.9.1
You should consider upgrading via the '/home/krajda/misinformation/bin/python3 -m pip install --upgrade pip' command.[0m


In [100]:
import numpy as np
import pandas as pd

import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchmetrics.functional import accuracy, f1
from torchvision import transforms

from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

import pytorch_lightning as pl

import warnings
warnings.simplefilter('ignore')

from pytorch_lightning import loggers as pl_loggers

In [101]:
class Config:
    """Namespace of tunable constants for this notebook.

    ``lr`` is read by ``Model.configure_optimizers``; the remaining values
    describe sequence length, batching, split ratio and loader workers.
    """

    # Optimizer step size.
    lr = 1e-5

    # Batching: one size per loader.
    train_bs = 64
    valid_bs = 64
    num_workers = 80

    # Data shaping: token budget per text and train share of the dataset.
    max_len = 512
    train_pcent = 0.8

In [105]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Items are ``(input_ids, attention_mask, token_type_ids, label)`` tuples.
    When the dataset is built with ``labels=None`` (inference), items are
    ``(input_ids, attention_mask, token_type_ids)`` instead.

    Args:
        texts: Iterable of raw strings to tokenize.
        labels: Indexable collection of targets aligned with ``texts``, or
            ``None`` when only ``inputs`` is needed.
        max_len: Padding/truncation length in tokens. Defaults to
            ``Config.max_len`` when omitted.
    """

    def __init__(self, texts, labels, max_len=None):
        # NOTE(review): this is a HerBERT (XLM-style) tokenizer, while Model
        # loads a RoBERTa checkpoint -- confirm both share one vocabulary.
        self.tokenizer = transformers.XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
        # Resolved lazily so the class definition does not depend on Config.
        self.max_len = Config.max_len if max_len is None else max_len

        self.inputs = self.tokenize(texts)
        self.labels = labels
        # Kept for backward compatibility; not used by this class.
        self.transform = transforms.Compose([transforms.ToTensor()])

    def tokenize(self, texts):
        """Encode ``texts`` into PyTorch tensors padded/truncated to ``max_len``.

        Returns:
            The tokenizer's batch encoding (``input_ids``, ``attention_mask``,
            ``token_type_ids``).
        """
        # The previous call passed a stray positional ``None`` here, which
        # binds to ``add_special_tokens`` and silently disables the special
        # tokens. ``pad_to_max_length`` is deprecated in favour of ``padding``.
        return self.tokenizer.batch_encode_plus(
            list(texts),  # accept numpy arrays / pandas values as well as lists
            padding='max_length',
            max_length=self.max_len,
            truncation='longest_first',
            return_tensors='pt'
        )

    def __len__(self):
        """Number of samples; falls back to the encoded inputs when unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` (plus its label when available)."""
        features = (
            self.inputs['input_ids'][idx],
            self.inputs['attention_mask'][idx],
            self.inputs['token_type_ids'][idx],
        )
        if self.labels is None:
            return features
        return features + (self.labels[idx],)


class SentimentDataModule(pl.LightningDataModule):
    """Loads the advertising sentiment CSV and serves train/validation loaders.

    The CSV must provide ``text`` and ``sentiment`` columns. Sentiment strings
    are encoded as ``Neutral -> 0``, ``Positive -> 1``, ``Negative -> 2``;
    missing sentiments are treated as ``0``. The first ``Config.train_pcent``
    share of the rows is used for training and the rest for validation.
    """

    def __init__(self):
        super().__init__()
        # Filled by prepare_data(); setup() loads it on demand if still None.
        self.data = None

    def prepare_data(self):
        """Read the CSV and store the encoded frame on ``self.data``.

        Raises:
            KeyError: If the ``sentiment`` column holds a label other than
                ``Positive``, ``Negative`` or ``Neutral`` (missing values are
                allowed).
        """
        sentimap = {'Positive': 1, 'Negative': 2, 'Neutral': 0}

        # .copy() so the assignment below writes to an independent frame
        # rather than a view of the one returned by read_csv.
        df = pd.read_csv('./advertising_sentiment_dataset.csv')[['text', 'sentiment']].copy()

        raw_sentiment = df['sentiment']
        # Keep failing loudly on unexpected labels, as the dict lookup did.
        unknown = set(raw_sentiment.dropna().unique()) - set(sentimap)
        if unknown:
            raise KeyError(f"Unexpected sentiment label(s): {sorted(map(str, unknown))}")

        # Vectorised mapping; missing sentiments become 0 without relying on
        # an ``is np.nan`` identity comparison.
        df['sentiment'] = raw_sentiment.map(sentimap).fillna(0).astype('int64')

        self.data = df

    def setup(self, stage=None):
        """Split ``self.data`` into the training and validation datasets.

        Args:
            stage: Stage name passed by the trainer; unused.
        """
        # Make sure the frame exists even when prepare_data() did not run on
        # this object/process.
        if getattr(self, 'data', None) is None:
            self.prepare_data()

        nb_training_samples = int(len(self.data) * Config.train_pcent)
        train_df = self.data.iloc[:nb_training_samples]
        valid_df = self.data.iloc[nb_training_samples:]

        self.training_set = SentimentDataset(
            texts=train_df['text'].values,
            labels=train_df['sentiment'].values,
        )

        self.validation_set = SentimentDataset(
            texts=valid_df['text'].values,
            labels=valid_df['sentiment'].values,
        )

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        # Shuffle so batches are not tied to the row order of the CSV.
        return DataLoader(self.training_set, batch_size=Config.train_bs, shuffle=True)

    def val_dataloader(self):
        """Sequential loader over the validation split."""
        return DataLoader(self.validation_set, batch_size=Config.valid_bs)

    def test_dataloader(self):
        """No test split is defined for this data module."""
        return None

In [None]:
class Model(pl.LightningModule):
    """Three-class sentiment classifier: frozen RoBERTa encoder + trainable MLP head.

    The encoder's parameters have ``requires_grad=False``; only the head
    (``self.model``) is optimised. ``forward`` returns raw logits, which is
    what ``F.cross_entropy`` expects.
    """

    def __init__(self) -> None:
        super().__init__()

        self.pretrained_model = transformers.RobertaModel.from_pretrained("laugustyniak/roberta-polish-web-embedding-v1")

        # Freeze the encoder: it acts as a fixed feature extractor.
        for param in self.pretrained_model.parameters():
            param.requires_grad = False

        # No activation after the last Linear: a trailing ReLU would clamp the
        # logits to be non-negative before the cross-entropy loss.
        self.model = nn.Sequential(nn.Dropout(0.5),
                                nn.Linear(self.pretrained_model.config.hidden_size, 256), nn.ReLU(),
                                nn.Linear(256, 16), nn.ReLU(),
                                nn.Linear(16, 3))
        self.loss = F.cross_entropy
        # Explicit class dimension; the implicit-dim form is deprecated.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids) -> torch.Tensor:
        """Encode the batch and return one row of 3 unnormalised class scores per input."""
        # Only the pooled output is consumed, so hidden states are not requested.
        embeddings = self.pretrained_model(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids)

        logits = self.model(embeddings['pooler_output'])
        return logits

    def training_step(self, batch, batch_nb):
        """Compute and log the cross-entropy loss for one training batch."""
        input_ids, attention_mask, token_type_ids, label = batch

        y = self(input_ids, attention_mask, token_type_ids)
        loss = self.loss(y, label)

        self.log('train_loss', loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def log_metrics(self, y, label, step):
        """Log loss, accuracy and F1 under the ``{step}_`` prefix.

        Args:
            y: Logits produced by ``forward``.
            label: Ground-truth class indices.
            step: Prefix for the metric names, e.g. ``'val'`` or ``'test'``.
        """
        # The loss takes logits; the metrics take probabilities.
        y_soft = self.softmax(y)
        loss = self.loss(y, label)
        acc = accuracy(y_soft, label)
        f_1 = f1(y_soft, label)

        self.log(f"{step}_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{step}_acc", acc, on_epoch=True, prog_bar=True, logger=True)
        self.log(f"{step}_f1", f_1, on_epoch=True, prog_bar=True, logger=True)

    def validation_step(self, batch, batch_nb):
        """Run the model on a validation batch and log ``val_*`` metrics."""
        input_ids, attention_mask, token_type_ids, label = batch
        y = self(input_ids, attention_mask, token_type_ids)
        self.log_metrics(y, label, 'val')

    def test_step(self, batch, batch_nb):
        """Run the model on a test batch and log ``test_*`` metrics."""
        input_ids, attention_mask, token_type_ids, label = batch
        y = self(input_ids, attention_mask, token_type_ids)
        self.log_metrics(y, label, 'test')

    def configure_optimizers(self):
        """Adam over the trainable (non-frozen) parameters, with ``Config.lr``."""
        trainable = [p for p in self.parameters() if p.requires_grad]
        return torch.optim.Adam(trainable, lr=Config.lr)

    def predict(self, texts: list) -> np.ndarray:
        """Return the predicted class index for each text.

        Args:
            texts: Raw strings to classify.

        Returns:
            1-D numpy array of argmax class indices, one per input text.
        """
        # Disable dropout for inference, then restore the previous mode.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                model_input = SentimentDataset(texts, None).inputs
                # Tokenizer output lives on the CPU; move it to the module's device.
                model_input = {name: tensor.to(self.device) for name, tensor in model_input.items()}
                model_out = self.softmax(self(**model_input))
                # .cpu() first: .numpy() is not available on CUDA tensors.
                return np.argmax(model_out.cpu().numpy(), axis=1)
        finally:
            self.train(was_training)
        


In [None]:
# Build the classifier and the data module that feeds it.
model, datamodule = Model(), SentimentDataModule()

# TensorBoard event files are written under logs/.
tensorboard_logger = pl_loggers.TensorBoardLogger('logs/')

# Train for at most 100 epochs on a single GPU.
trainer = pl.Trainer(logger=tensorboard_logger, max_epochs=100, gpus=1)
trainer.fit(model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4]

  | Name             | Type         | Params
--------------------------------------------------
0 | pretrained_model | RobertaModel | 124 M 
1 | model            | Sequential   | 201 K 
2 | softmax          | Softmax      | 0     
--------------------------------------------------
201 K     Trainable params
124 M     Non-trainable params
125 M     Total params
500.293   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:

# Sample inputs for a quick check of the trained model.
x = [
    'ala ma kota',
    'duda to debil',
    'O Narodowej Strategii Onkologicznej : \ " Pan prezydent hucznie ją ogłosił , ale to nie jest strategia . Taki dokument powinien zawierać cele i działania służące ich realizacji , harmonogram , kosztorys , mierniki i sposoby oceny . Nie zawiera . Traktuję to jako element kampanii wyborczej \ " https://twitter.com/mamago25/status/1232230675870253056',
]

# Last expression of the cell: the predicted class index per text is displayed.
model.predict(x)