In [156]:
from functools import partial
from operator import itemgetter, methodcaller
from typing import Tuple, Callable

import numpy as np
import pandas as pd
import hiddenlayer as hl
import torch
import torch.nn.functional as F
from ignite.contrib.handlers import ProgressBar
from ignite.engine import Engine
from ignite.metrics import RunningAverage, Accuracy, Loss
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

from nya_ml import embeddings
from nya_ml.preprocessing.tokenizer import Tokenizer
from nya_ml_research.config import MODELS_PATH, DATA_PATH
from nya_ml_research.src.models.ignitecnn import TextCNN
from nya_ml_research.src.models.logreg import LogisticRegression
from nya_utils.functools import identity

In [3]:
tqdm.pandas()

In [4]:
%load_ext autoreload
%autoreload 2

In [104]:
keyed_vectors = embeddings.get_source('ruwiki').load(MODELS_PATH / 'embeddings')

In [105]:
tokenizer = Tokenizer(keyed_vectors)
tokenize = partial(tokenizer.tokenize, to=list, pad=50)
vectorize = tokenizer.vectorize



In [98]:
# weights = embedding.get_torch_tensor_embeddings()
weights = torch.from_numpy(tokenizer.vectors.vectors)

In [8]:
class SimpleConvolutionTextModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_dim: int, num_embeddings: int,
                 embedding_weights: torch.Tensor):
        super(SimpleConvolutionTextModel, self).__init__()

        self.embedding = nn.Embedding(num_embeddings, embedding_dim, _weight=embedding_weights)
        self.embedding.requires_grad_(False)
        self.conv_1 = nn.Conv1d(embedding_dim, hidden_size, 3, stride=3, bias=False)
        self.pool = nn.MaxPool1d(3)
        self.lin = nn.Linear(hidden_size * int(input_size / 3), output_size, bias=False)

    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1, 2)
        x = x.float()

        x = self.conv_1(x)
        x = F.relu(x)
        x = x.view(-1, x.size(1) * x.size(2))
        x = self.lin(x)

        y = torch.softmax(x, dim=1)

        return y


In [49]:



def eval(model, data, loss, batch_size, verbose=False):
    test_dataloader = DataLoader(TensorDataset(*data), batch_size=batch_size)

    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for X, y in test_dataloader:
            # print(X, y)
            # print(torch.argmax(model(X).detach(), dim=1))
            y_pred += model(X).detach()
            y_true += y.detach() # torch.argmax(y, dim=1)
    if verbose:
        print(y_pred, y_true)

    print('Loss:', loss.item())
    if verbose:
        print(classification_report(y_true, y_pred, digits=3))


In [10]:
df = pd.read_csv(DATA_PATH / 'raw' / 'ru-tweet-corp.csv', names=['text', 'label'], usecols=[4, 5])

In [11]:
df = shuffle(df)

In [12]:
df.head(10)

Unnamed: 0,text,label
220374,В моей аудитории ни одна сучка даже не думала ...,-1
134851,@NikiforovaValya ай яй( зачем с мамой ругаешься?,-1
27330,@katyaryba будет тебе авг в личное пользование...,1
137649,@LarinDmytro теперь рыги этот мусор сфотографи...,-1
48391,УРААААА.. Я буду шефом на Тест-Драйв #УрФУ\nее...,1
56930,"@Rus_Smash С днем рождения, Руслан!) Здоровья,...",1
116297,"Как же заебал телефон падать, все сенсорные те...",-1
116545,он мне сегодня снился:( http://t.co/8GhbjIw3rM,-1
31340,"RT @KOA143: теперь,протянув максимум времени,м...",1
28327,"К черту фигуру, если есть нутелла) http://t.co...",1


In [13]:
df.label.value_counts()

 1    114911
-1    111923
Name: label, dtype: int64

In [14]:
df['label'][df.label == -1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'][df.label == -1] = 0


In [15]:
df.label.value_counts()

1    114911
0    111923
Name: label, dtype: int64

In [33]:
limit = 100

In [34]:
X = df.text.head(limit)
y = df.label.head(limit)

In [35]:
X = X.progress_apply(tokenize)
# y = y.progress_apply(lambda label: [label, 1 - label][::-1])

100%|██████████| 100/100 [00:00<00:00, 300.31it/s]


In [36]:
X = np.array(X.tolist())
y = np.array(y.tolist())

In [37]:
X = torch.from_numpy(X)
y = torch.from_numpy(y).float()

In [38]:
data = train_test_split(X, y, test_size=0.2, stratify=y)

In [39]:
class ConvolutionTextModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_dim: int, num_embeddings: int,
                 embedding_weights: torch.Tensor, kernel_sizes, drop_prob: float = 0.5):
        super(ConvolutionTextModel, self).__init__()

        self.embedding = nn.Embedding(num_embeddings, embedding_dim, _weight=embedding_weights)
        # self.embedding.requires_grad_(False)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, hidden_size, k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(drop_prob)

        self.lin = nn.Linear(hidden_size * len(kernel_sizes), output_size, bias=False)

    def forward(self, x):
        x = self.embedding(x)
        # print(x.size())
        x = x.transpose(1, 2)
        # print(x.size())
        x = x.float()

        x = [F.relu(conv(x)) for conv in self.convs]
        # print(len(x), list(map(methodcaller('size'), x)))
        x = [F.max_pool1d(x_, list(x_.size())[-1]) for x_ in x]
        # print(len(x), list(map(methodcaller('size'), x)))
        x = torch.cat(x, dim=1)
        # print(x.size())
        x = x.squeeze()
        x = self.dropout(x)
        x = self.lin(x)
        # print(x.size())

        return torch.softmax(x, 0)


In [40]:
model = ConvolutionTextModel(
    input_size=50,
    hidden_size=6,
    output_size=2,
    kernel_sizes=[2, 3, 4],
    embedding_dim=300,
    num_embeddings=len(tokenizer.vectors),
    embedding_weights=weights
)

In [41]:
model = SimpleConvolutionTextModel(
    input_size=50,
    hidden_size=6,
    output_size=2,
    embedding_dim=300,
    num_embeddings=len(tokenizer.vectors),
    embedding_weights=weights
)

In [42]:
model = LogisticRegression(50, 1)

In [43]:
model = TextCNN(
    vocab_size=len(tokenizer.vectors),
    embedding_dim=tokenizer.vectors.vector_size,
    kernel_sizes=[3, 4, 5],
    num_filters=100,
    num_classes=1,
    d_prob=0.5,
    embedding_weights=weights
)

In [51]:
model(torch.tensor([tokenize('привет всем любителям пончиков')]))

tensor(0., grad_fn=<SqueezeBackward0>)

In [45]:
print(model)

TextCNN(
  (embedding): Embedding(249334, 300)
  (conv): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [50]:
train(model=model, data=data, epochs=10, learning_rate=0.1)

 10%|█         | 1/10 [00:01<00:15,  1.67s/it]

Loss: 0.6931471824645996


 20%|██        | 2/10 [00:02<00:11,  1.41s/it]

Loss: 0.6931471824645996


 30%|███       | 3/10 [00:04<00:09,  1.34s/it]

Loss: 0.6931471824645996


 40%|████      | 4/10 [00:05<00:07,  1.31s/it]

Loss: 0.6931471824645996


 50%|█████     | 5/10 [00:06<00:06,  1.29s/it]

Loss: 0.6931471824645996


 60%|██████    | 6/10 [00:07<00:05,  1.28s/it]

Loss: 0.6931471824645996


 70%|███████   | 7/10 [00:09<00:03,  1.27s/it]

Loss: 0.6931471824645996


 80%|████████  | 8/10 [00:10<00:02,  1.27s/it]

Loss: 0.6931471824645996


 90%|█████████ | 9/10 [00:11<00:01,  1.26s/it]

Loss: 0.6931471824645996


100%|██████████| 10/10 [00:12<00:00,  1.30s/it]

Loss: 0.6931471824645996





In [36]:
list(model.parameters())

[Parameter containing:
 tensor([[-5.7760, -1.4857,  1.4283,  ..., -0.9964, -1.0746, -0.8292],
         [ 4.5155,  3.8298, -0.6947,  ..., -0.4361,  4.5493,  4.5197],
         [-2.3417, -3.3157,  0.3458,  ...,  6.7954,  0.7102,  4.2465],
         ...,
         [-0.5283, -0.0311, -0.2655,  ...,  0.0096, -0.1814,  0.0298],
         [-0.1142,  0.9447,  0.2502,  ..., -0.0167,  0.4876, -0.2878],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 Parameter containing:
 tensor([[[-8.1584, -9.0035,  8.9808],
          [-9.5645, -9.0069,  8.9741],
          [-8.8145, -9.0149, -9.0043],
          ...,
          [ 8.9228, -9.0034,  8.9753],
          [-9.0200,  8.9720,  9.0322],
          [-8.7616, -8.9886,  8.9721]],
 
         [[ 9.0071,  8.9946, -8.9975],
          [-9.0158, -8.9667,  8.9947],
          [-9.0036, -9.0022,  8.9979],
          ...,
          [ 8.9793, -9.0294, -8.9816],
          [ 9.0162, -9.0227, -8.9813],
          [-8.9997,  8.9986, -8.9849]],
 
        

In [74]:
eval(model, (data[1], data[3]), None, 32)

[tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
from nya_ml_research.src.eval import Eval
from nya_ml_research.src.step import Step

In [171]:
limit = 10_000

df = pd.read_csv(DATA_PATH / 'raw' / 'ru-tweet-corp.csv', names=['text', 'label'], usecols=[4, 5])
df = shuffle(df)

X = df.text.head(limit)
y = df.label.head(limit)

X = X.progress_apply(tokenize)
# y = y.progress_apply(lambda label: [label, 1 - label][::-1])

X = np.array(X.tolist())
y = np.array(y.tolist())

X = torch.from_numpy(X)
y = torch.from_numpy(y).float()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)


  0%|          | 0/10000 [00:00<?, ?it/s][A
  0%|          | 35/10000 [00:00<00:28, 346.50it/s][A
  1%|          | 81/10000 [00:00<00:24, 410.62it/s][A
  1%|▏         | 131/10000 [00:00<00:21, 449.15it/s][A
  2%|▏         | 176/10000 [00:00<00:22, 427.70it/s][A
  2%|▏         | 223/10000 [00:00<00:22, 442.38it/s][A
  3%|▎         | 268/10000 [00:00<00:22, 436.24it/s][A
  3%|▎         | 314/10000 [00:00<00:21, 440.97it/s][A
  4%|▎         | 359/10000 [00:00<00:23, 417.84it/s][A
  4%|▍         | 408/10000 [00:00<00:21, 439.10it/s][A
  5%|▍         | 453/10000 [00:01<00:21, 435.87it/s][A
  5%|▍         | 497/10000 [00:01<00:22, 416.27it/s][A
  5%|▌         | 544/10000 [00:01<00:21, 431.60it/s][A
  6%|▌         | 592/10000 [00:01<00:21, 444.36it/s][A
  6%|▋         | 637/10000 [00:01<00:22, 424.88it/s][A
  7%|▋         | 681/10000 [00:01<00:21, 429.17it/s][A
  7%|▋         | 725/10000 [00:01<00:22, 421.38it/s][A
  8%|▊         | 768/10000 [00:01<00:22, 413.18it/s][A
  8%

In [166]:
train = pd.read_csv(DATA_PATH / 'raw' / 'clickbait' / 'train.csv')
test = pd.read_csv(DATA_PATH / 'raw' / 'clickbait' / 'test.csv')

train.replace(to_replace={'clickbait': 1., 'not-clickbait': 0.}, inplace=True)
test.replace(to_replace={'clickbait': 1., 'not-clickbait': 0.}, inplace=True)

# df = pd.concat([train, test])

X_train = train.title.progress_apply(tokenize)
X_test = test.title.progress_apply(tokenize)

y_train = train.label
y_test = test.label

X_train = torch.from_numpy(np.array(X_train.tolist()))
X_test = torch.from_numpy(np.array(X_test.tolist()))

y_train = torch.from_numpy(np.array(y_train.tolist()))
y_test = torch.from_numpy(np.array(y_test.tolist()))
# y = df.label
# y = y.progress_apply(lambda label: [label, 1 - label][::-1])

# X = np.array(X.tolist())
# y = np.array(y.tolist())
#
# X = torch.from_numpy(X)
# y = torch.from_numpy(y).float()


100%|██████████| 10000/10000 [00:02<00:00, 3642.96it/s]
100%|██████████| 50/50 [00:00<00:00, 3850.32it/s]


In [172]:
train_dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=32)
test_dataloader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

In [173]:
model = TextCNN(
    vocab_size=len(tokenizer.vectors),
    embedding_dim=tokenizer.vectors.vector_size,
    kernel_sizes=[10, 20, 30, 40],
    num_filters=100,
    num_classes=1,
    d_prob=0.5,
    embedding_weights=weights
)


criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


In [174]:
step = Step(model, criterion, optimizer)
eval = Eval(model)

trainer = Engine(step)
train_evaluator = Engine(eval)
validation_evaluator = Engine(eval)

RunningAverage(output_transform=identity).attach(trainer, 'loss')

Accuracy(lambda output: (torch.round(output[0]), output[1])).attach(train_evaluator, 'accuracy')
Loss(criterion).attach(train_evaluator, 'bce')

Accuracy(lambda output: (torch.round(output[0]), output[1])).attach(validation_evaluator, 'accuracy')
Loss(criterion).attach(validation_evaluator, 'bce')

progress_bar = ProgressBar(persist=True, bar_format='')
progress_bar.attach(trainer, ['loss'])

In [175]:
trainer.run(train_dataloader, max_epochs=10)

  0%|          | 1/250 [00:00<?, ?it/s]

Engine run is terminating due to exception: 


KeyboardInterrupt: 

In [145]:
batch = next(iter(test_dataloader))
y_pred = model(batch[0])

In [146]:
from torchviz import make_dot

make_dot(y_pred, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")

'rnn_torchviz.png'