In [14]:
from functools import partial

import keras
import pandas as pd
import numpy as np
import torch
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras import Sequential
from keras.layers import Conv1D, Flatten, Dense, Dropout, Input, Embedding, MaxPool1D

from nya_ml import embeddings
from nya_ml.preprocessing.tokenizer import Tokenizer
from nya_ml_research.config import MODELS_PATH, DATA_PATH

In [2]:
# Register tqdm's pandas integration so that Series/DataFrame objects gain
# `progress_apply` (used below when tokenizing the texts).
tqdm.pandas()

In [3]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules
# automatically, so changes to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [4]:
# Resolve the 'ruwiki' embedding source and load it from the embeddings
# folder under MODELS_PATH.
embeddings_dir = MODELS_PATH / 'embeddings'
ruwiki_source = embeddings.get_source('ruwiki')
keyed_vectors = ruwiki_source.load(embeddings_dir)

In [5]:
# Value passed as `pad` to the tokenizer; presumably the fixed sequence
# length every text is padded to — TODO confirm against Tokenizer.tokenize.
PAD_LENGTH = 50

# Tokenizer built on top of the loaded keyed vectors.
tokenizer = Tokenizer(keyed_vectors)

# Convenience aliases: `vectorize` is the bound method as-is, `tokenize`
# pre-binds the output container type and the padding argument.
vectorize = tokenizer.vectorize
tokenize = partial(tokenizer.tokenize, to=list, pad=PAD_LENGTH)



In [6]:
# Take the raw embedding matrix held by the tokenizer's vectors and wrap it
# as a torch tensor (torch.from_numpy does not copy the underlying array).
embedding_matrix = tokenizer.vectors.vectors
weights = torch.from_numpy(embedding_matrix)

In [17]:
# Fixed seed so the shuffle — and therefore which rows end up in the sample —
# is identical on every Restart & Run All.
SEED = 42
# Number of shuffled rows kept for this experiment.
limit = 10_000

# Read only columns 4 and 5 of the raw corpus and name them text / label.
df = pd.read_csv(DATA_PATH / 'raw' / 'ru-tweet-corp.csv', names=['text', 'label'], usecols=[4, 5])
df = shuffle(df, random_state=SEED)

# Work on the first `limit` rows of the shuffled frame.
sample = df.head(limit)

# Tokenize every text; progress_apply shows a tqdm progress bar.
tokens = sample.text.progress_apply(tokenize)

# Convert to NumPy arrays for training. Labels stay as one scalar per sample
# (no one-hot encoding).
# NOTE(review): assumes `label` holds 0/1 class ids — confirm against the CSV.
X = np.array(tokens.tolist())
y = np.array(sample.label.tolist())

# X = torch.from_numpy(X)
# y = torch.from_numpy(y).float()

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

100%|██████████| 10000/10000 [00:20<00:00, 487.31it/s]


In [11]:
# 1-D convolutional text classifier on top of a frozen embedding layer.
model = Sequential([
    # Token-id -> vector lookup. The embedding matrix is supplied via
    # `weights` and frozen (trainable=False), so it is not updated in training.
    # input_length must equal the length of each tokenized sequence
    # (the tokenizer is configured with pad=50).
    Embedding(
        input_dim=len(tokenizer.vectors),
        output_dim=tokenizer.vectors.vector_size,
        input_length=50,
        weights=[weights],
        trainable=False
    ),

    # 100 filters, each spanning 4 consecutive tokens, then 3-wide max pooling.
    Conv1D(
        filters=100,
        kernel_size=4,
        activation='relu'
    ),
    MaxPool1D(3),

    Flatten(),
    Dropout(0.5),

    # Hidden layer with a non-linearity: without an activation this layer and
    # the output layer would collapse into a single linear map at inference.
    Dense(100, activation='relu'),
    Dropout(0.5),

    # Single-unit binary output. Softmax normalises across the units of the
    # layer, so with one unit it always yields 1.0 and nothing can be learned;
    # sigmoid gives a proper probability in (0, 1).
    Dense(1, activation='sigmoid')
])

In [None]:
# Bare expression: shows the model object's repr as the cell output.
model

In [12]:
# Print the layer-by-layer summary: layer types, output shapes, parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 300)           74800200  
                                                                 
 conv1d_2 (Conv1D)           (None, 47, 100)           120100    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 15, 100)          0         
 1D)                                                             
                                                                 
 flatten_3 (Flatten)         (None, 1500)              0         
                                                                 
 dropout_3 (Dropout)         (None, 1500)              0         
                                                                 
 dense_2 (Dense)             (None, 100)               150100    
                                                      

In [13]:
# Configure training for a classifier with one output unit and one scalar
# label per sample.
model.compile(
    # Keras's default optimizer, spelled out so the choice is visible.
    optimizer='rmsprop',
    # categorical_crossentropy expects one-hot targets spread over several
    # output units; with a single output and scalar labels the matching loss
    # is binary cross-entropy.
    # NOTE(review): assumes labels are encoded as 0/1 — confirm against the data.
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [18]:
# Train for 10 epochs in mini-batches of 32 samples, with Keras holding out
# 20% of the data for validation. The call is the cell's last expression, so
# the returned History object is displayed as the output.
model.fit(
    x=X,
    y=y,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1da0a286100>