## Settings

In [1]:
need_to_encode = False # if you need to clean and encode texts
need_sample = False # if you need to sample some items of datasets

## Read model

In [2]:
import encoder

if (need_to_encode):
    model = encoder.parse_model('models/ruwikiruscorpora_upos_skipgram_300_2_2018.vec')
    
    print(len(model))

## Load and clean texts

In [3]:
import pickle

if (need_to_encode):
    train_texts = encoder.load_from_pickle('data/train_texts_cleaned_short.pickle')
    test_texts = encoder.load_from_pickle('data/test_texts_cleaned_short.pickle')
    
    print(f'{len(train_texts)} train texsts are loaded')
    print(f'{len(test_texts)} test texsts are loaded')

    cleaned_train_texts = list(map(lambda t: encoder.clean_text(t, model), train_texts))
    cleaned_test_texts = list(map(lambda t: encoder.clean_text(t, model), test_texts))
    
    print(f'{len(cleaned_train_texts)} train texts are cleaned')
    print(f'{len(cleaned_test_texts)} test texts are cleaned')

## Encode texts to vectors

In [4]:
if (need_to_encode):
    train_encoded = list(map(lambda t: encoder.encode_text(t, model, (256, 300)), cleaned_train_texts))
    test_encoded = list(map(lambda t: encoder.encode_text(t, model, (256, 300)), cleaned_test_texts))

    print(f'{len(train_encoded)} train texts are encoded')
    print(f'{len(test_encoded)} test texts are encoded')
    
    print(train_encoded[1])

    encoder.save_to_pickle(train_encoded, 'data/train_texts_cleaned_short_encoded.pickle')
    encoder.save_to_pickle(test_encoded, 'data/test_texts_cleaned_short_encoded.pickle')

## Load encoded data

In [5]:
train_labels = encoder.load_from_pickle('data/train_labels2.pickle')
train_encoded = encoder.load_from_pickle('data/train_texts_cleaned_short_encoded.pickle')
print(f'{len(train_labels)} train_labels are loaded')
print(f'{len(train_encoded)} train_encoded texts are loaded')

test_labels = encoder.load_from_pickle('data/val_labels2.pickle')
test_encoded = encoder.load_from_pickle('data/test_texts_cleaned_short_encoded.pickle')
print(f'{len(test_labels)} test labels are loaded')
print(f'{len(test_encoded)} test encoded texts are loaded')

4492 train_labels are loaded
4492 train_encoded texts are loaded
1000 test labels are loaded
1000 test encoded texts are loaded


## Sample items

In [6]:
import random
if (need_sample):
    train_number = 260
    test_number = 30

    train_sample = []
    train_labels_sample = []
    test_sample = []
    test_labels_sample = []

    for i in range(0, train_number):
        j = random.randint(0, 4491)
        train_sample.append(train_encoded[j])
        train_labels_sample.append(train_labels[j])

    for i in range(0, test_number):
        j = random.randint(0, 999)
        test_sample.append(test_encoded[j])
        test_labels_sample.append(test_labels[j])

    train_encoded = train_sample
    train_labels = train_labels_sample
    test_encoded = test_sample
    test_labels = test_labels_sample

## Convert data to PyTorch

In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader

train_tensors_x = torch.FloatTensor(train_encoded).view(4492, 1, 256, 300)
train_tensors_y = torch.FloatTensor(train_labels)
test_tensors_x = torch.FloatTensor(test_encoded).view(1000, 1, 256, 300)
test_tensors_y = torch.FloatTensor(test_labels)

train_dataset = TensorDataset(train_tensors_x, train_tensors_y)
test_dataset = TensorDataset(test_tensors_x, test_tensors_y)

print(f'{len(train_dataset)} train encoded texts are converted to tensors')
print(f'{len(test_dataset)} test encoded texts are converted to tensors')

260 train encoded texts are converted to tensors
30 test encoded texts are converted to tensors


## Split data to train and val

In [8]:
from torch.utils.data import random_split

#I took train and val sets in a ratio of 7 to 3
train, val = random_split(train_dataset, [3992, 500])

train_loader = DataLoader(train, batch_size=4)
val_loader = DataLoader(val, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

## Setup and train model

In [9]:
from models.convnet import ConvNet
from model_utils import train
import torch.optim

#[conv-relu-conv-relu-pool]xN -> [affine]xM -> [softmax or SVM]
im_size = (256, 300, 1)
conv_params = [(256, 7, 2), (512, 5, 2)]
linear_params = [32, 2]
learning_rate = 5e-2

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('using device:', device)
    
model = ConvNet(im_size, conv_params, linear_params)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

train(model, train_loader, val_loader, optimizer, device, epochs=15)

using device: cuda
Iteration 0, loss = 0.6623
Got 33 / 60 correct (55.00)

Iteration 0, loss = 0.7089
Got 30 / 60 correct (50.00)

Iteration 0, loss = 0.6948
Got 40 / 60 correct (66.67)

Iteration 0, loss = 0.8881
Got 34 / 60 correct (56.67)

Iteration 0, loss = 0.6403
Got 28 / 60 correct (46.67)

Iteration 0, loss = 0.6613
Got 42 / 60 correct (70.00)

Iteration 0, loss = 0.5736
Got 41 / 60 correct (68.33)

Iteration 0, loss = 0.4975
Got 34 / 60 correct (56.67)

Iteration 0, loss = 0.2560
Got 43 / 60 correct (71.67)

Iteration 0, loss = 0.1270
Got 39 / 60 correct (65.00)

Iteration 0, loss = 0.0981
Got 42 / 60 correct (70.00)

Iteration 0, loss = 0.1447
Got 42 / 60 correct (70.00)

Iteration 0, loss = 0.0646
Got 42 / 60 correct (70.00)

Iteration 0, loss = 0.0357
Got 40 / 60 correct (66.67)

Iteration 0, loss = 0.0421
Got 40 / 60 correct (66.67)



## Test model

In [10]:
from model_utils import eval_model

pred, groundtruth = eval_model(val_loader, model, device)

## Evaluation

In [11]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

print(f'accuracy: {accuracy_score(pred, groundtruth):.3f}')
print(f'f1 score: {f1_score(pred, groundtruth):.3f}')
print(f'recall: {recall_score(pred, groundtruth):.3f}')
print(f'precision: {precision_score(pred, groundtruth):.3f}')

accuracy: 0.717
f1 score: 0.691
recall: 0.679
precision: 0.704
