In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torch.nn import functional as F

from torchnlp.word_to_vector import FastText


import json
import re

import numpy as np


In [2]:
# !pip3 install gensim

In [26]:
# mem = []
# for line in open('../data/mem.json', 'r'):
#     mem.append(json.loads(line))


In [35]:
def preprocess(text):
    '''
    Keep only the Cyrillic words of `text` that are longer than three letters.

    text - raw string
    returns the surviving words joined by single spaces ('' when none survive)
    '''
    # 'Ё'/'ё' (U+0401/U+0451) sit outside the contiguous А-я range, so they
    # are listed explicitly; otherwise a word such as 'зелёная' is cut into
    # 'зел' + 'ная' and both halves are then dropped by the length filter.
    words = re.findall(r'[А-яЁё]+', text)
    return ' '.join(word for word in words if len(word) > 3)

In [29]:
# NOTE(review): this import rebinds the name `FastText`, which the first cell
# already imported from torchnlp.word_to_vector; after this cell `FastText`
# refers to the gensim wrapper.
# NOTE(review): `gensim.models.wrappers` is presumably only available in
# older gensim releases (< 4.0) - confirm the installed version.
from gensim.models.wrappers import FastText

# Module-level embedding model; vectorize() looks words up through `model.wv`.
model = FastText.load_fasttext_format('../data/cc.ru.300.bin')

In [36]:
def vectorize(text, dim=300):
    '''
    Turn a space-separated string into a matrix of word vectors.

    text - string of words separated by single spaces (the output of preprocess)
    dim - width of the empty result returned when `text` has no words
    returns array with one row per word, taken from the global `model.wv`
    '''
    # ''.split(' ') gives [''], and repeated spaces give '' tokens as well;
    # those are not words, so they are dropped before the lookup.
    words = [word for word in text.split(' ') if word]
    if not words:
        # Nothing to look up: return a zero-row matrix so that callers can
        # still pad it instead of failing on an empty stack.
        return np.zeros((0, dim))
    return np.vstack([model.wv[word] for word in words])

In [59]:
def pad(vectors, pad_len):
    '''
    Cut or zero-pad a matrix of row vectors to exactly `pad_len` rows.

    vectors - 2-D array, one vector per row
    pad_len - number of rows in the result
    returns array of shape (pad_len, vectors.shape[1])
    '''
    vectors = np.asarray(vectors)[:pad_len]
    # The filler width follows the input instead of a fixed 300, so the
    # function works for embeddings of any size.
    filler = np.zeros((pad_len - len(vectors), vectors.shape[1]))
    return np.vstack([vectors, filler])

In [60]:
class FasttextSet(Dataset):
    '''
    Dataset of fixed-size word-vector matrices built from a json file of texts.

    Every item is the result of preprocess -> vectorize for one record,
    cut or zero-padded to `pad_len` rows by pad().
    '''

    def __init__(self, data, pad_len=60):
        '''
        data - json file name; the file holds a sequence of objects with a 'text' key
        pad_len - number of rows every returned item has
        '''
        # The encoding is given explicitly so that reading the file does not
        # depend on the platform default.
        with open(data, 'r', encoding='utf-8') as f:
            records = json.load(f)
        self.pad_len = pad_len
        # All texts are vectorized once here; padding is applied per item.
        self.texts = [vectorize(preprocess(record['text'])) for record in records]

    def __len__(self):
        '''Number of texts in the dataset.'''
        return len(self.texts)

    def __getitem__(self, ind):
        '''Return the vectors of text `ind`, cut/padded to `pad_len` rows.'''
        return pad(self.texts[ind], self.pad_len)

In [61]:
# Batches of 32 items from the FasttextSet, reshuffled on every pass.
memLoader = DataLoader(
    dataset=FasttextSet('../data/mem_smol.json'),
    batch_size=32,
    shuffle=True,
)

In [91]:
class SimilarityNet(nn.Module):
    '''
    Convolutional network that scores two inputs against each other.

    Both inputs pass through the same `process` stack (300 -> 256 -> 128 -> 64
    channels). The two results are concatenated along the channel axis and
    `compare` reduces them (128 -> 64 -> 32 -> 1 channel); a sigmoid is applied
    to the result. Every convolution pads by half its kernel size.
    '''

    @staticmethod
    def _conv_bn_act(in_channels, out_channels, kernel_size):
        '''Layers of one stage: convolution, batch norm, leaky ReLU (slope 0.05).'''
        return [
            nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size // 2),
            nn.BatchNorm1d(out_channels),
            nn.LeakyReLU(0.05),
        ]

    def __init__(self):
        super().__init__()
        stage = self._conv_bn_act

        # The stages are unpacked into flat Sequentials, so the layers keep
        # the indices 0..8 (process) and 0..6 (compare).
        self.process = nn.Sequential(
            *stage(300, 256, 15),
            *stage(256, 128, 9),
            *stage(128, 64, 5),
        )

        self.compare = nn.Sequential(
            *stage(128, 64, 5),
            *stage(64, 32, 3),
            nn.Conv1d(32, 1, 3, padding=1),
        )

    def forward(self, x1, x2):
        '''
        x1, x2 - tensors with 300 channels on axis 1
        returns sigmoid scores with a single channel on axis 1
        '''
        # x1 is processed before x2; the shared stack contains batch norm.
        features = torch.cat((self.process(x1), self.process(x2)), dim=1)
        return self.compare(features).sigmoid()

In [92]:
# Network instance with its floating-point parameters held as float32.
simnet = SimilarityNet().to(torch.float32)

In [93]:
# One batch from the loader with its last two axes swapped, cast to float32.
batch = next(iter(memLoader)).transpose(1, 2).float()

In [94]:
# Smoke test: compare the batch with itself and show the output size.
simnet(batch, batch).size()

torch.Size([32, 1, 60])