In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torch.nn import functional as F

from torchnlp.word_to_vector import FastText


import json
import re


In [8]:
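# One-off conversion, kept for provenance: builds ../data/lenta-texts.json
# (a JSON list of {'text': ...} records) from the raw Lenta.ru CSV dump.
# df = pd.read_csv('../data/lenta-ru-news.csv')
# with open('../data/lenta-texts.json', 'w') as f:
#     json.dump([{'text':i} for i in df.text], f)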

In [2]:
def preprocess(text):
    """Keep only the Cyrillic words of ``text``, joined by single spaces.

    Every maximal run of Russian letters is treated as a word; digits,
    punctuation, Latin letters and whitespace act as separators and are
    dropped.

    Args:
        text: input string.

    Returns:
        str: the extracted words separated by single spaces (empty string
        when ``text`` contains no Cyrillic letters).

    Raises:
        TypeError: if ``text`` is not a string (raised by ``re.findall``).
    """
    # The range А-я covers U+0410..U+044F only; Ё (U+0401) and ё (U+0451)
    # lie outside it and must be listed explicitly, otherwise a word such as
    # "ёлка" is truncated to "лка".
    return ' '.join(re.findall(r'[А-яЁё]+', text))

In [8]:
def pad(vectors, pad_len, dim=300):
    """Truncate or zero-pad a sequence of word vectors to ``pad_len`` rows.

    Args:
        vectors: 2-D array-like of shape (n_words, width), one row per word.
        pad_len: number of rows in the result.
        dim: row width used only when ``vectors`` is empty and the width
            cannot be inferred from the data. Defaults to 300, the width
            that was previously hard-coded.

    Returns:
        np.ndarray of shape (pad_len, width): the first ``pad_len`` rows of
        ``vectors`` followed by as many all-zero rows as needed.

    Raises:
        ValueError: if ``vectors`` is non-empty but not 2-D.
    """
    vectors = np.asarray(vectors)[:pad_len]
    if vectors.size == 0:
        # Nothing to copy: the result is pure padding.
        return np.zeros((pad_len, dim))
    if vectors.ndim != 2:
        raise ValueError(
            'pad expects a 2-D (n_words, width) array, got shape %s' % (vectors.shape,)
        )
    missing_rows = pad_len - len(vectors)
    # np.vstack replaces np.row_stack, which is a deprecated alias of it.
    return np.vstack([vectors, np.zeros((missing_rows, vectors.shape[1]))])

In [9]:
def clear_data(json_texts, pad_len=60, max_words=100, min_word_len=4):
    """Turn raw text records into one stacked array of padded word vectors.

    Each record is cleaned with ``preprocess``, optionally stripped of short
    words, vectorized and padded/truncated to ``pad_len`` rows with ``pad``.
    Records that cannot be processed are skipped.

    NOTE(review): ``vectorize`` is not defined in the visible cells of this
    notebook; it presumably maps a string to an (n_words, 300) matrix of
    FastText vectors -- confirm before a Restart & Run All.

    Args:
        json_texts: iterable of records exposing the text under ``'text'``.
        pad_len: number of rows each text is padded/truncated to.
        max_words: texts with more words than this get their short words
            removed before vectorization.
        min_word_len: minimum length of a word kept by that filter.

    Returns:
        np.ndarray: the padded matrices stacked along a new first axis.

    Raises:
        ValueError: if no record could be vectorized.
    """
    matrices = []
    for record in json_texts:
        try:
            cleaned = preprocess(record['text'])
        except TypeError:
            # Raised when the record or its text is not of the expected type
            # (e.g. a missing value instead of a string); skip the record.
            continue
        words = cleaned.split()
        if len(words) > max_words:
            # Long texts: drop short words so more content fits in pad_len.
            cleaned = ' '.join(word for word in words if len(word) >= min_word_len)
        try:
            vectors = vectorize(cleaned)
        except ValueError:
            # The text could not be vectorized; skip it.
            continue
        matrices.append(pad(vectors, pad_len))
    if not matrices:
        # np.stack would fail on an empty list with a far less helpful message.
        raise ValueError('clear_data: none of the input texts could be vectorized')
    return np.stack(matrices, axis=0)

In [10]:
class FasttextSet(Dataset):
    """Randomly sampled pairs of vectorized texts for similarity training.

    Even indices yield two texts from the ``cosmo`` corpus with label 1; odd
    indices yield one ``cosmo`` text and one ``nocosmo`` text with label 0.
    The index only selects the pair type: the texts themselves are drawn at
    random (``np.random``) on every access, so the dataset has a virtual
    length rather than one item per stored text.
    """

    def __init__(self, cosmo, nocosmo, nocosmo_limit=100000, length=int(5e4)):
        """Load and vectorize both corpora.

        Args:
            cosmo: path to a JSON file holding a list of ``{'text': ...}``
                records for the positive corpus.
            nocosmo: path to a JSON file of the same layout for the
                contrast corpus.
            nocosmo_limit: only the first ``nocosmo_limit`` contrast records
                are used.
            length: virtual number of items reported by ``len()``.
        """
        self.length = int(length)
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding when the files contain Cyrillic text.
        with open(cosmo, 'r', encoding='utf-8') as f:
            self.cosmo = json.load(f)
        self.cosmo = clear_data(self.cosmo)
        with open(nocosmo, 'r', encoding='utf-8') as f:
            self.nocosmo = json.load(f)[:nocosmo_limit]
        self.nocosmo = clear_data(self.nocosmo)

    def __len__(self):
        """Return the virtual dataset size (items per epoch)."""
        return self.length

    def __getitem__(self, ind):
        """Return a random ``(first, second, label)`` triple.

        Args:
            ind: item index; only its parity influences the result.

        Returns:
            tuple: two text matrices and 1 (both from ``cosmo``) for even
            ``ind`` or 0 (``cosmo`` vs ``nocosmo``) for odd ``ind``.

        Raises:
            IndexError: if ``ind`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if not -size <= ind < size:
            # Without this, ``for item in dataset`` would never terminate.
            raise IndexError('index %r out of range for dataset of length %d' % (ind, size))
        first = self.cosmo[np.random.randint(len(self.cosmo))]
        if ind % 2 == 0:
            return first, self.cosmo[np.random.randint(len(self.cosmo))], 1
        return first, self.nocosmo[np.random.randint(len(self.nocosmo))], 0


In [None]:
# Shuffled mini-batches of 32 (first, second, label) pairs built from the two
# JSON corpora.
memLoader = DataLoader(
    dataset=FasttextSet(
        cosmo='../data/mem_big.json',
        nocosmo='../data/lenta-texts.json',
    ),
    batch_size=32,
    shuffle=True,
)

In [91]:
class SimilarityNet(nn.Module):
    """Two-branch 1-D convolutional network scoring a pair of sequences.

    Both inputs go through the same ``process`` encoder (shared weights);
    the encoded feature maps are concatenated along the channel axis and
    reduced by ``compare`` to a single channel, which is squashed with a
    sigmoid. All convolutions use "same" padding, so the sequence length is
    preserved from input to output.
    """

    def __init__(self):
        super().__init__()

        def conv_block(in_channels, out_channels, kernel_size):
            # Conv -> BatchNorm -> LeakyReLU, padded to keep the length.
            return [
                nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size // 2),
                nn.BatchNorm1d(out_channels),
                nn.LeakyReLU(0.05),
            ]

        # Shared encoder: 300 input channels down to 64.
        self.process = nn.Sequential(
            *conv_block(300, 256, 15),
            *conv_block(256, 128, 9),
            *conv_block(128, 64, 5),
        )

        # Comparison head: 2 * 64 concatenated channels down to 1.
        self.compare = nn.Sequential(
            *conv_block(128, 64, 5),
            *conv_block(64, 32, 3),
            nn.Conv1d(32, 1, 3, padding=1),
        )

    def forward(self, x1, x2):
        """Score the pair ``(x1, x2)``.

        Args:
            x1: tensor of shape (batch, 300, length).
            x2: tensor of the same shape as ``x1``.

        Returns:
            Tensor of shape (batch, 1, length) with values in [0, 1].
        """
        encoded = [self.process(branch) for branch in (x1, x2)]
        merged = torch.cat(encoded, dim=1)
        return torch.sigmoid(self.compare(merged))

In [92]:
# Instantiate the model with float32 parameters and buffers.
simnet = SimilarityNet().to(torch.float32)

In [93]:
# Each dataset item is a (first, second, label) triple, so the loader yields a
# list of three batched tensors rather than a single tensor; take the first
# texts. Conv1d expects (batch, channels, length), hence the permute from
# (batch, length, channels).
batch = next(iter(memLoader))[0].permute(0, 2, 1).float()

In [94]:
# Smoke test: feed the same batch to both branches and inspect the output shape.
simnet(x1=batch, x2=batch).size()

torch.Size([32, 1, 60])