In [136]:
# NOTE(review): stray IPython shell escape. The saved output shows it failed with
# "OSError: [Errno 12] Cannot allocate memory"; looks like leftover scratch -- confirm and remove.
!real

OSError: [Errno 12] Cannot allocate memory

In [130]:
import io
import os
import pickle
import random

import numpy as np
import pandas as pd
import torch
from tokenize_uk import tokenize_uk
from torch.utils.data import Dataset, DataLoader
# Single seed shared by every random number generator the notebook may touch,
# so a fresh "Restart & Run All" reproduces the same shuffle/split.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb744b2e050>

In [28]:
# Input locations. The defaults are the original author's local paths; set the
# PROPAGANDA_DATA_DIR / UK_VEC_PATH environment variables to run the notebook
# on another machine without editing this cell.
data_dir = os.environ.get('PROPAGANDA_DATA_DIR', '/home/roman/tmp/nlp/PropaGANda/data')
# Labelled CSV (read later through its 'text' and 'tone' columns).
dataset_path = os.path.join(data_dir, 'train.csv')
# Word-vector file in text format, parsed by load_vec.
uk_vec_path = os.environ.get('UK_VEC_PATH', '/home/roman/Workspace/datasets/nlp/wiki.multi.uk.vec')
# Whitespace-separated "word lemma" pairs, parsed by get_lemma_dict.
lemma_dict_path = os.path.join(data_dir, 'lemma_dict.txt')
# One stop word per line, parsed by get_stop_words.
stopwords_path = os.path.join(data_dir, 'stop_words_mini.txt')

In [118]:
# Upper bound on embedded tokens per sample: preprocess_sent returns an empty
# array for anything longer. Also embedded in the pickle cache file name.
max_words = 70

In [30]:
def get_lemma_dict(path):
    """Load a word -> lemma mapping from a text file.

    Each useful line holds at least two whitespace-separated tokens; the first
    is the word form and the second is its lemma. Any further tokens on the
    line are ignored. If a word occurs more than once, the last line wins.

    Args:
        path: Path to the lemma dictionary file (read as UTF-8).

    Returns:
        dict mapping word form (str) to lemma (str).
    """
    lemma_dict = dict()
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding.
    with open(path, 'r', encoding='utf-8') as file:
        # Stream the file instead of materialising every line with readlines().
        for line in file:
            parts = line.split()
            # Blank or single-token lines carry no mapping; skip them rather
            # than failing with IndexError.
            if len(parts) < 2:
                continue
            lemma_dict[parts[0]] = parts[1]
    return lemma_dict

In [41]:
def get_stop_words(path):
    """Load a set of stop words from a text file, one word per line.

    Lines whose first character is '*' are skipped (presumably comments or
    section markers -- confirm against the data file). Surrounding whitespace
    is stripped and blank lines are ignored.

    Args:
        path: Path to the stop-word file (read as UTF-8).

    Returns:
        set of stop-word strings.
    """
    stop_words = set()
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Lines yielded by file iteration are never empty, so line[0] is safe.
            if line[0] == '*':
                continue
            word = line.strip()
            # Do not let blank lines add the empty string to the set.
            if word:
                stop_words.add(word)
    return stop_words

In [85]:
def load_vec(emb_path, nmax=50000):
    """Read word vectors from a text-format embedding file.

    The first line of the file is skipped. Every following line is split at
    the first space into a word and a space-separated list of floats.
    Reading stops once `nmax` distinct words have been collected.

    Args:
        emb_path: Path to the embedding file (decoded as UTF-8, undecodable
            bytes are dropped).
        nmax: Maximum number of words to read.

    Returns:
        (embeddings, id2word, word2id): a 2-D array with one row per word,
        and the two dictionaries mapping row index <-> word.

    Raises:
        AssertionError: if the same word appears twice.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw in handle:
            token, values = raw.rstrip().split(' ', 1)
            parsed = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            # The next free row index is the number of rows stored so far.
            word2id[token] = len(rows)
            rows.append(parsed)
            if len(rows) == nmax:
                break
    id2word = dict((index, token) for token, index in word2id.items())
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

In [31]:
# Word -> lemma lookup; preprocess_sent reads it through the global name `ld`.
ld = get_lemma_dict(lemma_dict_path)
print('Lemma dict loaded!')

In [42]:
# Stop-word set; used as the default `stop_words` argument of preprocess_sent.
stop_words = get_stop_words(stopwords_path)
print('Stop words loaded!')

In [None]:
# Embedding matrix plus index <-> word maps; load_vec reads at most
# nmax=50000 words by default.
embeddings, id2word, word2id = load_vec(uk_vec_path)
print('Embeddings loaded!')

In [None]:
# Load the labelled dataset. The CSV location is `dataset_path` from the
# configuration cell (no module-level `path` is defined in this notebook, so
# the previous call failed on a fresh kernel). `sep` is passed by keyword
# because pandas 2.x accepts only the file path positionally.
df = pd.read_csv(dataset_path, sep=',')
print('Dataset loaded!')

In [119]:
def preprocess_sent(s, stop_words=stop_words, emb=embeddings, word2id=word2id, max_words=max_words):
    """Turn a sentence into a matrix of word embeddings.

    Steps: lower-case the text, tokenize it with tokenize_uk, drop stop
    words, replace each remaining token by its lemma when the global `ld`
    has one, and keep the embedding row of every resulting word that is in
    `word2id`.

    Note that the defaults are bound when this cell is executed; re-run the
    cell after changing any of the globals used as defaults.

    Args:
        s: Input text (converted with str()).
        stop_words: Collection of tokens to discard.
        emb: Embedding matrix indexed by word id.
        word2id: Mapping from word to row index in `emb`.
        max_words: Maximum number of embedded words allowed.

    Returns:
        Array with one embedding row per kept word, or an empty array when
        more than `max_words` words were kept (an empty array also results
        when no word was kept).
    """
    vectors = []
    for token in tokenize_uk.tokenize_words(str(s).lower()):
        if token in stop_words:
            continue
        # Fall back to the surface form when no lemma is known.
        lemma = ld.get(token, token)
        if lemma in word2id:
            vectors.append(emb[word2id[lemma]])
    # Over-long inputs are signalled to the caller with an empty array.
    if len(vectors) > max_words:
        return np.array([])
    return np.array(vectors)

In [113]:
# Smoke test: run the preprocessing on a short sentence with mixed punctuation.
s = 'Я був в- душі, "Ямайка-Америка--"'
preprocess_sent(s, stop_words)

array([[-0.0610028 , -0.0311059 ,  0.0333089 , ..., -0.0149803 ,
         0.0772307 , -0.0560806 ],
       [-0.0467434 ,  0.0175629 ,  0.0067033 , ...,  0.0582691 ,
         0.0352814 ,  0.0113816 ],
       [-0.0606324 ,  0.0474398 , -0.0120805 , ...,  0.0679829 ,
        -0.0394429 , -0.0198681 ],
       ...,
       [-0.0610028 , -0.0311059 ,  0.0333089 , ..., -0.0149803 ,
         0.0772307 , -0.0560806 ],
       [-0.125168  ,  0.0184335 , -0.015915  , ...,  0.0171601 ,
        -0.102328  , -0.0502385 ],
       [ 0.0293562 ,  0.087323  ,  0.0138545 , ...,  0.0843766 ,
         0.0757897 , -0.00864521]])

In [114]:
# Number of rows in the loaded dataset.
len(df)

69983

In [115]:
# Parallel accumulators: per-sample embedding matrices and their integer labels,
# filled by the preprocessing loop in the next cell.
content = []
tones = []

In [116]:
# Start from empty accumulators in this cell so that re-running it does not
# append a second copy of every sample.
content, tones = [], []
for text, tone in zip(df['text'], df['tone']):
    out = preprocess_sent(text)
    # preprocess_sent yields an empty array when nothing could be embedded or
    # the text exceeds max_words; such rows are dropped.
    if out.shape[0] == 0:
        continue
    content.append(out)
    # Shift the label up by one (presumably maps tones -1/0/1 to class ids
    # 0/1/2 -- confirm against the dataset).
    tones.append(int(tone) + 1)

(60186, 60186)

In [120]:
# Sanity check: samples and labels must have the same length.
len(content), len(tones)

(60186, 60186)

In [121]:
# Bundle samples and labels under fixed keys so they can be pickled together.
data_dict = {'content': content, 'tones': tones}

In [124]:
# Cache the preprocessed data on disk; max_words is part of the file name.
with open('data_{}.pickle'.format(str(max_words)), 'wb') as handle:
    pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cache file for the current max_words.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('data_{}.pickle'.format(str(max_words)), 'rb') as handle:
    data_dict = pickle.load(handle)

In [126]:
# Unpack data_dict back into the two parallel lists and show their lengths.
content, tones = data_dict['content'], data_dict['tones']
len(content), len(tones)

(60186, 60186)

In [133]:
class UkSentimentDataset(Dataset):
    """Dataset pairing per-sample feature arrays with integer labels.

    `content` and `tones` are indexable sequences of equal length; item `i`
    is returned as (FloatTensor of content[i], LongTensor holding tones[i]).
    """

    def __init__(self, content, tones):
        """Store both sequences; raises AssertionError if lengths differ."""
        self.content, self.tones = content, tones
        assert len(self.content) == len(self.tones)

    def __len__(self):
        """Return the number of samples."""
        return len(self.content)

    def __getitem__(self, idx):
        """Return (features, label) tensors for sample `idx`."""
        features = torch.FloatTensor(self.content[idx])
        # The label is wrapped in a list, giving a one-element tensor.
        label = torch.LongTensor([self.tones[idx]])
        return features, label

In [134]:
# 80/20 train/test split. Samples and labels are zipped before shuffling so
# each pair stays aligned, then unzipped back into two tuples.
l = len(content)
split = int(0.8 * l)
shuffled = [pair for pair in zip(content, tones)]
random.shuffle(shuffled)
content, tones = zip(*shuffled)

# The first `split` pairs form the training set, the remainder the test set.
train_dataset = UkSentimentDataset(content[:split], tones[:split])
test_dataset = UkSentimentDataset(content[split:], tones[split:])