In [43]:
import re
import nltk
import numpy

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder


from torch.autograd import Variable

In [3]:
import torch.nn.functional as F
import numpy as np

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Представление и предобработка текстовых данных

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [7]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [8]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [9]:
# токенизация
word_tokenize_list = word_tokenize(text)
word_tokenize_list

['Select',
 'your',
 'preferences',
 'and',
 'run',
 'the',
 'install',
 'command',
 '.',
 'Stable',
 'represents',
 'the',
 'most',
 'currently',
 'tested',
 'and',
 'supported',
 'version',
 'of',
 'PyTorch',
 '.',
 'Note',
 'that',
 'LibTorch',
 'is',
 'only',
 'available',
 'for',
 'C++']

In [10]:
# стемминг / лемматизация + приведение к нижнему регистру
stemmer = SnowballStemmer("english")
stemmed_words = [stemmer.stem(word) for word in word_tokenize_list]
stemmed_words

['select',
 'your',
 'prefer',
 'and',
 'run',
 'the',
 'instal',
 'command',
 '.',
 'stabl',
 'repres',
 'the',
 'most',
 'current',
 'test',
 'and',
 'support',
 'version',
 'of',
 'pytorch',
 '.',
 'note',
 'that',
 'libtorch',
 'is',
 'onli',
 'avail',
 'for',
 'c++']

In [11]:
from string import punctuation

# удаление стоп-слов + удаление пунктуации
stopWords = stopwords.words('english') + list(punctuation)
wordsFiltered = [word for word in stemmed_words if word not in stopWords]
wordsFiltered

['select',
 'prefer',
 'run',
 'instal',
 'command',
 'stabl',
 'repres',
 'current',
 'test',
 'support',
 'version',
 'pytorch',
 'note',
 'libtorch',
 'onli',
 'avail',
 'c++']

Реализовать функцию `preprocess_text(text: str)`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [12]:
def preprocess_text(text: str):
  ''' Приводит строку к нижнему регистру
      Заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел '''
  return re.sub("[^A-Za-z\.,!? \-']", ' ', text.lower())

In [13]:
text1 = "Hello world! What's up? It's very nice to see you, is't it? Definetely. 55"
print(text1)
print(preprocess_text(text1))

Hello world! What's up? It's very nice to see you, is't it? Definetely. 55
hello world! what's up? it's very nice to see you, is't it? definetely.   


1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [14]:
tokenized_text = sent_tokenize(text)
first_sentence = tokenized_text[0]
first_sentence

'Select your preferences and run the install command.'

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(tokenized_text)
f_s_cv = cv.transform([first_sentence]).toarray()
f_s_cv

array([[1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 1]])

In [16]:
# просто ради интереса вывести словарь вокабуляр
# cv.vocabulary_

In [17]:
# сам тензор
sentence_t = torch.FloatTensor(f_s_cv[0])
sentence_t

tensor([1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
        0., 0., 0., 1., 0., 1.])

## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [15]:
# 3.1 Считать файл yelp/raw_train.csv.
df_ = pd.read_csv('/content/drive/MyDrive/ML/5/yelp/raw_train.csv', header=None, names=['target', 'review'])
df_.head()

Unnamed: 0,target,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [16]:
# Оставить от исходного датасета 10% строчек.
df = df_.sample(int(df_.shape[0] * 0.1))
df.shape

(56000, 2)

In [17]:
# 3.2 Воспользоваться функцией preprocess_text из 1.1 для обработки текста отзыва.
df['review'] = df['review'].apply(preprocess_text)
df['review'] = df['review'].apply(word_tokenize)
stopWords = stopwords.words('english') + list(punctuation) + ["'t","'s", "'re"]
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stopWords])
stemmer = SnowballStemmer("english")
df['review'] = df['review'].apply(lambda x: [stemmer.stem(word) for word in x])

In [18]:
# Закодировать рейтинг числами, начиная с 0.
le = LabelEncoder()
le.fit(df['target'])
df['target'] = le.transform(df['target'])
df.head()

Unnamed: 0,target,review
472715,1,"[sju, las, vega, one, stop, atalanta, time, la..."
205361,1,"[shop, sun, valli, quilt, often, stop, jose, r..."
260885,0,"['m, sorri, nsat, night, around, one, person, ..."
536885,0,"[stay, away, eater, bewar, wife, order, meal, ..."
10378,0,"[experi, move, barr, apart, first, show, apart..."


In [19]:
df.to_csv('/content/drive/MyDrive/ML/5/yelp/preprocessed.csv')

In [20]:
# 3.3 Разбить датасет на обучающую и тестовую выборку
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['target'],
                                                    train_size=0.85,
                                                    random_state=42)

In [21]:
# 3.4 Реализовать класс Vocab (токен = слово)
class Vocab:
  def __init__(self, data):
    self.token_to_idx = {}
    self.idx_to_token = {}
    for line in data:
      for word in list(line):
        if word not in self.token_to_idx:
          self.token_to_idx[word] = len(self.token_to_idx)
          self.idx_to_token[len(self.token_to_idx)] = word

    self.vocab_len = len(self.token_to_idx)

In [22]:
# 3.5 Реализовать класс ReviewDataset
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X.to_numpy()
    self.y = y.to_numpy()
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    tensor = torch.zeros(1,vocab.vocab_len)
    for word in review:
        tensor[0][vocab.token_to_idx[word]] = 1
    return tensor

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    X = self.X[idx]
    y = self.y[idx]
    return X, y

In [23]:
df = pd.read_csv('/content/drive/MyDrive/ML/5/yelp/preprocessed.csv', index_col=[0])
df.head()

Unnamed: 0,target,review
472715,1,"['sju', 'las', 'vega', 'one', 'stop', 'atalant..."
205361,1,"['shop', 'sun', 'valli', 'quilt', 'often', 'st..."
260885,0,"[""'m"", 'sorri', 'nsat', 'night', 'around', 'on..."
536885,0,"['stay', 'away', 'eater', 'bewar', 'wife', 'or..."
10378,0,"['experi', 'move', 'barr', 'apart', 'first', '..."


In [24]:
from ast import literal_eval

df['review'] = df['review'].apply(literal_eval)
vocab = Vocab(df['review'])
vocab.vocab_len

63132

In [None]:
%%time
train_data = ReviewDataset(X_train, y_train, vocab)
train_data.X = X_train.apply(train_data.vectorize)
train_data.X = train_data.X.to_numpy()
test_data = ReviewDataset(X_test, y_train, vocab)
test_data.X = X_test.apply(test_data.vectorize).to_numpy()

In [None]:
batch_size = 100
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
class ReviewClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size, hidden_size):
        super(ReviewClassifier, self).__init__()
        self.linear1 = nn.Linear(vocab_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size * 2)
        self.linear3 = nn.Linear(hidden_size, num_labels)
        self.softmax = nn.LogSoftmax()
        self.relu = nn.ReLU()

    def forward(self, inputs):
        lin1 = self.relu(self.linear1(inputs))
        lin2 = self.relu(self.linear2(lin1))
        return self.linear2(lin2)

In [None]:
# 3.6. Обучить классификатор.

vocab_size = vocab.vocab_len
num_labels = 512
hidden_size = 4
lr=0.005

model = ReviewClassifier(num_labels, vocab_size, hidden_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

log_interval = 5
n_epoch = 100 + 1
losses_list = []

start = time.time()
for epoch in range(1, n_epoch):
    model.train()
    running_loss = 0
    for data, target in train_dataloader:
        model.zero_grad()
        predict = model(data)
        loss = loss_function(predict, target)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    losses = running_loss/len(train_dataloader)
    if epoch % log_interval == 0:
        print("Epoch: {}/{} |".format(epoch, n_epoch-1),
              "{}|".format(time_since(start)),
              "Training loss: {:.3f}..".format(losses))

In [None]:
# vocab_size = vocab.vocab_len
# num_labels = 512
# hidden_size = 4
# lr=0.0075

# model = ReviewClassifier(num_labels, vocab_size, hidden_size)
# # loss_function = nn.NLLLoss()
# loss_function = nn.NLLLoss()
# optimizer = optim.Adagrad(model.parameters(), lr=lr)

# log_interval = 10
# n_epoch = 500
# losses_list = []

# for epoch in range(n_epoch + 1):
#     losses = []
#     if epoch > 300:
#         lr=0.0025
#     optimizer = optim.Adam(model.parameters(), lr=lr)
#     for data, target in train_dataloader:
#         model.zero_grad()
#         predict = model(data.view(data.shape[0], -1))
#         loss = loss_function(predict, target)
#         losses.append(loss.item())
#         loss.backward()
#         optimizer.step()
#     losses_list.append(np.mean(losses))
#     if epoch % log_interval == 0:
#         print("Epoch: {}/{} |".format(epoch, n_epoch),
#                   "Training loss: {:.3f}..".format(np.mean(losses_list)))

In [None]:
# Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать
# небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса
# (сделать это для явно позитивного и явно негативного отзыва)

losses_t = []
correct = 0
sum = 0
for data, target in test_dataloader:
    # model.no_grad()
    predict = model(data.view(data.shape[0], -1))
    # print(target)
    # correct += predict.eq(target.view_as(predict)).sum().item()
    for idx in range(data.size()[0]):
        if predict[idx].argmax().item() == target[idx].item() :
          correct += 1
        sum += 1
    loss = loss_function(predict, target)
    losses_t.append(loss.item())


print("Test loss: {:.3f}..".format(np.mean(losses_t)))
print(f'{correct} / {sum}, {correct / sum * 100:.3}%')

# print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
#     np.mean(losses_t), correct, len(test_data.dataset), 100. * correct / len(test_data.dataset)))

Test loss: 37.024..
412 / 840, 49.0%


  return self.softmax(lin2)


In [None]:
# 3.1 Считать файл yelp/raw_train.csv.
sample = df.sample(10)
sample

Unnamed: 0,target,review
19292,0,"[tri, salon, see, want, use, hair, makeup, bri..."
128323,1,"[amaz, n, nmi, batteri, die, guy, believ, name..."
337817,1,"[mango, smoothi, quit, delici, barista, alway,..."
421979,1,"[realli, nice, guy, great, communication-, imp..."
353820,1,[busi]
78207,1,"[drop, hole, wall, whim, drive, home, pam, soo..."
503775,0,"[truli, disappoint, ..., remind, poor, thrown,..."
453874,0,"[n't, bother, went, dinner, place, almost, emp..."
352911,0,"[pedicur, receiv, worst, ever, add, insult, in..."
397002,0,"[wow, believ, poor, servic, receiv, john, plac..."


In [None]:
df.target.unique()

array([1, 0])

In [None]:
d = ReviewDataset(sample['review'], sample['target'], vocab)
d.X = sample['review'].apply(d.vectorize).to_numpy()
dataloader = DataLoader(d, batch_size=d.__len__())

for data, target in dataloader:
    predict = model(data)
    for idx in range(10):
        print(sample.iloc[idx]['target'], le.classes_[predict[idx].argmax().item()])

0 0
1 1
1 1
1 1
1 0
1 1
0 0
0 0
0 0
0 0


## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [44]:
# 3.1 Считать файл yelp/raw_train.csv.
df_ = pd.read_csv('/content/drive/MyDrive/ML/5/yelp/raw_train.csv', header=None, names=['target', 'review'])
df_.head()

Unnamed: 0,target,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [45]:
# Оставить от исходного датасета 10% строчек.
df = df_.sample(int(df_.shape[0] * 0.1))
df.shape

(56000, 2)

In [46]:
# 3.2 Воспользоваться функцией preprocess_text из 1.1 для обработки текста отзыва.
def preprocess_text_v2(text: str):
  stopWords = stopwords.words('english') + list(punctuation)
  words = text.split()
  noStopWords = [word for word in words if word not in stopWords]
  return " ".join([stemmer.stem(word) for word in noStopWords])


df['review'] = df['review'].apply(preprocess_text)
df['review'] = df['review'].apply(preprocess_text_v2)
df['review'].dropna(inplace=True)

# df['review'] = df['review'].apply(word_tokenize)

# stopWords = stopwords.words('english') + list(punctuation) + ["'t","'s", "'re"]
# df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stopWords])

# stemmer = SnowballStemmer("english")
# df['review'] = df['review'].apply(lambda x: [stemmer.stem(word) for word in x])

In [47]:
# Закодировать рейтинг числами, начиная с 0.

le = LabelEncoder()
le.fit(df['target'])
df['target'] = le.transform(df['target'])
df.head()

Unnamed: 0,target,review
301789,1,i'v tri typic breakfast places... one way bett...
138968,1,husband came recent thought great deal love lo...
164812,1,sophist beauti setting. high recommend white s...
53649,1,"time fav pho restaurant, authentic, mountain l..."
538171,0,failur custom servic locat taste. complet fail...


In [48]:
df.to_csv('/content/drive/MyDrive/ML/5/yelp/preprocessed.csv')

In [66]:
# 3.4 Реализовать класс Vocab (токен = слово)

class Vocab:
  def __init__(self, data):
    self.token_to_idx = {}
    self.idx_to_token = {}
    for line in data:
      if type(line) == str:
        line = word_tokenize(line)
        for word in line:
          if word not in self.token_to_idx:
            self.token_to_idx[word] = len(self.token_to_idx)
            self.idx_to_token[len(self.token_to_idx)] = word
    self.vocab_len = len(self.token_to_idx)

In [67]:
# 3.5 Реализовать класс ReviewDataset
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    tensor = torch.zeros(vocab.vocab_len)
    review_list = word_tokenize(review)
    for word in review_list:
        tensor[vocab.token_to_idx[word]] = 1
    return tensor

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    X = self.vectorize(self.X[idx])
    y = self.y[idx]
    return X, y

In [68]:
df = pd.read_csv('/content/drive/MyDrive/ML/5/yelp/preprocessed.csv', index_col=[0])
df.head()

Unnamed: 0,target,review
301789,1,i'v tri typic breakfast places... one way bett...
138968,1,husband came recent thought great deal love lo...
164812,1,sophist beauti setting. high recommend white s...
53649,1,"time fav pho restaurant, authentic, mountain l..."
538171,0,failur custom servic locat taste. complet fail...


In [78]:
df = df.dropna()

In [79]:
# 3.3 Разбить датасет на обучающую и тестовую выборку
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['target'],
                                                    train_size=0.85,
                                                    random_state=42)

In [107]:
%%time
vocab = Vocab(df['review'])
train_data = ReviewDataset(X_train.values, y_train.values, vocab)
test_data = ReviewDataset(X_test.values, y_test.values, vocab)

CPU times: user 42.9 s, sys: 70.4 ms, total: 43 s
Wall time: 44.9 s


In [108]:
batch_size = 30
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [111]:
# 3.6 Обучить классификатор
class ReviewClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size, hidden_size):
        super(ReviewClassifier, self).__init__()
        self.linear1 = nn.Linear(vocab_size, hidden_size * 2)
        self.linear2 = nn.Linear(hidden_size * 2, hidden_size)
        self.linear3 = nn.Linear(hidden_size, num_labels)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        lin1 = self.relu(self.linear1(inputs))
        lin2 = self.relu(self.linear2(lin1))
        return self.linear3(lin2)

In [83]:
import time

def time_since(since):
    now = time.time()
    s = now - since
    m = np.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [118]:
vocab_size = vocab.vocab_len
num_labels = 3
hidden_size = 128
lr=0.005

model = ReviewClassifier(num_labels, vocab_size, hidden_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

log_interval = 1
n_epoch = 4
losses_list = []

start = time.time()

model.train()
for epoch in range(1, n_epoch + 1):
    losses = []
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for idx, (data, target) in enumerate(train_dataloader):
        model.zero_grad()
        predict = model(data)
        loss = loss_function(predict, target)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    losses_list.append(np.mean(losses))
    if epoch % log_interval == 0:
        print("Epoch: {}/{} ({:.2f})%|".format(epoch, n_epoch, epoch / n_epoch * 100),
              "{}|".format(time_since(start)),
                  "Training loss: {:.3f}..".format(np.mean(losses_list)))

Epoch: 1/4 (25.00)%| 9m 0s| Training loss: 0.264..
Epoch: 2/4 (50.00)%| 18m 27s| Training loss: 0.197..
Epoch: 3/4 (75.00)%| 28m 30s| Training loss: 0.152..
Epoch: 4/4 (100.00)%| 38m 34s| Training loss: 0.123..


In [119]:
torch.save(model.state_dict(), 'model_review.pt')

In [120]:
# 3.7 Измерить точность на тестовой выборке.

losses_t = []
correct = 0
sum = 0
for idx_, (data, target) in enumerate(test_dataloader):
    # model.no_grad()
    predict = model(data)
    for idx in range(data.size()[0]):
        if predict[idx].topk(1)[1].item() == target[idx].item():
          correct += 1
    sum += data.size()[0]
    loss = loss_function(predict, target)
    losses_t.append(loss.item())

print("Test loss: {:.3f}..".format(np.mean(losses_t)))
print(f'{correct} / {sum}, {correct / sum * 100:.3}%')

Test loss: 0.871..
7537 / 8400, 89.7%


In [128]:
# Проверить работоспособность модели: придумать небольшой отзыв,
# прогнать его через модель и вывести номер предсказанного класса
#  (сделать это для явно позитивного и явно негативного отзыва)
# 0 - плохо
# 1 - хорошо

bad_revs = ['unhealthy food', 'disgusting view']
good_revs = ['I love this place, best cupcakes', 'amazing view']
pos_neg = ['bad', 'good']

for rev in bad_revs:
    pred_ = model(train_data.vectorize(preprocess_text_v2(rev))).topk(1)[1].item()
    print(f'{rev} (bad review) => (predicted) {pos_neg[pred_]}')

for rev in good_revs:
    pred_ = model(train_data.vectorize(preprocess_text_v2(rev))).topk(1)[1].item()
    print(f'{rev} (good review) => (predicted) {pos_neg[pred_]}')

unhealthy food (bad review) => (predicted) bad
disgusting view (bad review) => (predicted) bad
I love this place, best cupcakes (good review) => (predicted) good
amazing view (good review) => (predicted) good
