In [None]:
import re
import nltk
import numpy

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder


from torch.autograd import Variable

In [None]:
import torch.nn.functional as F
import numpy as np

In [None]:
# Fetch the NLTK 'punkt' tokenizer data (needed by word_tokenize / sent_tokenize used below).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the NLTK stop-word corpus read by stopwords.words('english') below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Colab only: mount Google Drive; the datasets are read from /content/drive/MyDrive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Представление и предобработка текстовых данных

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
# Sample text used by the preprocessing demos in section 1.
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [None]:
# Tokenization: split the text into word and punctuation tokens
word_tokenize_list = word_tokenize(text)
word_tokenize_list

['Select',
 'your',
 'preferences',
 'and',
 'run',
 'the',
 'install',
 'command',
 '.',
 'Stable',
 'represents',
 'the',
 'most',
 'currently',
 'tested',
 'and',
 'supported',
 'version',
 'of',
 'PyTorch',
 '.',
 'Note',
 'that',
 'LibTorch',
 'is',
 'only',
 'available',
 'for',
 'C++']

In [None]:
# Stemming + lower-casing (the stems shown in the output are already lower-cased)
stemmer = SnowballStemmer("english")
stemmed_words = [stemmer.stem(word) for word in word_tokenize_list]
stemmed_words

['select',
 'your',
 'prefer',
 'and',
 'run',
 'the',
 'instal',
 'command',
 '.',
 'stabl',
 'repres',
 'the',
 'most',
 'current',
 'test',
 'and',
 'support',
 'version',
 'of',
 'pytorch',
 '.',
 'note',
 'that',
 'libtorch',
 'is',
 'onli',
 'avail',
 'for',
 'c++']

In [None]:
from string import punctuation

# Remove stop-words and single-character punctuation tokens
stopWords = stopwords.words('english') + list(punctuation)
wordsFiltered = [word for word in stemmed_words if word not in stopWords]
wordsFiltered

['select',
 'prefer',
 'run',
 'instal',
 'command',
 'stabl',
 'repres',
 'current',
 'test',
 'support',
 'version',
 'pytorch',
 'note',
 'libtorch',
 'onli',
 'avail',
 'c++']

Реализовать функцию `preprocess_text(text: str)`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [None]:
def preprocess_text(text: str):
  '''Lower-case ``text`` and blank out every character outside a small whitelist.

  Kept characters: latin letters, the space, and the symbols . , ! ? - '
  (hyphen and apostrophe are kept so that words such as "what's" survive).
  Every other character (digits, accented letters, newlines, ...) is replaced
  by a single space, so the result has the same length as the lower-cased input.

  Args:
    text: raw input string.

  Returns:
    The normalised string.
  '''
  # Raw string literal: "\." inside a normal string is an invalid escape sequence
  # (Python warns about it today). Inside a character class a bare "." is literal,
  # so the set of kept characters is unchanged.
  return re.sub(r"[^A-Za-z.,!? \-']", ' ', text.lower())

In [None]:
# Quick check of preprocess_text: print the raw string, then the processed one.
text1 = "Hello world! What's up? It's very nice to see you, is't it? Definetely. 55"
print(text1)
print(preprocess_text(text1))

Hello world! What's up? It's very nice to see you, is't it? Definetely. 55
hello world! what's up? it's very nice to see you, is't it? definetely.   


1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присутствует в предложении.

In [None]:
# Split the text into sentences and keep the first one.
tokenized_text = sent_tokenize(text)
first_sentence = tokenized_text[0]
first_sentence

'Select your preferences and run the install command.'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True marks word presence (0/1) instead of counting occurrences.
# The vocabulary is fitted on all sentences of `text`; only the first one is encoded.
cv = CountVectorizer(binary=True)
cv.fit(tokenized_text)
f_s_cv = cv.transform([first_sentence]).toarray()
f_s_cv

array([[1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 1]])

In [None]:
# Optional: uncomment to inspect the vocabulary fitted by the vectorizer
# cv.vocabulary_

In [None]:
# The tensor itself: the first (and only) row of the binary matrix as floats
sentence_t = torch.FloatTensor(f_s_cv[0])
sentence_t

tensor([1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
        0., 0., 0., 1., 0., 1.])

## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`.

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модель и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [None]:
# 2.1 Read the file surnames/surnames.csv.
df = pd.read_csv('/content/drive/MyDrive/ML/5/surnames.csv')
df.head(10)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian
5,Obinata,Japanese
6,Rahal,Arabic
7,Zhuan,Chinese
8,Acconci,Italian
9,Mifsud,Arabic


In [None]:
# 2.2 Encode nationalities as integers starting from 0.

le = LabelEncoder()
# Keep the readable label next to the numeric code for printing predictions later.
df['nat'] = df['nationality']
# Fit the encoder on the label column only. Applying fit_transform to the whole
# frame re-fits the encoder on every column (surnames included), so le.classes_
# would depend on which column happens to be processed last.
df['nationality'] = le.fit_transform(df['nationality'])
df.head()

Unnamed: 0,surname,nationality,nat
0,Woodford,4,English
1,Coté,5,French
2,Kore,4,English
3,Koury,0,Arabic
4,Lebzak,14,Russian


In [None]:
# Check column dtypes and non-null counts after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10980 entries, 0 to 10979
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   surname      10980 non-null  object
 1   nationality  10980 non-null  int64 
 2   nat          10980 non-null  object
dtypes: int64(1), object(2)
memory usage: 257.5+ KB


In [None]:
# 2.3 Split the dataset into train and test parts (85% / 15%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['surname'], df['nationality'],
                                                    train_size=0.85,
                                                    random_state=42)

In [None]:
# 2.4 Vocab class (token = character)
class Vocab:
  '''Character-level vocabulary built from an iterable of strings.

  Attributes (all set in ``__init__``):
    token_to_idx: maps each lower-cased character to a 0-based index,
      assigned in order of first appearance.
    idx_to_token: the inverse mapping (index -> character).
    vocab_len: number of distinct characters.
  '''

  def __init__(self, data):
    '''Scan every string in ``data`` and register each new lower-cased character.'''
    self.token_to_idx = {}
    self.idx_to_token = {}
    for word in data:
      for letter in word.lower():
        if letter not in self.token_to_idx:
          # Take the index BEFORE inserting so both dictionaries agree;
          # using len() after the insert shifts idx_to_token by one.
          idx = len(self.token_to_idx)
          self.token_to_idx[letter] = idx
          self.idx_to_token[idx] = letter

    self.vocab_len = len(self.token_to_idx)

In [None]:
# 2.5 SurnamesDataset class
class SurnamesDataset(Dataset):
  '''Dataset of (one-hot encoded surname, label) pairs.

  Every surname is encoded once in ``__init__`` as a float tensor of shape
  ``(max_len, 1, vocab.vocab_len)``. The letters are right-aligned: the rows
  before the first letter stay all-zero.

  Args:
    X: iterable of surname strings (e.g. a pandas Series).
    y: pandas Series of labels, aligned with ``X``.
    vocab: object exposing ``token_to_idx`` (char -> index) and ``vocab_len``.
    max_len: number of character positions in the encoding (default 20).
  '''

  def __init__(self, X, y, vocab: "Vocab", max_len: int = 20):
    if max_len < 1:
      raise ValueError("max_len must be a positive integer")
    self.vocab = vocab
    self.max_len = max_len
    self.y = y.copy().to_numpy()
    # Encode eagerly; a plain list keeps the tensors as they are.
    self.X = [self.vectorize(surname) for surname in X]

  def vectorize(self, surname):
    '''Return the one-hot encoding of ``surname`` (see 1.2 for binary encoding).

    Uses the vocabulary passed to the constructor rather than a global one.
    Surnames longer than ``max_len`` keep only their last ``max_len`` letters;
    without this the offset goes negative and the first letters are silently
    written onto the final rows. Characters missing from the vocabulary leave
    their row at zero instead of raising ``KeyError``.
    '''
    tensor = torch.zeros(self.max_len, 1, self.vocab.vocab_len)
    letters = surname.lower()[-self.max_len:]
    skew = self.max_len - len(letters)
    for i, letter in enumerate(letters):
      token_idx = self.vocab.token_to_idx.get(letter)
      if token_idx is not None:
        tensor[i + skew, 0, token_idx] = 1
    return tensor

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

In [None]:
# The vocabulary is built from every surname in `df` (train and test rows alike).
vocab = Vocab(df['surname'])
train_data = SurnamesDataset(X_train, y_train, vocab)
test_data = SurnamesDataset(X_test, y_test, vocab)

batch_size = 25
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# NOTE(review): shuffle=True is also set for the test loader.
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
class SurnameClassifier(nn.Module):
    """Two-layer perceptron over a flattened one-hot surname encoding.

    Args:
        num_labels: number of output classes.
        vocab_size: number of distinct characters (width of one one-hot row).
        hidden_size: width of the hidden layer.
        max_len: number of character positions per surname. It must match the
            length used when encoding the surnames; defaults to 20, the value
            that used to be hard-coded here.

    ``forward`` returns log-probabilities of shape ``(batch, num_labels)``.
    """

    def __init__(self, num_labels, vocab_size, hidden_size, max_len=20):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(vocab_size * max_len, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_labels)
        self.tanh = nn.Tanh()
        # The unused LogSigmoid module was dropped; it holds no parameters,
        # so the state_dict layout is unchanged.
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        """Map a batch of encodings (any shape with batch first) to log-probabilities."""
        hidden = self.tanh(self.linear1(self.flatten(inputs)))
        return self.log_softmax(self.linear2(hidden))

In [None]:
import time

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<M>m <S>s'``.

    ``since`` is a timestamp previously taken with ``time.time()``.
    Fractions of a second are truncated by the ``%d`` formatting.
    """
    elapsed = time.time() - since
    whole_minutes = np.floor(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

In [None]:
# 2.6. Train the classifier.

vocab_size = vocab.vocab_len
num_labels = len(le.classes_)
hidden_size = 128
lr=0.005

model = SurnameClassifier(num_labels, vocab_size, hidden_size)
# NOTE(review): SurnameClassifier.forward already ends with LogSoftmax, while
# nn.CrossEntropyLoss applies log-softmax itself; the second application does not
# change log-probabilities, so this is redundant rather than wrong.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

log_interval = 100
n_epoch = 1_000 + 1  # range(1, n_epoch) below therefore runs 1000 epochs
losses_list = []  # not used anywhere in this cell

start = time.time()
for epoch in range(1, n_epoch):
    model.train()
    running_loss = 0
    for data, target in train_dataloader:
        # Clear gradients left over from the previous batch.
        model.zero_grad()
        predict = model(data)
        loss = loss_function(predict, target)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean batch loss of this epoch.
    losses = running_loss/len(train_dataloader)
    if epoch % log_interval == 0:
        print("Epoch: {}/{} |".format(epoch, n_epoch-1),
              "{}|".format(time_since(start)),
              "Training loss: {:.3f}..".format(losses))

Epoch: 100/1000 | 0m 52s| Training loss: 0.540..
Epoch: 200/1000 | 1m 44s| Training loss: 0.221..
Epoch: 300/1000 | 2m 34s| Training loss: 0.105..
Epoch: 400/1000 | 3m 24s| Training loss: 0.077..
Epoch: 500/1000 | 4m 14s| Training loss: 0.069..
Epoch: 600/1000 | 5m 4s| Training loss: 0.064..
Epoch: 700/1000 | 5m 55s| Training loss: 0.061..
Epoch: 800/1000 | 6m 44s| Training loss: 0.059..
Epoch: 900/1000 | 7m 34s| Training loss: 0.058..
Epoch: 1000/1000 | 8m 24s| Training loss: 0.057..


In [None]:
# 2.7 Measure accuracy on the test set.

losses_t = []
correct = 0
total = 0  # named `total` so the builtin sum() is not shadowed for later cells

model.eval()
# nn.Module has no no_grad() method; gradient tracking is switched off with the
# torch.no_grad() context manager.
with torch.no_grad():
    for data, target in test_dataloader:
        # The model flattens its input itself, so no manual view() is needed.
        predict = model(data)
        correct += (predict.argmax(dim=1) == target).sum().item()
        total += target.size(0)
        losses_t.append(loss_function(predict, target).item())


print("Test loss: {:.3f}..".format(np.mean(losses_t)))
print(f'{correct} / {total}, {correct / total * 100:.3}%')

Test loss: 1.939..
1180 / 1647, 71.6%


In [None]:
# Show the top-3 predicted nationalities for a few random surnames from the data.
sample = df.sample(5)
d = SurnamesDataset(sample['surname'], sample['nationality'], vocab)
# One batch holding the whole sample.
dataloader = DataLoader(d, batch_size=len(d))

# Inference only: no gradients needed.
with torch.no_grad():
    for data, target in dataloader:
        predict = model(data)
        # Iterate over the actual batch size instead of a hard-coded 5.
        for idx in range(data.size(0)):
            row = sample.iloc[idx]
            print(f'Surname: {row["surname"]} ({row["nat"]})')
            pred_surnames = torch.topk(predict[idx], 3).indices.tolist()
            pred_surnames_ = ', '.join([le.classes_[i] for i in pred_surnames[1:]])
            print(f'predicted nationality: {le.classes_[pred_surnames[0]]} (or {pred_surnames_})\n')

Surname: Zholtovsky (Russian)
predicted nationality: Russian (or English, Czech)

Surname: Agadjanyan (Russian)
predicted nationality: Russian (or English, Irish)

Surname: Baldini (Italian)
predicted nationality: Italian (or Russian, Polish)

Surname: Sturrock (English)
predicted nationality: English (or Czech, Polish)

Surname: Mihalevich (Russian)
predicted nationality: Russian (or German, Dutch)



In [None]:
# Sanity check: run a few surnames of students from the group through the model
# and print the 3 most probable predictions for each.
my_ows_surnames = pd.DataFrame([{'surname':'Kamenchuk', 'nationality':'Russian'},
                                {'surname':'Gladkij', 'nationality':'Russian'},
                                {'surname':'Alkhazha', 'nationality':'Arabic'}])

# In this frame 'nationality' holds the readable label and 'nat' its integer code.
my_ows_surnames['nat'] = le.transform(my_ows_surnames['nationality'])
# Pass the encoded column as targets (previously the string column was passed
# and the encoded one was never used).
d = SurnamesDataset(my_ows_surnames['surname'], my_ows_surnames['nat'], vocab)
dataloader = DataLoader(d, batch_size=len(d))

# Inference only: no gradients needed.
with torch.no_grad():
    for data, target in dataloader:
        predict = model(data)
        for idx in range(data.size(0)):
            row = my_ows_surnames.iloc[idx]
            print(f'Surname: {row["surname"]} ({row["nationality"]})')
            pred_surnames = torch.topk(predict[idx], 3).indices.tolist()
            pred_surnames_ = ', '.join([le.classes_[i] for i in pred_surnames[1:]])
            print(f'predicted nationality: {le.classes_[pred_surnames[0]]} (or {pred_surnames_})\n')

Surname: Kamenchuk (Russian)
predicted nationality: Russian (or Polish, Czech)

Surname: Gladkij (Russian)
predicted nationality: Russian (or Czech, English)

Surname: Alkhazha (Arabic)
predicted nationality: Russian (or Czech, Greek)



## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [None]:
# 3.1 Read the file yelp/raw_train.csv (no header row; columns are named here).
df_ = pd.read_csv('/content/drive/MyDrive/ML/5/yelp/raw_train.csv', header=None, names=['target', 'review'])
df_.head()

Unnamed: 0,target,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [None]:
# Keep 10% of the rows of the original dataset.
# random_state pins the sample so a re-run selects the same rows.
df = df_.sample(frac=0.1, random_state=42)
df.shape

(56000, 2)

In [None]:
# 3.2 Use preprocess_text from 1.1 to process the review text.

# Built once: reading the stop-word list on every call and searching a list
# linearly is needlessly slow when applied to tens of thousands of reviews.
_STOP_WORDS = frozenset(stopwords.words('english')) | frozenset(punctuation)


def preprocess_text_v2(text: str):
  '''Drop stop-words / stand-alone punctuation tokens and stem the rest.

  The text is split on whitespace, so punctuation glued to a word
  (e.g. "place,") stays part of that token.

  Args:
    text: review text, expected to be lower-cased already by preprocess_text.

  Returns:
    The remaining stemmed tokens joined by single spaces.
  '''
  kept = (word for word in text.split() if word not in _STOP_WORDS)
  return " ".join(stemmer.stem(word) for word in kept)


# Drop missing reviews BEFORE preprocessing and keep the result. Calling
# dropna(inplace=True) on the selected column afterwards does not remove
# any rows from `df`.
df = df.dropna(subset=['review']).copy()
df['review'] = df['review'].apply(preprocess_text).apply(preprocess_text_v2)

In [None]:
# Encode the rating as integers starting from 0.

le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
df.head()

Unnamed: 0,target,review
301789,1,i'v tri typic breakfast places... one way bett...
138968,1,husband came recent thought great deal love lo...
164812,1,sophist beauti setting. high recommend white s...
53649,1,"time fav pho restaurant, authentic, mountain l..."
538171,0,failur custom servic locat taste. complet fail...


In [None]:
# Cache the preprocessed frame on Drive; it is read back with pd.read_csv further down.
df.to_csv('/content/drive/MyDrive/ML/5/yelp/preprocessed.csv')

In [None]:
# 3.4 Vocab class (token = word)

class Vocab:
  '''Word-level vocabulary built from an iterable of texts.

  NOTE: this definition replaces the character-level ``Vocab`` defined
  earlier in the notebook.

  Attributes (all set in ``__init__``):
    token_to_idx: maps each token produced by ``word_tokenize`` to a 0-based
      index, assigned in order of first appearance.
    idx_to_token: the inverse mapping (index -> token).
    vocab_len: number of distinct tokens.
  '''

  def __init__(self, data):
    '''Tokenize every string in ``data`` and register each new token.'''
    self.token_to_idx = {}
    self.idx_to_token = {}
    for line in data:
      # Entries that are not strings (e.g. NaN for a missing review) are skipped.
      if not isinstance(line, str):
        continue
      for word in word_tokenize(line):
        if word not in self.token_to_idx:
          # Take the index BEFORE inserting so both dictionaries agree;
          # using len() after the insert shifts idx_to_token by one.
          idx = len(self.token_to_idx)
          self.token_to_idx[word] = idx
          self.idx_to_token[idx] = word
    self.vocab_len = len(self.token_to_idx)

In [None]:
# 3.5 ReviewDataset class
class ReviewDataset(Dataset):
  '''Dataset of (binary bag-of-words vector, label) pairs.

  Reviews are vectorized lazily in ``__getitem__``.

  Args:
    X: indexable collection of review strings.
    y: indexable collection of labels, aligned with ``X``.
    vocab: object exposing ``token_to_idx`` (token -> index) and ``vocab_len``.
  '''

  def __init__(self, X, y, vocab: "Vocab"):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Return the binary encoding of ``review`` (see 1.2).

    The result is a float tensor of length ``vocab_len`` with 1 at the index
    of every token present in the review. Uses the vocabulary passed to the
    constructor rather than a global one. Tokens missing from the vocabulary
    are ignored instead of raising ``KeyError``, so arbitrary new text can be
    encoded.
    '''
    tensor = torch.zeros(self.vocab.vocab_len)
    for word in word_tokenize(review):
      token_idx = self.vocab.token_to_idx.get(word)
      if token_idx is not None:
        tensor[token_idx] = 1
    return tensor

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.vectorize(self.X[idx]), self.y[idx]

In [None]:
# Reload the cached preprocessed reviews; the first CSV column is the saved index.
df = pd.read_csv('/content/drive/MyDrive/ML/5/yelp/preprocessed.csv', index_col=[0])
df.head()

Unnamed: 0,target,review
301789,1,i'v tri typic breakfast places... one way bett...
138968,1,husband came recent thought great deal love lo...
164812,1,sophist beauti setting. high recommend white s...
53649,1,"time fav pho restaurant, authentic, mountain l..."
538171,0,failur custom servic locat taste. complet fail...


In [None]:
# Drop rows with missing values (presumably reviews left empty by preprocessing,
# which are read back from CSV as NaN — TODO confirm).
df = df.dropna()

In [None]:
# 3.3 Split the dataset into train and test parts (85% / 15%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['target'],
                                                    train_size=0.85,
                                                    random_state=42)

In [None]:
%%time
# Build the word vocabulary from all reviews in `df`, then wrap the train/test splits.
# .values passes plain arrays, so the datasets are indexed by position.
vocab = Vocab(df['review'])
train_data = ReviewDataset(X_train.values, y_train.values, vocab)
test_data = ReviewDataset(X_test.values, y_test.values, vocab)

CPU times: user 42.9 s, sys: 70.4 ms, total: 43 s
Wall time: 44.9 s


In [None]:
# Mini-batch loaders for the review datasets.
batch_size = 30
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# NOTE(review): shuffle=True is also set for the test loader.
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
# 3.6 Train the classifier
class ReviewClassifier(nn.Module):
    """Three-layer perceptron over a bag-of-words vector.

    Layer widths: vocab_size -> 2 * hidden_size -> hidden_size -> num_labels,
    with ReLU after the first two layers. ``forward`` returns raw scores
    (no softmax applied).
    """

    def __init__(self, num_labels, vocab_size, hidden_size):
        super().__init__()
        wide = hidden_size * 2
        self.linear1 = nn.Linear(vocab_size, wide)
        self.linear2 = nn.Linear(wide, hidden_size)
        self.linear3 = nn.Linear(hidden_size, num_labels)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        hidden = inputs
        # The two hidden layers share the same activation.
        for layer in (self.linear1, self.linear2):
            hidden = self.relu(layer(hidden))
        return self.linear3(hidden)

In [None]:
import time

# NOTE(review): duplicate of the time_since helper defined earlier in this notebook
# (same behaviour); this later definition shadows the earlier one.
def time_since(since):
    """Return the time elapsed since ``since`` (a time.time() timestamp) as '<M>m <S>s'."""
    now = time.time()
    s = now - since
    m = np.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [None]:
vocab_size = vocab.vocab_len
# One output per distinct rating actually present in the data. The previous
# hard-coded 3 added a class that never occurs and could be predicted,
# overflowing two-element label lists.
num_labels = int(df['target'].nunique())
hidden_size = 128
lr=0.005

model = ReviewClassifier(num_labels, vocab_size, hidden_size)
loss_function = nn.CrossEntropyLoss()
# Create the optimizer ONCE. Previously an SGD optimizer was built here and then
# replaced by a brand-new Adam at the start of every epoch, which discarded Adam's
# running moment estimates each time. Adam is kept because it is what actually
# performed the updates.
optimizer = optim.Adam(model.parameters(), lr=lr)

log_interval = 1
n_epoch = 4
losses_list = []  # mean training loss of each finished epoch

start = time.time()

model.train()
for epoch in range(1, n_epoch + 1):
    losses = []
    for data, target in train_dataloader:
        optimizer.zero_grad()
        predict = model(data)
        loss = loss_function(predict, target)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    epoch_loss = np.mean(losses)
    losses_list.append(epoch_loss)
    if epoch % log_interval == 0:
        # Report this epoch's loss, not the average over all epochs so far.
        print("Epoch: {}/{} ({:.2f})%|".format(epoch, n_epoch, epoch / n_epoch * 100),
              "{}|".format(time_since(start)),
              "Training loss: {:.3f}..".format(epoch_loss))

Epoch: 1/4 (25.00)%| 9m 0s| Training loss: 0.264..
Epoch: 2/4 (50.00)%| 18m 27s| Training loss: 0.197..
Epoch: 3/4 (75.00)%| 28m 30s| Training loss: 0.152..
Epoch: 4/4 (100.00)%| 38m 34s| Training loss: 0.123..


In [None]:
# Save the trained weights (state_dict only) to the current working directory.
torch.save(model.state_dict(), 'model_review.pt')

In [None]:
# 3.7 Measure accuracy on the test set.

losses_t = []
correct = 0
total = 0  # named `total` so the builtin sum() is not shadowed

model.eval()
# Evaluation needs no gradients; torch.no_grad() avoids building the autograd graph.
with torch.no_grad():
    for data, target in test_dataloader:
        predict = model(data)
        correct += (predict.argmax(dim=1) == target).sum().item()
        total += target.size(0)
        losses_t.append(loss_function(predict, target).item())

print("Test loss: {:.3f}..".format(np.mean(losses_t)))
print(f'{correct} / {total}, {correct / total * 100:.3}%')

Test loss: 0.871..
7537 / 8400, 89.7%


In [None]:
# Sanity check: make up short reviews, run them through the model and print
# the predicted class (for clearly negative and clearly positive reviews).
# 0 - bad
# 1 - good

bad_revs = ['unhealthy food', 'disgusting view']
good_revs = ['I love this place, best cupcakes', 'amazing view']
pos_neg = ['bad', 'good']

model.eval()
with torch.no_grad():
    for revs, expected in ((bad_revs, 'bad'), (good_revs, 'good')):
        for rev in revs:
            # Apply the same two-step preprocessing as the training data
            # (preprocess_text first, then stop-word removal + stemming).
            cleaned = preprocess_text_v2(preprocess_text(rev))
            pred_ = model(train_data.vectorize(cleaned)).argmax().item()
            print(f'{rev} ({expected} review) => (predicted) {pos_neg[pred_]}')

unhealthy food (bad review) => (predicted) bad
disgusting view (bad review) => (predicted) bad
I love this place, best cupcakes (good review) => (predicted) good
amazing view (good review) => (predicted) good
