# Unit 3 - Text Classification
**Use recurrent neural networks to classify texts**
 
https://www.kaggle.com/competitions/nlp-txt-classification

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [61]:
from pathlib import Path
from typing import List, Dict

import time
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
# import torch.optim as optim
# from torch.utils.data.dataset import random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# from nltk.tokenize import WordPunctTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# from sklearn.metrics import f1_score
# from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
import scikitplot as skplt

In [3]:
# Notebook-wide configuration.
DATA_PATH = Path('/kaggle/input/nlp-txt-classification')  # holds train.csv, test.csv, sample_submission.csv

SEED = 42                 # random_state for the train/validation split
NUM_CLASSES = 5           # NOTE(review): not referenced in the visible cells; the model uses len(le.classes_)
MAX_VOCAB_SIZE = 250_000  # cap on tokenizer vocabulary and embedding-matrix rows
BATCH_SIZE = 64           # batch size for both data loaders

In [4]:
# Read the training CSV and keep only the text and label columns.
df_train = pd.read_csv(DATA_PATH / 'train.csv').loc[:, ['Text', 'Sentiment']]
df_train.head(10)

In [5]:
# Number of rows per sentiment label.
df_train['Sentiment'].value_counts()

Check for null values

In [6]:
# Count of missing values in each column.
df_train.isna().sum()

Several null values were found; drop the rows that contain them.

In [7]:
# Remove every row that has a missing value in any column, then show the remaining shape.
df_train = df_train.dropna(axis=0, how='any')
df_train.shape

Check the class balance

In [8]:
# Rows per sentiment class, largest first, shown as a bar chart.
count_df = (
    df_train
    .groupby('Sentiment')
    .aggregate({'Text': 'count'})
    .reset_index()
    .sort_values('Text', ascending=False)
)
px.bar(count_df, x='Sentiment', y='Text')

In [9]:
# Read the competition test set; all columns are kept, and its 'Text' column is used later.
df_test = pd.read_csv(DATA_PATH / 'test.csv')
df_test.head(10)

In [10]:
# Lookup table mapping English contractions to their expanded forms.
# It is passed to ReplaceWordTransform, which looks up each space-separated word
# by exact match, so a contraction with attached punctuation is not replaced.
# NOTE(review): the cleaning pipeline lowercases text before this lookup, so the
# capitalised keys ("I'd", "I'll", ...) cannot match there.
replacement_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is", 
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [11]:
class TextTransform:
    """Base class for text transforms: a callable that maps a string to a string.

    The base implementation returns its input unchanged; subclasses override
    ``__call__`` to do real work.
    """

    def __call__(self, text: str) -> str:
        # Identity: hand the text back untouched.
        return text


class WordTransform(TextTransform):
    """Transform applied independently to each word of a text.

    The text is split on ``delimiter``, every piece is passed through
    ``transform_word``, pieces that come back empty are dropped, and the rest
    are joined with ``delimiter`` again. Because empty pieces are dropped,
    runs of consecutive delimiters collapse into one.
    """

    delimiter = ' '

    def __call__(self, text: str) -> str:
        kept = []
        for token in text.split(self.delimiter):
            mapped = self.transform_word(token)
            if mapped:
                kept.append(mapped)
        return self.delimiter.join(kept)

    def transform_word(self, word: str) -> str:
        """Return the replacement for a single word (identity by default)."""
        return word

    
class LowerTransform(TextTransform):
    """Text transform that lowercases the whole string."""

    def __call__(self, text: str) -> str:
        return text.lower()


class TrimTransform(TextTransform):
    """Text transform that strips leading and trailing whitespace."""

    def __call__(self, text: str) -> str:
        return text.strip()

    
class ReplaceWordTransform(WordTransform):
    """Word transform that swaps a word for its mapped value on an exact match.

    Args:
        items: mapping from the word to replace to its replacement text.
    """

    def __init__(self, items: Dict[str, str]):
        self.items = items

    def transform_word(self, word: str) -> str:
        # Words that are not keys of the mapping pass through unchanged.
        mapping = self.items
        return mapping[word] if word in mapping else word


import re
class RegexpWordTransform(WordTransform):
    """Word transform that substitutes every regex match inside a word.

    Args:
        pattern: regular expression source; compiled once at construction.
        to: replacement text for each match (empty string removes the match).
    """

    def __init__(self, pattern: str, to: str = ''):
        self.to = to
        self.pattern = re.compile(pattern)

    def transform_word(self, word: str) -> str:
        return re.sub(self.pattern, self.to, word)


from urllib.parse import urlparse
class DropUrlTransform(WordTransform):
    """Word transform that removes words which parse as URLs.

    A word counts as a URL when ``urlparse`` finds both a scheme and a network
    location in it (e.g. ``http://example.com``). Such words are replaced by
    the empty string; all other words pass through unchanged.
    """

    def transform_word(self, word: str) -> str:
        return '' if self.is_url(word) else word

    def is_url(self, word: str) -> bool:
        """Return True if ``word`` has both a URL scheme and a network location."""
        try:
            parts = urlparse(word)
        except ValueError:
            # urlparse raises ValueError on malformed input (e.g. an unbalanced
            # '[' in the host part); treat that as "not a URL". Catching only
            # ValueError keeps KeyboardInterrupt/SystemExit from being swallowed.
            return False
        return bool(parts.scheme and parts.netloc)


class TextTransformComposition(TextTransform):
    """Chain of text transforms applied one after another.

    Args:
        items: transforms to apply, in order; each receives the previous
            transform's output.
    """

    def __init__(self, items: List[TextTransform]):
        self.items = items

    def __call__(self, text: str) -> str:
        current = text
        for step in self.items:
            current = step(current)
        return current


In [12]:
# Text-cleaning pipeline; the steps run in list order on each text.
transform = TextTransformComposition([
    # Lowercase first so the lowercase contraction keys can match.
    LowerTransform(),
    # Drop space-separated words that parse as URLs (scheme + host).
    DropUrlTransform(),
    # Expand contractions by exact word lookup.
    ReplaceWordTransform(replacement_dict),
    RegexpWordTransform(r'#[\w\d_]+'), # hashtag remover
    # Strip leading/trailing whitespace.
    TrimTransform(),
    # NOTE(review): repeats the first step; appears redundant — confirm before removing.
    LowerTransform(),
])

See how `transform` works on a sample text from the dataset

In [13]:
# Take the first column ('Text') of the 8th row as a sample.
text = df_train.iloc[7, 0]

# Print the raw and the cleaned text between ':::' markers so that leading and
# trailing whitespace is visible.
print(f':::{text}:::')
print(f':::{transform(text)}:::')

Apply the text transform to the datasets

In [14]:
# Clean the text column of both frames in place with the same pipeline.
# NOTE(review): df_test was not passed through dropna(); a missing 'Text' value
# there would fail inside the transform — confirm the test set has no nulls.
df_train['Text'] = df_train['Text'].apply(transform)
df_test['Text'] = df_test['Text'].apply(transform)

In [42]:
# Hold out 20% of the training rows for validation, stratified on the label so
# both parts keep the same class proportions; SEED makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df_train['Text'],
    df_train['Sentiment'],
    test_size=0.2,
    random_state=SEED,
    stratify=df_train['Sentiment'],
)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [43]:
# Build the vocabulary, then convert every text into a list of integer token ids.
# NOTE(review): the tokenizer is fitted on train, validation AND test texts, so
# held-out text influences the word index — confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(list(X_train) + list(X_val) + list(df_test['Text']))

# From here on X_train / X_val hold token-id sequences instead of raw strings.
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(df_test['Text'])

In [17]:
# Histogram of text lengths over the train and test texts combined.
# NOTE(review): len(s) on a string counts characters, whereas MAX_TEXT_LEN in the
# next cell is applied to token-id sequences — a token-count histogram would
# match that cut-off more directly.
all_texts = pd.concat([df_train['Text'], df_test['Text']], axis=0)
lens = all_texts.apply(lambda s: len(s))
lens.plot.hist(bins=50)

In [44]:
# Fixed sequence length (in tokens) that every example is padded or truncated to.
MAX_TEXT_LEN = 355

# Bring all three splits to the same fixed length.
X_train, X_val, X_test = [
    pad_sequences(split, maxlen=MAX_TEXT_LEN)
    for split in (X_train, X_val, X_test)
]

In [45]:
# Encode the sentiment labels as integer class ids. The encoder is fitted on the
# training labels only and then reused for validation; le.classes_ is used later
# to map ids back to the original labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train.values)
y_val = le.transform(y_val.values)

Load and set up GloVe embeddings

In [20]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip

In [21]:
## FUNCTION FROM https://www.kaggle.com/gmhost/gru-capsule

def load_glove(word_index, embedding_file='glove.840B.300d.txt', max_vocab_size=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is ``word v1 v2 ...`` separated by single spaces.
    Row ``i`` of the result holds the vector of the word whose index is ``i``.
    A word is looked up as given and, if absent, in its capitalised form.
    Rows with no vector (including row 0) keep a random normal initialisation.

    Args:
        word_index: mapping from word to integer index.
        embedding_file: path of the GloVe text file.
        max_vocab_size: words with ``index >= max_vocab_size`` are skipped and
            the matrix has at most this many rows. Defaults to the
            module-level ``MAX_VOCAB_SIZE``.

    Returns:
        ``numpy.ndarray`` of shape
        ``(min(max_vocab_size, len(word_index) + 1), embed_size)``.

    Raises:
        ValueError: if the embedding file contains no vectors.
    """
    if max_vocab_size is None:
        max_vocab_size = MAX_VOCAB_SIZE

    def get_coefs(word, *arr):
        # Keep at most the first 300 components of each vector.
        return word, np.asarray(arr, dtype='float32')[:300]

    # Split on a single space (not arbitrary whitespace), as the original did.
    # The context manager closes the file handle, and the explicit encoding
    # avoids depending on the platform default.
    with open(embedding_file, encoding='utf-8') as fh:
        embeddings_index = dict(
            get_coefs(*line.rstrip('\n').split(' '))
            for line in fh
            if line.strip()
        )
    if not embeddings_index:
        raise ValueError(f'no embedding vectors found in {embedding_file!r}')

    # Read the dimensionality from one vector instead of stacking all of them:
    # np.stack does not accept a dict view, and the copy would double memory use.
    embed_size = len(next(iter(embeddings_index.values())))

    # Mean/std for the random initialisation of rows without a vector
    # (presumably precomputed statistics of the GloVe vectors — TODO confirm).
    emb_mean, emb_std = -0.005838499, 0.48782197

    nb_words = min(max_vocab_size, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Fall back to the capitalised spelling.
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [22]:
# Build the embedding matrix for the tokenizer's vocabulary (reads the GloVe text file from disk).
embedding_matrix = load_glove(tokenizer.word_index)

In [26]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM text classifier on top of frozen pretrained embeddings.

    Token ids are embedded, run through one bidirectional LSTM layer, summarised
    by mean- and max-pooling over the sequence axis, and passed through a small
    dense head that produces one logit per class.

    Args:
        embedding_matrix: 2-D array ``(vocab_size, embed_size)`` used as the
            embedding weights; these weights are not trained.
        num_classes: number of output logits.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, embedding_matrix, num_classes: int, dropout: float):
        super().__init__()
        vocab_size, emb_dim = embedding_matrix.shape
        self.hidden_size = 64

        # Embedding table whose weights are replaced by the given matrix and frozen.
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained, requires_grad=False)

        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # 2 directions x 2 pooled summaries (mean, max) -> 4 * hidden_size features.
        self.linear = nn.Linear(4 * self.hidden_size, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences ``x``."""
        embedded = self.embedding(x)
        seq_out, _ = self.lstm(embedded)
        mean_feat = seq_out.mean(dim=1)
        max_feat, _ = seq_out.max(dim=1)
        pooled = torch.cat((mean_feat, max_feat), dim=1)
        hidden = self.dropout(self.relu(self.linear(pooled)))
        return self.out(hidden)

In [37]:
# Seed PyTorch's global RNG before building the model so that weight
# initialisation (and later torch-driven randomness) repeats across runs.
torch.manual_seed(SEED)

model = BiLSTM(embedding_matrix, num_classes=len(le.classes_), dropout=0.2)
# reduction='sum': the loss of a batch is the sum over its samples, not the mean.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Optimise only trainable parameters; the embedding weights are frozen.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)

In [38]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [39]:
# Move the model to the selected device; the cell displays the module structure.
model.to(device)

In [46]:
# Load train and test in CUDA Memory
X_train = torch.tensor(X_train, dtype=torch.long).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
X_val = torch.tensor(X_val, dtype=torch.long).to(device)
y_val = torch.tensor(y_val, dtype=torch.long).to(device)
X_test = torch.tensor(X_test, dtype=torch.long).to(device)

In [47]:
# Create Torch datasets
train = torch.utils.data.TensorDataset(X_train, y_train)
valid = torch.utils.data.TensorDataset(X_val, y_val)

In [48]:
# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=BATCH_SIZE, shuffle=False)

In [55]:
# Per-epoch average losses, consumed by the loss plot further down.
train_loss = []
valid_loss = []

n_epochs = 3  # Going higher keeps the notebook version from saving: there is no GPU and training takes too long.

for epoch in range(n_epochs):
    start_time = time.time()

    # ---- Training pass ----
    model.train()
    running_loss = 0.
    for x_batch, y_batch in train_loader:
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # loss_fn sums over the samples of a batch (reduction='sum'), so dividing the
    # epoch total by the number of samples gives a true per-sample average that
    # does not depend on the size of the last, possibly smaller, batch.
    avg_loss = running_loss / len(train_loader.dataset)

    # ---- Validation pass (no parameter updates) ----
    model.eval()
    running_val_loss = 0.
    val_preds = np.zeros((len(X_val), len(le.classes_)))
    offset = 0
    # no_grad avoids building the autograd graph for forward passes that are
    # never back-propagated.
    with torch.no_grad():
        for x_batch, y_batch in valid_loader:
            y_pred = model(x_batch)
            running_val_loss += loss_fn(y_pred, y_batch).item()
            # Store class probabilities; dim=1 normalises across the class axis.
            n_rows = len(x_batch)
            val_preds[offset:offset + n_rows] = F.softmax(y_pred, dim=1).cpu().numpy()
            offset += n_rows
    avg_val_loss = running_val_loss / len(valid_loader.dataset)

    # Check Accuracy: share of validation rows whose arg-max class equals the label.
    val_accuracy = float((val_preds.argmax(axis=1) == y_val.cpu().numpy()).mean())
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print(f'Epoch {epoch + 1}/{n_epochs} \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f}  \t val_acc={val_accuracy:.4f}  \t time={elapsed_time:.2f}s')

In [56]:
def plot_graph(epochs):
    """Plot the recorded training and validation loss against the epoch number.

    Reads the module-level ``train_loss`` and ``valid_loss`` lists.

    Args:
        epochs: number of epochs to put on the x axis (1..epochs); must equal
            the length of the two loss lists.
    """
    plt.figure(figsize=(12,12))
    plt.title("Train/Validation Loss")
    epoch_axis = list(np.arange(epochs) + 1)
    for series, label in ((train_loss, 'train'), (valid_loss, 'validation')):
        plt.plot(epoch_axis, series, label=label)
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(loc='best')

In [59]:
# Draw the loss curves for the epochs that were just trained.
plot_graph(n_epochs)

A bit of overfitting :(

In [62]:
# Map class ids back to the original labels and draw the validation confusion
# matrix. val_preds holds the probabilities from the last validation pass.
y_true = list(le.classes_[y_val.cpu().numpy()])
y_pred = list(le.classes_[val_preds.argmax(axis=1)])
skplt.metrics.plot_confusion_matrix(y_true, y_pred, figsize=(12,12), x_tick_rotation=90)

## Submission

In [65]:
# Load the sample submission as a template; its 'Sentiment' column is overwritten
# with the model's predictions below.
df_submission = pd.read_csv(DATA_PATH / 'sample_submission.csv')
df_submission.head(10)

In [68]:
# Predict class ids for the test set.
# Eval mode is set explicitly so dropout is off even if this cell is run on its
# own; no_grad skips autograd bookkeeping for this inference-only pass.
model.eval()
with torch.no_grad():
    pred = model(X_test)
# dim=1 normalises across the class axis (one probability row per sample).
pred = F.softmax(pred, dim=1).cpu().numpy()
pred = pred.argmax(axis=1)

In [69]:
# Map the predicted class ids back to the original labels via the fitted encoder.
df_submission['Sentiment'] = le.classes_[pred]
df_submission.head(10)

In [70]:
# Write the submission file to the current working directory, without the index column.
df_submission.to_csv('submission.csv', index=False)