In [None]:
import time

import pandas as pd
import numpy as np

import torch
import torchtext as tt
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import Vocab

import typing
from typing import List
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training and validation word lists from the Kaggle input directory.
train_csv_path = '../input/hinglish-english/Langauge_Data.csv'
valid_csv_path = '../input/hinglish-english/testing_data_.csv'

df_train = pd.read_csv(train_csv_path)
df_valid = pd.read_csv(valid_csv_path)

In [None]:
# Inspect the raw validation frame before renaming its columns:
# info() prints the column dtypes and non-null counts, and head() (the cell's
# last expression) renders the first rows.
df_valid.info()
df_valid.head()

In [None]:
# Align the validation frame with the training schema ('Word' / 'Langauge')
# and discard the leftover CSV index column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises a KeyError.
df_valid = (
    df_valid
    .rename(columns={'word': 'Word', 'label': 'Langauge'})
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [None]:
# Confirm the renamed columns; preview the first rows instead of rendering the whole frame.
df_valid.head()

In [None]:
def tokenize(word):
    """Split `word` into a list of its individual characters."""
    return list(word)

def cleaner(word):
    """Normalize a raw word before tokenization.

    The value is cast to `str`, lower-cased, passed through `clear_punk`, and
    finally stripped of leading/trailing whitespace.
    """
    lowered = str(word).lower()
    # NOTE(review): `clear_punk` is not defined in the visible part of this
    # notebook (presumably it removes punctuation) — confirm it is defined
    # before this cell is used, otherwise this call raises NameError.
    return clear_punk(lowered).strip()

def build_vocab(data, min_fre, tokenizer):
    """Build a token vocabulary from an iterable of raw words.

    Args:
        data: iterable of words; each one is normalized with `cleaner`.
        min_fre: forwarded to `Vocab` as `min_freq`.
        tokenizer: callable that splits a cleaned word into tokens.

    Returns:
        A `Vocab` built from the token counts, with the special tokens
        '<unk>', '<pad>', '<sos>' and '<eos>'.
    """
    token_counts = Counter()
    for raw_word in data:
        token_counts.update(tokenizer(cleaner(raw_word)))

    special_tokens = ('<unk>', '<pad>', '<sos>', '<eos>')
    return Vocab(token_counts, min_freq=min_fre, specials=special_tokens)

In [None]:
# Build the character-level vocabulary from the training words only
# (tokenize splits each cleaned word into characters; min_fre is passed to Vocab as min_freq).
vocab = build_vocab(df_train['Word'], min_fre=1, tokenizer = tokenize)
# Display the token -> index mapping.
vocab.stoi

In [None]:
# Cast every training word to `str`, then report the longest word length
# (useful for sanity-checking the `maxlen` used when padding sequences).
df_train['Word'] = df_train['Word'].map(str)
df_train['Word'].map(len).max()

**Modelling**

In [None]:
class LangData(Dataset):
    """Character-level dataset of (token ids, label) pairs.

    Args:
        df: DataFrame with a 'Word' column (raw word) and a 'Langauge'
            column (integer class label).
        vocab: object exposing a `stoi` mapping from token to index.
        maxlen: fixed length of every returned id sequence; shorter words
            are right-padded and longer ones are truncated.
    """

    def __init__(self, df, vocab, maxlen=32):
        self.df = df
        self.vocab = vocab
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return `(ids, label)` for the row at position `idx`.

        `ids` is an int64 numpy array of length `maxlen`; `label` is an int.
        """
        # Use the vocabulary given to this dataset, not a notebook global.
        stoi = self.vocab.stoi
        # Fall back to the indices implied by the specials order used in
        # build_vocab ('<unk>' = 0, '<pad>' = 1) if the keys are missing.
        unk_idx = stoi.get('<unk>', 0)
        pad_idx = stoi.get('<pad>', 1)

        # Positional lookup: the DataLoader sampler yields 0..len-1, which only
        # matches label-based .loc when the frame has a default RangeIndex.
        word = cleaner(str(self.df['Word'].iloc[idx]))

        # .get avoids inserting unknown characters into a defaultdict-backed stoi.
        ids = [stoi.get(ch, unk_idx) for ch in tokenize(word)]
        # Truncate so every sample has exactly maxlen ids; over-long words
        # would otherwise yield ragged arrays that cannot be batched.
        ids = ids[:self.maxlen]
        ids += [pad_idx] * (self.maxlen - len(ids))

        label = int(self.df['Langauge'].iloc[idx])

        # Explicit int64 so batches are LongTensors on every platform.
        return np.array(ids, dtype=np.int64), label

In [None]:
class LangDetectNet(nn.Module):
    """Bidirectional-LSTM binary classifier over token-id sequences.

    Pipeline: embedding -> stacked bidirectional LSTM -> the top layer's final
    forward and backward hidden states concatenated -> Linear(2*hidden_dim, 5)
    -> ReLU -> Linear(5, 1) -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_layers: number of stacked LSTM layers.
        hidden_dim: hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, n_layers: int, hidden_dim: int):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=n_layers,
                            dropout=0,
                            bidirectional=True,
                            batch_first=True)
        self.fc1 = nn.Linear(2 * hidden_dim, 5)
        self.fc2 = nn.Linear(5, 1)
        self.relu = nn.ReLU()

        self.sigmoid = nn.Sigmoid()

    def forward(self, input_x: torch.Tensor) -> torch.Tensor:
        """Return per-sample probabilities of shape [batch_size, 1]."""
        # The LSTM is built with batch_first=True, so the batch dimension leads.
        # input_x: [batch_size, seq_len] integer token ids
        x = self.embedding(input_x)
        # x: [batch_size, seq_len, embedding_dim]
        _, (hidden, _) = self.LSTM(x)

        # hidden: [2 * n_layers, batch_size, hidden_dim] (batch_first does not
        # apply to the hidden state). The last two entries are the top layer's
        # forward and backward final states.
        hidden_last = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # hidden_last: [batch_size, 2 * hidden_dim]
        x = self.relu(self.fc1(hidden_last))
        x = self.fc2(x)
        x = self.sigmoid(x)

        return x

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Toggle for the tqdm progress bars shown by train() and evaluate().
Progress_Bar = True

In [None]:
def train(model, iterator, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module mapping a LongTensor batch of token ids to predictions;
            its output is flattened to 1-D before the loss is computed.
        iterator: sized iterable of `(x, y)` batches.
        optimizer: optimizer updating `model`'s parameters.
        criterion: loss called as `criterion(y_pred, y)` with float targets.
        device: device the batches are moved to.

    Returns:
        Sum of the batch losses divided by `len(iterator)`, as a Python float.

    Shows a tqdm progress bar when the module-level `Progress_Bar` flag is set.
    """
    epoch_loss = 0.0
    model.train()
    bar = tqdm(iterator) if Progress_Bar else iterator

    for x, y in bar:
        # as_tensor converts dtype and device in one step; unlike the legacy
        # torch.LongTensor(...) constructor it does not depend on the incoming
        # tensor already being an int64 CPU tensor.
        x = torch.as_tensor(x, dtype=torch.long, device=device)
        y = torch.as_tensor(y, dtype=torch.float, device=device)

        optimizer.zero_grad()
        # Flatten the model output to [batch_size] to match the targets.
        y_pred = torch.reshape(model(x), (-1,))
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()

        # .item() gives a plain Python float, detached from the graph.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if Progress_Bar:
            bar.set_description('Training loss: %.5f' % (batch_loss))

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion, device):
    """Evaluate `model` on `iterator` without updating its parameters.

    Args:
        model: module mapping a LongTensor batch of token ids to probabilities;
            its output is flattened to 1-D.
        iterator: sized iterable of `(x, y)` batches.
        criterion: loss called as `criterion(y_pred, y)` with float targets.
        device: device the batches are moved to.

    Returns:
        `(mean_loss, accuracy)`: the sum of the batch losses divided by
        `len(iterator)` as a Python float, and the accuracy of the
        predictions thresholded at 0.5.

    Shows a tqdm progress bar when the module-level `Progress_Bar` flag is set.
    """
    epoch_loss = 0.0
    # Collect per-batch arrays and join them once at the end; calling
    # np.append inside the loop re-copies everything gathered so far.
    pred_batches = []
    target_batches = []
    model.eval()
    bar = tqdm(iterator) if Progress_Bar else iterator

    with torch.no_grad():
        for x, y in bar:
            # Convert dtype and device in one step (no legacy LongTensor constructor).
            x = torch.as_tensor(x, dtype=torch.long, device=device)
            y = torch.as_tensor(y, dtype=torch.float, device=device)

            y_pred = torch.reshape(model(x), (-1,))
            loss = criterion(y_pred, y)
            batch_loss = loss.item()
            epoch_loss += batch_loss

            # Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
            pred_batches.append((y_pred > 0.5).cpu().numpy().astype(np.int64))
            target_batches.append(y.cpu().numpy())

            if Progress_Bar:
                bar.set_description('Validation loss: %.5f' % (batch_loss))

    # Computed before joining so an empty iterator fails on the division,
    # exactly where it would have failed before.
    mean_loss = epoch_loss / len(iterator)

    preds = np.concatenate(pred_batches)
    targets = np.concatenate(target_batches)

    # accuracy_score takes (y_true, y_pred).
    return mean_loss, accuracy_score(targets, preds)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        `(minutes, seconds)` as ints, each truncated toward zero.
    """
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds - 60 * minutes)
    return minutes, seconds


def fit_model(model, model_name, train_iterator, valid_iterator, optimizer, loss_criterion, device, epochs):
    """Train `model` for `epochs` epochs, checkpointing the best validation loss.

    Each epoch runs `train` followed by `evaluate`. Whenever the validation
    loss improves on the best seen so far, the model's state dict is saved to
    '<model_name>.pt'. A short summary is printed after every epoch.

    Returns:
        `(train_losses, valid_losses, valid_metric_scores)`: three lists with
        one entry per epoch.
    """
    train_history, valid_history, metric_history = [], [], []
    best_valid_loss = float('inf')

    for epoch in range(epochs):
        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, loss_criterion, device)
        valid_loss, valid_metric_score = evaluate(model, valid_iterator, loss_criterion, device)

        train_history.append(train_loss)
        valid_history.append(valid_loss)
        metric_history.append(valid_metric_score)

        # Keep only the weights of the best epoch (lowest validation loss) on disk.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'{model_name}.pt')

        # The timing window includes the checkpoint write.
        epoch_mins, epoch_secs = epoch_time(start_time, time.time())

        print(f'Epoch: {epoch+1: 2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'Train Loss: {train_loss:.3f}')
        print(f'Val. Loss: {valid_loss:.3f} |  Val. Metric Score: {valid_metric_score:.3f}')

    return train_history, valid_history, metric_history

In [None]:
# Seed PyTorch so weight initialisation and batch shuffling are repeatable
# across Restart & Run All.
torch.manual_seed(42)

train_data = LangData(df_train, vocab)
train_loader = DataLoader(train_data, shuffle=True, batch_size=64, num_workers=2)

valid_data = LangData(df_valid, vocab)
# Validation loss and accuracy do not depend on sample order, so the
# validation loader is left unshuffled.
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=64, num_workers=2)

# len(vocab) is used rather than len(vocab.stoi): if stoi is a defaultdict
# (as in legacy torchtext — confirm for the installed version), looking up an
# unseen character adds a key and would inflate the embedding table on re-runs.
model = LangDetectNet(vocab_size=len(vocab),
                      embedding_dim=3,
                      hidden_dim=5,
                      n_layers=2).to(device)

# The network ends in a sigmoid, so plain binary cross-entropy on probabilities applies.
loss_criterion = nn.BCELoss()

opt = optim.Adam(model.parameters(), lr=1e-2, betas=(0.9, 0.999))

# Keep the per-epoch history in named variables instead of dumping the raw
# lists as cell output; fit_model already prints a summary for every epoch.
train_losses, valid_losses, valid_metric_scores = fit_model(
    model, 'LangaugeDetection', train_loader, valid_loader,
    opt, loss_criterion, device, epochs=7)