In [19]:
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [20]:
# Load the training table (one text column per language code, see the summary below).
# Forward slashes resolve on Windows as well as POSIX systems; the previous
# backslash-separated literal was only a valid path on Windows.
df = pd.read_csv('icdc/train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ben     1700 non-null   object
 1   guj     1700 non-null   object
 2   hin     1700 non-null   object
 3   kan     1700 non-null   object
 4   mal     1700 non-null   object
 5   ori     1700 non-null   object
 6   pan     1700 non-null   object
 7   tam     1700 non-null   object
 8   tel     1700 non-null   object
 9   urd     1700 non-null   object
 10  eng     1700 non-null   object
dtypes: object(11)
memory usage: 146.2+ KB


In [21]:
# Build one long, cleaned, lower-cased string holding every cell of every row;
# it is only used to discover which characters occur in the corpus.
# The symbols below are deleted in this exact order. The last three ('-', '*', '^')
# are the characters later reserved as padding / start / end markers.
_chars_to_strip = ('–', '$', '&', '[', ']', '“', '”', '=', '৷', '`', 'ؑ', '}', '-', '*', '^')

# Collect per-row chunks and join once at the end instead of growing a string
# with `+=` on every iteration.
_row_chunks = []
for i in tqdm(range(len(df))):
    _row_text = ''.join(df.iloc[i]).lower()
    for _symbol in _chars_to_strip:
        _row_text = _row_text.replace(_symbol, '')
    _row_chunks.append(_row_text)
allTexts = ''.join(_row_chunks)

100%|██████████| 1700/1700 [00:00<00:00, 2934.49it/s]


In [22]:
# Count how often each character occurs in the cleaned corpus.
# (Use `hinglish_res.most_common()` to inspect the counts by frequency.)
hinglish_res = Counter(allTexts)
# (character, count) pairs in first-seen order.
charsVocab = list(hinglish_res.items())

In [23]:
# Reserved marker characters: padding filler, sequence start, sequence end.
PAD_NULL, PAD_START, PAD_END = '-', '*', '^'

# The three markers occupy the first vocabulary slots, followed by every
# corpus character in first-seen order.
vocab = [PAD_NULL, PAD_START, PAD_END]
vocab.extend(char for char, _count in charsVocab)

# Integer id used as the padding value when batching sequences.
IDX_PAD_NULL = vocab.index(PAD_NULL)

len(vocab), IDX_PAD_NULL

(63, 0)

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau#, StepLR, ExponentialLR
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader, Subset, random_split


import numpy as np
import random
import os

In [7]:
def add_extraToken(texts, startToken=True, endToken=True):
    """Wrap every string in the start and/or end marker.

    Args:
        texts: list of strings to wrap.
        startToken: when truthy, ``PAD_START`` is prepended to each string.
        endToken: when truthy, ``PAD_END`` is appended to each string.

    Returns:
        A new list of wrapped strings, or ``texts`` itself (not a copy) when
        both flags are falsy.
    """
    if not (startToken or endToken):
        return texts
    prefix = PAD_START if startToken else ''
    suffix = PAD_END if endToken else ''
    return [prefix + text + suffix for text in texts]

def remove_extraToken(texts:list[str])->list[str]:
    """Lower-case each string and delete unwanted symbols and reserved markers.

    Despite the name, this strips more than the start/end/pad markers: a fixed
    set of punctuation symbols is removed as well.

    Args:
        texts: strings to clean.

    Returns:
        A new list with one cleaned string per input string.
    """
    # Deleted one after another, in this order.
    unwanted = ('–', '$', '&', '[', ']', '“', '”', '=', '৷', '`', 'ؑ', '}',
                PAD_START, PAD_END, PAD_NULL)
    cleaned = []
    for text in texts:
        text = text.lower()
        for symbol in unwanted:
            text = text.replace(symbol, '')
        cleaned.append(text)
    return cleaned

def preprocesser(texts: list[str], prePadding=False, startToken=True, endToken=True, batch_first=False):
    """Clean, tokenize and pad a list of strings into one integer tensor.

    Each string is cleaned with ``remove_extraToken``, optionally wrapped with
    the start/end markers, mapped character by character to its index in
    ``vocab`` (characters missing from ``vocab`` are silently dropped) and
    padded with ``IDX_PAD_NULL`` to the longest sequence in the call.

    Args:
        texts: non-empty list of strings (the padding step raises on an empty list).
        prePadding: pad on the left when True, on the right otherwise.
        startToken: prepend ``PAD_START`` to every string.
        endToken: append ``PAD_END`` to every string.
        batch_first: return ``(batch, max_len)`` when True, otherwise the
            transpose ``(max_len, batch)``.

    Returns:
        A 2-D int64 tensor of vocabulary indices.
    """
    texts = add_extraToken(remove_extraToken(texts), startToken, endToken)

    # One dict lookup per character instead of two linear scans of `vocab`
    # (`c in vocab` followed by `vocab.index(c)`). Filling the dict in reverse
    # keeps the FIRST index of any duplicated entry, exactly like `list.index`.
    char_to_idx = {char: idx for idx, char in reversed(list(enumerate(vocab)))}
    sequences = [
        torch.tensor([char_to_idx[c] for c in text if c in char_to_idx], dtype=torch.int64)
        for text in texts
    ]

    if prePadding:
        # Left-pad every sequence to the common length; pad_sequence below then
        # only stacks them.
        max_length = max(len(seq) for seq in sequences)
        sequences = [
            torch.cat([torch.full((max_length - len(seq),), IDX_PAD_NULL, dtype=torch.int64), seq])
            for seq in sequences
        ]

    padded_seqs = pad_sequence(sequences, batch_first=True, padding_value=IDX_PAD_NULL)
    return padded_seqs if batch_first else padded_seqs.T


preprocesser(['hiir', 'laksfffh'], startToken=True, endToken=False)

tensor([[ 1,  1],
        [14, 19],
        [ 4, 13],
        [ 4, 17],
        [12,  3],
        [ 0, 37],
        [ 0, 37],
        [ 0, 37],
        [ 0, 14]])

In [25]:
class CustomDataset(Dataset):
    """Pre-batched, length-sorted dataset built from the global ``df``.

    Every cell of ``df`` becomes one ``(text, label)`` sample, where the label
    is the position of the cell's column in ``df.columns``. Samples are sorted
    by cleaned text length and cut into consecutive chunks of ``batch_size``,
    so sequences inside one batch have similar lengths and need little padding.

    Each dataset item is therefore a whole, already-collated batch. Wrap the
    dataset in a ``DataLoader`` with ``batch_size=1`` and squeeze the extra
    leading dimension it adds.
    """
    def __init__(self, batch_size=64):
        dataset = []

        for y, col in enumerate(df.columns):
            # Reuse the shared cleaning helper instead of a private copy of the
            # same replace chain, so the two cannot drift apart.
            cleaned = remove_extraToken(df[col].tolist())
            dataset.extend((text, y) for text in cleaned)

        # Stable sort: samples of equal length keep their column/row order.
        dataset.sort(key=lambda sample: len(sample[0]))

        self.batched = [
            self.custom_collate_fn(dataset[start:start + batch_size])
            for start in range(0, len(dataset), batch_size)
        ]

    def custom_collate_fn(self, batch):
        """Turn a list of ``(text, label)`` pairs into ``(tokens, one_hot_labels)``.

        ``tokens`` is the output of ``preprocesser`` with its defaults;
        ``one_hot_labels`` is a float32 tensor with one row per sample and one
        column per column of ``df``.
        """
        texts = [text for text, _ in batch]
        labels = [label for _, label in batch]
        # Class count follows the table instead of a hard-coded 11.
        one_hot = F.one_hot(torch.tensor(labels), num_classes=len(df.columns))
        return preprocesser(texts), one_hot.to(torch.float32)

    def __len__(self):
        # Number of pre-built batches, not number of individual samples.
        return len(self.batched)

    def __getitem__(self, idx):
        # Return one pre-collated batch: (token tensor, one-hot label tensor).
        return self.batched[idx]

# Smoke test: the dataset already groups samples into batches of 64, so the
# DataLoader only shuffles whole batches (its own batch_size stays at 1).
custom_dataset = CustomDataset(batch_size=64)
data_loader = DataLoader(custom_dataset, batch_size=1, shuffle=True)

# Pull a single batch and drop the extra leading dimension the loader adds.
batch = next(iter(data_loader), None)
if batch is not None:
    sequences, labels = batch
    sequences.squeeze_(0)
    labels.squeeze_(0)

In [26]:
class Encoder(nn.Module):
    """LSTM sequence classifier: embedding -> dropout -> LSTM -> linear head.

    Args:
        embedding_dim: size of each character embedding.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table.
        p: dropout probability, applied to the embeddings and passed to the LSTM.
        num_classes: number of output logits.
    """
    def __init__(self, embedding_dim, hidden_size, num_layers, vocab_size, p=0, num_classes=11):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # Attribute names double as checkpoint keys; keep them stable.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
            bidirectional=False,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map token indices of shape (seq_len, batch) to logits of shape (batch, num_classes)."""
        embedded = self.embedding(x)            # (seq_len, batch, embedding_dim)
        embedded = self.dropout(embedded)
        outputs, _state = self.lstm(embedded)   # outputs: (seq_len, batch, hidden_size)
        # Classify from the top-layer output at the final time step.
        last_step = outputs[-1]
        return self.fc(last_step)


# Create an LSTM model
# model = Encoder(50, 128, 2, vocab_size=len(vocab)).to(DEVICE)
# x = sequences
# y = labels
# print(x.shape)
# model(x).shape

In [27]:
# --- Hyperparameters and run configuration ---

# Model shape
EMBEDDING_SIZE = 50   # character embedding width
HIDDEN_SIZE = 128     # LSTM hidden state width
NUM_LAYERS = 2        # stacked LSTM layers
P = 0.5               # dropout probability

# Optimisation
LR = 1e-3             # optimizer learning rate
EPOCHS = 100          # passes over the training batches
BATCH_SIZE = 64       # samples per pre-built batch

# Data split
TRAIN_SIZE = 0.8      # train fraction for the train/test split

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [28]:
import time, math

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: a ``time.time()`` timestamp taken earlier.

    Returns:
        A string with whole minutes and the remaining whole seconds.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def accuracy(model, data_loader):
    """Return the percentage of correctly classified samples.

    Args:
        model: module mapping a ``(seq_len, batch)`` index tensor to
            ``(batch, num_classes)`` scores.
        data_loader: iterable of ``(sequences, labels)`` pairs, each carrying an
            extra leading dimension of size 1 (removed here with ``squeeze(0)``);
            ``labels`` are one-hot rows.

    Returns:
        Accuracy in percent as a float; ``0.0`` when the loader yields no samples.
    """
    # Run on whatever device the model lives on instead of a global constant,
    # so CPU-resident models can be evaluated even when a GPU is available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Remember the current mode so evaluation does not silently flip an
    # eval-mode model back to training mode.
    was_training = model.training
    model.eval()

    correct = 0
    total = 0
    try:
        # Inference only: skip building autograd graphs.
        with torch.no_grad():
            for (sequences, labels) in data_loader:
                sequences = sequences.squeeze(0).to(device)
                labels = labels.squeeze(0).to(device).argmax(dim=1)
                predicted = model(sequences).argmax(dim=1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train(was_training)

    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    return 100 * correct / total

In [29]:
# Pre-batch the corpus: every dataset item is one collated batch of BATCH_SIZE samples.
custom_dataset = CustomDataset(BATCH_SIZE)

# Split at batch granularity, using the configured fraction rather than a
# repeated literal.
train_size = int(TRAIN_SIZE * len(custom_dataset))
test_size = len(custom_dataset) - train_size

# A fixed seed keeps the train/test membership identical across kernel
# restarts, so test numbers stay comparable from run to run.
train_dataset, test_dataset = random_split(
    custom_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# The loaders only (optionally) shuffle whole batches; batch_size stays 1
# because the batching already happened inside the dataset.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

model = Encoder(EMBEDDING_SIZE, HIDDEN_SIZE, NUM_LAYERS, vocab_size=len(vocab), p=P, num_classes=11).to(device=DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# Lower the learning rate when the monitored loss stops improving for 10 epochs.
scheduler = ReduceLROnPlateau(optimizer, patience=10)
criterion = nn.CrossEntropyLoss()

In [30]:
def _batch_loss(batch):
    """Move one pre-collated batch to DEVICE and return the criterion loss on it."""
    sequences, labels = batch
    # Drop the leading dimension of size 1 added by the DataLoader.
    sequences = sequences.squeeze(0).to(DEVICE)
    labels = labels.squeeze(0).to(DEVICE)
    return criterion(model(sequences), labels)


model.train()
start = time.time()
for epoch in range(EPOCHS):
    # Training phase: one optimizer step per batch, losses summed over the epoch.
    model.train()
    total_loss = 0
    for batch in train_loader:
        loss = _batch_loss(batch)
        total_loss += loss.item()

        model.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation phase: summed loss over the held-out batches, no gradients.
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            valid_loss += _batch_loss(batch).item()

    # The test loss is rescaled by the train/test batch-count ratio so the two
    # summed losses are printed on a comparable scale.
    print('[{}] Train Epoch: [{}/{}] \tLoss: {:.2f} Test Loss: {:.2f}'.format(
            time_since(start), epoch, EPOCHS,
            total_loss, valid_loss*len(train_loader)/len(test_loader)))

    scheduler.step(valid_loss)

[0m 1s] Train Epoch: [0/100] 	Loss: 420.80 Test Loss: 302.68
[0m 3s] Train Epoch: [1/100] 	Loss: 296.14 Test Loss: 229.76
[0m 5s] Train Epoch: [2/100] 	Loss: 233.17 Test Loss: 198.54
[0m 6s] Train Epoch: [3/100] 	Loss: 203.94 Test Loss: 155.61
[0m 7s] Train Epoch: [4/100] 	Loss: 174.66 Test Loss: 149.46
[0m 9s] Train Epoch: [5/100] 	Loss: 156.74 Test Loss: 126.70
[0m 10s] Train Epoch: [6/100] 	Loss: 147.91 Test Loss: 150.09
[0m 12s] Train Epoch: [7/100] 	Loss: 147.43 Test Loss: 118.94
[0m 14s] Train Epoch: [8/100] 	Loss: 125.21 Test Loss: 110.80
[0m 15s] Train Epoch: [9/100] 	Loss: 109.31 Test Loss: 100.82
[0m 17s] Train Epoch: [10/100] 	Loss: 104.15 Test Loss: 96.16
[0m 18s] Train Epoch: [11/100] 	Loss: 93.10 Test Loss: 80.57
[0m 20s] Train Epoch: [12/100] 	Loss: 84.94 Test Loss: 81.97
[0m 22s] Train Epoch: [13/100] 	Loss: 83.85 Test Loss: 77.63
[0m 23s] Train Epoch: [14/100] 	Loss: 76.23 Test Loss: 65.69
[0m 25s] Train Epoch: [15/100] 	Loss: 72.83 Test Loss: 68.07
[0m 26s] Train Epoc

In [31]:
# Final accuracy on the training batches, then on the held-out batches.
train_acc = accuracy(model, train_loader)
print('Train Accuracy: {:.2f}%'.format(train_acc))
test_acc = accuracy(model, test_loader)
print('Test Accuracy: {:.2f}%'.format(test_acc))

Train Accuracy: 99.95%
Test Accuracy: 96.19%


In [17]:
# Save the trained weights (state_dict only).
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs('models_icdc', exist_ok=True)
torch.save(model.state_dict(), 'models_icdc/last.model.pth')

# Reload into a fresh model built from the same configuration that built `model`.
# A hard-coded vocab_size goes stale as soon as the cleaning rules change the
# vocabulary, and load_state_dict then fails with an embedding size mismatch.
loaded_model = Encoder(EMBEDDING_SIZE, HIDDEN_SIZE, NUM_LAYERS, vocab_size=len(vocab), p=P, num_classes=11)
# map_location keeps the load working on machines without the GPU the weights
# were saved from; `loaded_model` itself stays on the CPU.
loaded_model.load_state_dict(torch.load('models_icdc/last.model.pth', map_location='cpu'))

<All keys matched successfully>

In [32]:
def inference(model, texts:list[str]|str):
    """Predict the language code of one text or of each text in a list.

    Args:
        model: classifier mapping a ``(seq_len, batch)`` index tensor to
            ``(batch, 11)`` scores.
        texts: a single string or a list of strings.

    Returns:
        A list with one language code per input text (``[]`` for an empty list).
    """
    if isinstance(texts, str): texts = [texts]
    # Class index -> language code.
    languages = ['ben', 'guj', 'hin', 'kan', 'mal', 'ori', 'pan', 'tam', 'tel', 'urd', 'eng']

    model.eval()
    # Feed the model on the device it lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    results = []
    with torch.no_grad():
        # Score every text as its own batch of one. The model classifies from
        # the last time step, so padding a short text up to the longest text
        # in a mixed-length batch makes that last step a run of PAD tokens
        # and can change the prediction. Unpadded inputs make the result
        # independent of what else is in `texts`.
        for text in texts:
            scores = model(preprocesser([text]).to(device))
            results.append(languages[scores.argmax(dim=1).item()])
    return results
inference(loaded_model, 'alute masala makhie, fetano basena chubie nie dubo tele bhaja yatakshan na bhalo kare bhaja hachche, tiri kara has maharashtrer ei suswadu o janapriya khavarer pad.')

['ben']

In [333]:
# Predict every submission text on its own, one call per text.
# Forward slashes resolve on Windows as well as POSIX systems; the previous
# backslash-separated literal was only a valid path on Windows.
submission_texts = pd.read_csv('icdc/language classification/classification-submission.csv').text
mine = [(text, inference(loaded_model, text)[0]) for text in submission_texts]

In [334]:
# Predict all texts in a single call and attach the result as a third field.
# The texts are taken from `mine` (first field) rather than re-reading the CSV.
blang = inference(loaded_model, [row[0] for row in mine])
# Index instead of unpacking so the cell can be re-run: a second run simply
# replaces the third field instead of failing on rows that already have one.
mine = [(row[0], row[1], batched) for row, batched in zip(mine, blang)]

In [335]:
# Tabulate the per-text and the batched predictions side by side.
o = pd.DataFrame(
    mine,
    columns=['text', 'lang', 'langbatched'],
)

In [341]:
# Export the comparison table. `bothequal` is computed at write time so the
# file contains it whether or not the cell that adds the column to `o` has
# run yet; `o` itself is left untouched.
# Forward slashes resolve on Windows as well as POSIX systems.
o.assign(bothequal=o.lang == o.langbatched).to_csv(
    'icdc/language classification/classification-submission-mine.csv', index=False)

In [340]:
# Flag the rows where the per-text and the batched prediction agree.
o['bothequal'] = o['lang'].eq(o['langbatched'])