In [1]:
!pip install fasttext
!pip install scikit-learn



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import fasttext.util
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

In [3]:
from tqdm import tqdm

# Load FastText word embeddings
import fasttext.util
# fasttext.util.download_model('en', if_exists='ignore')  # English
# ft = fasttext.load_model('/content/new_hing_emb')

In [4]:

# Load CSV data
train_df = pd.read_csv('codemix-main/train.csv')
test_df = pd.read_csv('codemix-main/test.csv')
validation_df = pd.read_csv('codemix-main/valid.csv')
# shhuffle all three dataframes
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)
validation_df = validation_df.sample(frac=1).reset_index(drop=True)



In [5]:
train_df

Unnamed: 0.1,Unnamed: 0,labels,tweets
0,48222,0,ak arre bhai na to youtube pr or na hi ary zep...
1,74559,1,agar kismat achi hui toh ek hi mahine me natio...
2,13228,1,student teacher madam ap murghi hn kia teacher...
3,29284,1,tha great lmmfao it was
4,100990,0,ko itminan se k logo k liye kaam karne do othe...
...,...,...,...
114995,40488,0,mohan ye konse terrorist camps hain jo tabah k...
114996,100171,1,officially a nerd and proud of it lan party fo...
114997,47267,1,aaj ki taza khabar news channels se covid hua ...
114998,43552,0,nambiar thank godindia aur pakistan saath ek n...


In [6]:

# Extract text and labels
train_sentences = train_df['tweets'].tolist()[0:18000]
train_labels = train_df['labels'].tolist()[0:18000]

test_sentences = test_df['tweets'].tolist()[0:5000]
test_labels = test_df['labels'].tolist()[0:5000]

validation_sentences = validation_df['tweets'].tolist()[0:8000]
validation_labels = validation_df['labels'].tolist()[0:8000]


In [7]:
ft = fasttext.train_unsupervised('codemix-main/train.csv', model='skipgram', dim=300)

Read 2M words
Number of words:  22107
Number of labels: 0
Progress: 100.0% words/sec/thread:    4823 lr:  0.000000 avg.loss:  2.154565 ETA:   0h 0m 0s


In [8]:
# Convert sentences to FastText word embeddings
def sentence_to_embeddings(sentence):
    embeddings = [ft[word] if word in ft else ft['<unk>'] for word in sentence]
    return torch.tensor(embeddings, dtype=torch.float32)

train_indexed_sentences = [sentence_to_embeddings(sentence.split()) for sentence in tqdm(train_sentences)]
test_indexed_sentences = [sentence_to_embeddings(sentence.split()) for sentence in tqdm(test_sentences)]
validation_indexed_sentences = [sentence_to_embeddings(sentence.split()) for sentence in tqdm(validation_sentences)]

# Padding sequences to a fixed length
max_sequence_length = max(len(sentence) for sentence in train_indexed_sentences + test_indexed_sentences + validation_indexed_sentences)

train_padded_sequences = [torch.cat((sentence, torch.zeros(max_sequence_length - len(sentence), 300))) for sentence in train_indexed_sentences]
test_padded_sequences = [torch.cat((sentence, torch.zeros(max_sequence_length - len(sentence), 300))) for sentence in test_indexed_sentences]
validation_padded_sequences = [torch.cat((sentence, torch.zeros(max_sequence_length - len(sentence), 300))) for sentence in validation_indexed_sentences]

# Split data into training and testing sets
X_train = torch.stack(train_padded_sequences)
y_train = torch.tensor(train_labels, dtype=torch.float)

X_test = torch.stack(test_padded_sequences)
y_test = torch.tensor(test_labels, dtype=torch.float)

X_validation = torch.stack(validation_padded_sequences)
y_validation = torch.tensor(validation_labels, dtype=torch.float)


  return torch.tensor(embeddings, dtype=torch.float32)
100%|██████████| 18000/18000 [00:22<00:00, 789.24it/s]
100%|██████████| 5000/5000 [00:06<00:00, 813.29it/s]
100%|██████████| 8000/8000 [00:10<00:00, 744.83it/s]


In [9]:

# Create a PyTorch Dataset and DataLoader
class SarcasmDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

train_dataset = SarcasmDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = SarcasmDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

validation_dataset = SarcasmDataset(X_validation, y_validation)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=True)
# Build a BiLSTM model
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.bilstm = nn.LSTM(input_dim, hidden_dim, bidirectional=True, batch_first = True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        lstm_out, (hn,cn) = self.bilstm(text)
        out_tensor = torch.cat([t for t in hn], dim=1)
        return self.fc(out_tensor)

input_dim = 300  # FastText embedding dimension
hidden_dim = 128
output_dim = 1

model = BiLSTMModel(input_dim, hidden_dim, output_dim)

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [10]:
import torch
from tqdm import tqdm

# Training loop
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    
    for batch in iterator:
        text, labels = batch
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

# Evaluation
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, labels)
            predicted_class = torch.sigmoid(predictions) > 0.5
            correct += torch.sum(predicted_class == labels).item()
            total += len(predicted_class)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator), correct / total

N_EPOCHS = 18
best_validation_loss = float('inf')  # Initialize with a high value

for epoch in tqdm(range(N_EPOCHS)):
    training_loss = train(model, train_loader, optimizer, criterion)
    validation_loss, validation_accuracy = evaluate(model, validation_loader, criterion)
    print(f'Epoch {epoch+1}/{N_EPOCHS}')
    print(f'Training Loss: {training_loss:.4f}')
    print(f'Validation Loss: {validation_loss:.4f}')
    print(f'Validation Accuracy: {validation_accuracy:.4f}')
    
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        # Save the model when validation loss improves
        torch.save(model.state_dict(), 'base_model.pth')
    else:
        print("Validation loss increased. Stopping training.")
        break
    # torch.save(model.state_dict(), 'base_model.pth')


  6%|▌         | 1/18 [01:04<18:22, 64.86s/it]

Epoch 1/18
Training Loss: 0.4796
Validation Loss: 0.3983
Validation Accuracy: 0.8364


 11%|█         | 2/18 [02:06<16:44, 62.79s/it]

Epoch 2/18
Training Loss: 0.3585
Validation Loss: 0.3375
Validation Accuracy: 0.8650


 17%|█▋        | 3/18 [03:07<15:29, 61.98s/it]

Epoch 3/18
Training Loss: 0.3127
Validation Loss: 0.3232
Validation Accuracy: 0.8710


 22%|██▏       | 4/18 [04:09<14:27, 61.96s/it]

Epoch 4/18
Training Loss: 0.2908
Validation Loss: 0.3106
Validation Accuracy: 0.8752


 22%|██▏       | 4/18 [05:11<18:08, 77.78s/it]

Epoch 5/18
Training Loss: 0.2694
Validation Loss: 0.3205
Validation Accuracy: 0.8698
Validation loss increased. Stopping training.





In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the saved model
model.load_state_dict(torch.load('base_model.pth'))

# Evaluate on the validation set
validation_loss, validation_accuracy = evaluate(model, validation_loader, criterion)
print(f'Validation Loss: {validation_loss:.4f}')
print(f'Validation Accuracy: {validation_accuracy:.4f}')

# Get true labels and predicted labels
true_labels, predicted_labels = [], []
model.eval()

with torch.no_grad():
    for batch in test_loader:
        text, labels = batch
        predictions = model(text).squeeze(1)
        predicted_class = torch.sigmoid(predictions) > 0.5
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted_class.cpu().numpy())

# Calculate precision, recall, and F1 score
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Test Precision: {precision:.4f}')
print(f'Test Recall: {recall:.4f}')
print(f'Test F1 Score: {f1:.4f}')


Validation Loss: 0.3106
Validation Accuracy: 0.8752
Test Accuracy: 0.8722
Test Precision: 0.8750
Test Recall: 0.8726
Test F1 Score: 0.8720


: 