In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read() call.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<URL-encoded download URL>" entries; the download
# loop splits on ',' and ':' and URL-decodes the second part.
# The URL is a signed storage link whose query string carries
# X-Goog-Date=20240721T105556Z and X-Goog-Expires=259200, so it is presumably only
# valid for about three days after that timestamp -- re-export the notebook to
# obtain a fresh link if the download fails.
DATA_SOURCE_MAPPING = 'gggsdsd:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5281104%2F8784850%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240721%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240721T105556Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D56425025cd3caf943a92aa8ac693d66d4c6df8e2a4461deb409d3bc320e93698217001bdaa3a78c681d84e89b378fab04257ccbaef6e8339d9a94c5e66cc32a99e08aecc6ed4bb82ee7603022ce670920b665809cc2712d4ef98513ed878d176747f3d80c40402ddb8d00f4d6c129dae513923d695411a3180443d6b2030e0843f8b029229381c731dfea74c837e2d091d29e4ec92ac57bde6e5e6d5b54e14325ca79616315d94e9938e87e071ca5340ee2fd6f847c4d2701ff7552416fa52f5ae3ccb0521b2148ef7950558fbf706b735df46cf0ec4de9c369c7448f98b2cd16bead18a95025b88dbe3c9b6ea8f14542094257955291ca2601d48d1e0055f6d'

# Root directories: datasets are extracted below the input path; the working
# path is only created here.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape: try to unmount /kaggle/input/, discarding any error output.
!umount /kaggle/input/ 2> /dev/null
# Remove whatever is left in the input directory (errors ignored), then
# (re)create the input and working directories.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Make the two directories reachable as ../input and ../working relative to the
# current working directory; if something already exists at that path it is
# left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded url>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it to KAGGLE_INPUT_PATH/<directory>. A failure for
# one entry is reported and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a literal ':' in the URL part cannot break
    # the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter is
            # shown instead of a progress bar.
            try:
                total_length = int(fileres.headers['content-length'])
            except (TypeError, ValueError):
                total_length = None
            size_text = total_length if total_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length:
                    # Clamp so a body longer than advertised cannot overflow the bar.
                    done = min(50, int(50 * dl / total_length))
                    progress = f"[{'=' * done}{' ' * (50-done)}] "
                else:
                    progress = ''
                sys.stdout.write(f"\r{progress}{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch re-opens the file by name, so buffered bytes must be
            # on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # shadow the module for every later iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gggsdsd/balanced.csv


In [None]:
import pandas as pd
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def hindi_data(data_file):
    """Read a UTF-8 CSV and return its text and label columns as lists.

    Args:
        data_file: Path or file-like object accepted by ``pd.read_csv``. The
            CSV must contain a ``texts`` column and a ``label`` column.

    Returns:
        tuple: ``(texts, labels)`` -- two lists of equal length, taken
        row by row from the ``texts`` and ``label`` columns.
    """
    frame = pd.read_csv(data_file, encoding='utf-8')
    return frame['texts'].tolist(), frame['label'].tolist()


In [None]:
# CSV extracted into the Kaggle input directory.
data_file = '/kaggle/input/gggsdsd/balanced.csv'
# Parallel lists read from the same rows: labels[i] belongs to texts[i].
texts, labels = hindi_data(data_file)

In [None]:
# Sanity check: report how many texts and labels were loaded.
print(f"Total texts: {len(texts)}, Total labels: {len(labels)}")

Total texts: 38688, Total labels: 38688


In [None]:
class CharacterTokenizer:
    """Character-level tokenizer that maps every distinct character to an integer id.

    Id 0 is reserved for the ``'<PAD>'`` entry; the characters found in the
    texts passed to the constructor receive ids ``1..N`` in sorted order.
    Characters missing from the vocabulary are encoded as id 0 as well.
    """

    def __init__(self, texts):
        """Build the vocabulary from all characters occurring in ``texts``.

        Args:
            texts: Iterable of strings whose characters form the vocabulary.
        """
        self.char_to_ix = {char: i+1 for i, char in enumerate(sorted(set(''.join(texts))))}
        self.char_to_ix['<PAD>'] = 0
        self.ix_to_char = {i: char for char, i in self.char_to_ix.items()}

    def __call__(self, text, max_length=128, padding='max_length', truncation=True):
        """Encode ``text`` into id and attention-mask tensors of shape ``(1, length)``.

        Args:
            text: String to encode, one id per character.
            max_length: Target sequence length for truncation and padding.
            padding: When equal to ``'max_length'`` the sequence is right-padded
                with id 0 up to ``max_length``; any other value disables padding.
            truncation: When true, characters beyond ``max_length`` are dropped.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` tensors. The mask is
            1 for positions holding a character of ``text`` and 0 for padding.
        """
        if truncation:
            text = text[:max_length]
        char_ids = [self.char_to_ix.get(char, 0) for char in text]
        # Build the mask before padding so pad positions are marked 0 rather
        # than being attended to like real characters.
        attention_mask = [1] * len(char_ids)
        if padding == 'max_length':
            # A non-positive count yields an empty list, leaving the lists as they are.
            pad_count = max_length - len(char_ids)
            char_ids = char_ids + [0] * pad_count
            attention_mask = attention_mask + [0] * pad_count
        return {
            'input_ids': torch.tensor([char_ids]),
            'attention_mask': torch.tensor([attention_mask])
        }

    def get_vocab_size(self):
        """Return the number of vocabulary entries, including ``'<PAD>'``."""
        return len(self.char_to_ix)

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that pairs raw texts with integer labels and tokenizes on access."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the samples and the tokenizer settings.

        Args:
            texts: Sequence of input strings.
            labels: Sequence of integer labels, parallel to ``texts``.
            tokenizer: Callable returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_length: Sequence length forwarded to the tokenizer.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return flattened tensors plus its label."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output carries a leading batch axis; flatten removes it.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a replacement embedding table and a linear head.

    The ``bert-base-uncased`` word-embedding layer is swapped for a freshly
    created ``nn.Embedding`` sized for the caller's vocabulary; the pooled
    encoder output goes through dropout and a linear layer to produce logits.
    """

    def __init__(self, vocab_size, hidden_size, num_classes):
        """Create the embedding table, load BERT and attach the classifier head.

        Args:
            vocab_size: Number of rows in the replacement embedding table.
            hidden_size: Embedding width. NOTE(review): presumably has to equal
                the encoder's ``config.hidden_size`` -- confirm.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Route token lookups through the new table instead of the pretrained one.
        self.bert.embeddings.word_embeddings = self.embedding
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of token ids and attention masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))


In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean cross-entropy loss over all batches, or ``0.0`` when
        ``data_loader`` yields no batch.
    """
    model.train()
    # The loss module holds no per-batch state, so build it once per epoch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Step the scheduler after the optimizer so the schedule advances per batch.
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0


In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` on the collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            # Index of the largest logit per row is the predicted class.
            best = torch.max(logits, dim=1).indices
            predicted += best.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
def prediction(text, model, tokenizer, device, max_length=128):
    """Classify a single text as ``"spoken"`` or ``"written"``.

    Args:
        text: Input string.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        tokenizer: Callable returning ``'input_ids'`` and ``'attention_mask'``
            tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Sequence length forwarded to the tokenizer.

    Returns:
        str: ``"spoken"`` when the predicted class index is 1, otherwise
        ``"written"``.
    """
    model.eval()
    encoded = tokenizer(text, max_length=max_length, padding='max_length', truncation=True)
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)
    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        predicted_class = torch.max(logits, dim=1).indices
    if predicted_class.item() == 1:
        return "spoken"
    return "written"

In [None]:
# Set up parameters
num_classes = 2        # size of the classifier output
max_length = 128       # characters kept per sample (longer texts are truncated)
batch_size = 16        # samples per DataLoader batch
num_epochs = 4         # full passes over the training set
learning_rate = 1e-5   # optimizer learning rate

# Seed torch's random number generator so weight initialisation, dropout and
# DataLoader shuffling repeat across "Restart & Run All" executions.
seed = 42
torch.manual_seed(seed)

In [None]:

# 60% of the samples go to training; the remaining 40% are held back.
train_texts, rem_texts, train_labels, rem_labels = train_test_split(texts, labels, train_size=0.6, random_state=42)
# The held-back part is halved: 20% validation and 20% test of the full data set.
val_texts, test_texts, val_labels, test_labels = train_test_split(rem_texts, rem_labels, test_size=0.5, random_state=42)

In [None]:
# Create character-level tokenizer
# NOTE(review): the vocabulary is built from all texts (train, validation and
# test), so no character is out-of-vocabulary at evaluation time.
tokenizer = CharacterTokenizer(texts)
# Includes the '<PAD>' entry; used as the size of the model's embedding table.
vocab_size = tokenizer.get_vocab_size()

In [None]:
# One dataset per split; all share the same tokenizer and sequence length.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length)

In [None]:
# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 768 is the width of the replacement embedding table.
# NOTE(review): presumably chosen to match the hidden size of bert-base-uncased -- confirm.
model = BERTClassifier(vocab_size, 768, num_classes).to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Use torch.optim.AdamW instead of the deprecated transformers.AdamW.
# weight_decay and eps are set explicitly to the defaults transformers.AdamW
# used, so the optimisation hyper-parameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0, eps=1e-6)
# train() steps the scheduler once per batch, so the schedule spans every batch
# of every epoch.
total_steps = len(train_dataloader) * num_epochs
# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
# Train for num_epochs epochs and report validation metrics after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    # Validation data is only used for monitoring here; nothing is selected on it.
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.9203
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3869
           1       0.91      0.93      0.92      3869

    accuracy                           0.92      7738
   macro avg       0.92      0.92      0.92      7738
weighted avg       0.92      0.92      0.92      7738

Epoch 2/4
Validation Accuracy: 0.9227
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3869
           1       0.90      0.95      0.92      3869

    accuracy                           0.92      7738
   macro avg       0.92      0.92      0.92      7738
weighted avg       0.92      0.92      0.92      7738

Epoch 3/4
Validation Accuracy: 0.9315
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3869
           1       0.95      0.91      0.93      3869

    accuracy                           0.93      7738
   macro avg  

In [None]:
# Final evaluation on the held-out test split with the model as it stands
# after the last training epoch.
test_accuracy, test_report = evaluate(model, test_dataloader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(test_report)

Test Accuracy: 0.9331
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      3847
           1       0.93      0.94      0.93      3891

    accuracy                           0.93      7738
   macro avg       0.93      0.93      0.93      7738
weighted avg       0.93      0.93      0.93      7738

