TAMIL using *bert-base-multilingual-cased*

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization,Bidirectional,Lambda,LSTM,Embedding,Conv1D,MaxPooling1D,GlobalMaxPooling1D,LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting advertools
  Downloading advertools-0.16.4-py2.py3-none-any.whl.metadata (15 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting scrapy>=2.5.0 (from advertools)
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting twython>=3.8.0 (from advertools)
  Downloading twython-3.9.1-py3-none-any.whl.metadata (20 kB)
Collecting Twisted>=21.7.0 (from scrapy>=2.5.0->advertools)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy>=2.5.0->advertools)
  Downloading cssselect-1.2.0-py2.py3-no

In [None]:
# Run BERT on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
def load_dataset(base_dir='/content', lang='tamil'):
    """Read the Tamil text training sheet into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TA-AT-train.xlsx`` and
    must contain the columns ``Transcript`` and ``Class Label Short``.

    Args:
        base_dir: Directory that holds the per-language folders.
        lang: Name of the language sub-folder.

    Returns:
        pandas.DataFrame with one row per sheet row and the columns
        ``audio_path`` (always "Nil"), ``transcript``, ``class_label`` and
        ``gender`` (always "Unknown").
    """
    def _as_record(entry):
        # Text-only data: audio path and gender are fixed placeholders.
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    workbook = os.path.join(base_dir, lang, "text", "TA-AT-train.xlsx")
    sheet = pd.read_excel(workbook)
    return pd.DataFrame([_as_record(entry) for _, entry in sheet.iterrows()])
# Load the Tamil training sheet; the arguments spell out the function defaults.
dataset_df = load_dataset(base_dir='/content', lang='tamil')


Using device: cpu


In [None]:
# Inspect the loaded Tamil training frame.
dataset_df

Unnamed: 0,audio_path,transcript,class_label,gender
0,Nil,உருவத்தை வச்ச ஒருத்தன் கிண்டல் பண்றான் பாருங்க...,C,Unknown
1,Nil,காமெடி பண்ண சொன்னா ஒருத்தன உருவ கேலி பண்ணிட்டு...,C,Unknown
2,Nil,இந்த உருவத்தை வைத்து கிண்டல் கேலி பண்ணி சிரிக்...,C,Unknown
3,Nil,புரிஞ்சுக்கணும் மேடையில் ஒரு நாகரிகம்னு ஒன்னு ...,C,Unknown
4,Nil,என்னா மல மல அண்ணாமலை இது உலகத்தோட ஸ்டைலு உட்கா...,C,Unknown
...,...,...,...,...
509,Nil,கேவலப்படுத்த அசிங்கப்படுது அவமானப்படுறது இதெல்...,C,Unknown
510,Nil,ஏலே உன் மூஞ்சி மொகரையும் பாரு இல்லையே குற்றாலத...,C,Unknown
511,Nil,மிகப்பெரிய தவறு விஜய் பனி இருக்கான் நிச்சயம் ...,C,Unknown
512,Nil,ஏண்டா அயோக்கிய ராஸ்கல் நீ தூக்கிட்டு போவியா டா...,C,Unknown


In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Tamil stopwords shipped with advertools, held as a sorted list.
stopwords = sorted(adv.stopwords['tamil'])


def preprocess_tamil_text(text):
    """Normalize, tokenize and stopword-filter one Tamil string.

    Args:
        text: Raw Tamil transcript.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("ta").normalize(text)
    kept = (
        word
        for word in indic_tokenize.trivial_tokenize(normalized, lang="ta")
        # `stopwords` is read from the module namespace at call time.
        if word not in stopwords
    )
    return ' '.join(kept)
# Fit the label encoder on the short class labels and store the integer ids.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])

# Clean every transcript with the Tamil preprocessing function.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].map(preprocess_tamil_text)

# Persist the fitted encoder so predictions can be mapped back to label names.
label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, file=f)
print(f"Label encoder saved to {label_encoder_path}")

Label encoder saved to tamil_label_encoder.pkl


In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'],
    dataset_df['encoded_label'],
    random_state=42,
    test_size=0.2,
)
def extract_embeddings(model_name, texts):
    """Embed each text as the mean of the encoder's last-layer token vectors.

    Args:
        model_name: Hugging Face model id handed to ``from_pretrained``.
        texts: List of strings to embed.

    Returns:
        numpy array built from one pooled vector per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name).to(device)
    encoder.eval()
    chunk = 16  # texts per forward pass
    pooled_rows = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk):
            tokens = tokenizer(
                texts[start:start + chunk],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt",
            )
            tokens = {name: values.to(device) for name, values in tokens.items()}
            hidden = encoder(**tokens).last_hidden_state
            # NOTE(review): plain mean over the token axis; no attention mask is
            # applied, so padded positions are part of the average.
            pooled_rows.extend(hidden.mean(dim=1).cpu().numpy())
    return np.array(pooled_rows)


# Embed the training split first, then the held-out split.
X_train_embeddings, X_test_embeddings = (
    extract_embeddings("bert-base-multilingual-cased", split.tolist())
    for split in (X_train, X_test)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode with an explicit class count so both label matrices always
# have one column per known class, even if the held-out split misses a class.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Small dense classifier on top of the precomputed BERT embeddings.
model = Sequential([
    Dense(256, input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the held-out split is also passed as validation data, so the
# "test" numbers below come from data that was monitored during training.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=35, batch_size=32)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Pin `labels` to every class id so `target_names` always has a matching length,
# even when a class appears in neither y_test nor the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.2157 - loss: 2.5983 - val_accuracy: 0.5534 - val_loss: 1.2992
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.3703 - loss: 1.9612 - val_accuracy: 0.5340 - val_loss: 1.2921
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.4463 - loss: 1.7244 - val_accuracy: 0.6214 - val_loss: 1.2317
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.5070 - loss: 1.5253 - val_accuracy: 0.5922 - val_loss: 1.1970
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.5202 - loss: 1.5275 - val_accuracy: 0.5534 - val_loss: 1.2473
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.5399 - loss: 1.4036 - val_accuracy: 0.5534 - val_loss: 1.1937
Epoch 7/35
[1m13/13[0m [32m━━━━

In [None]:
from tensorflow.keras.models import load_model
# Persist the trained classifier to an .h5 file.
model.save("tamil_classification_model.h5")
print("Model saved as 'tamil_classification_model.h5'")


def test_model(input_text, model_path="tamil_classification_model.h5"):
    """Classify one Tamil sentence with the saved classifier.

    The model is reloaded from ``model_path`` on every call; the text is cleaned
    with ``preprocess_tamil_text`` and embedded with ``extract_embeddings``.

    Args:
        input_text: Raw Tamil sentence.
        model_path: Path of the saved Keras model.

    Returns:
        The predicted class label, decoded with the global ``label_encoder``.
    """
    classifier = load_model(model_path)
    print("Model loaded successfully.")
    features = extract_embeddings(
        "bert-base-multilingual-cased",
        [preprocess_tamil_text(input_text)],
    )
    scores = classifier.predict(features)
    best_index = np.argmax(scores)
    predicted_label = label_encoder.inverse_transform([best_index])[0]
    print(f"Input Text: {input_text}")
    print(f"Predicted Label: {predicted_label}")
    return predicted_label


# Smoke-test the saved model on one hand-written sentence.
new_text = "மற்ற மதங்களை ஆதரிக்கிறவர்கள் எல்லாம் துரோகிகள்."
predicted_label = test_model(new_text)



Model saved as 'tamil_classification_model.h5'




Model loaded successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
Input Text: மற்ற மதங்களை ஆதரிக்கிறவர்கள் எல்லாம் துரோகிகள்.
Predicted Label: R


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Pick the GPU when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)
stopwords = list(sorted(adv.stopwords['tamil']))
def preprocess_tamil_text(text):
    """Preprocess Tamil text with the same steps used on the training data.

    Steps: Indic NLP Tamil normalization, trivial Indic tokenization, then
    removal of tokens found in the module-level ``stopwords`` list. The
    normalization step matters: the classifier was trained on normalized
    transcripts, so skipping it here would feed it differently cleaned text.

    Args:
        text: Raw Tamil transcript.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalizer = IndicNormalizerFactory().get_normalizer("ta")
    text = normalizer.normalize(text)
    tokens = indic_tokenize.trivial_tokenize(text, lang="ta")
    return ' '.join(token for token in tokens if token not in stopwords)
def extract_embeddings(model_name, texts):
    """Embed each text as the mean of the encoder's last-layer token vectors.

    The encoder and every input batch are moved to the module-level ``device``
    so that a GPU is used when one was detected, and results are copied back to
    the CPU before conversion to numpy.

    Args:
        model_name: Hugging Face model id handed to ``from_pretrained``.
        texts: List of strings to embed.

    Returns:
        numpy array built from one pooled vector per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    batch_size = 16
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            # Inputs must live on the same device as the model weights.
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            # .cpu() is required before .numpy() when the tensors are on a GPU.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Locations of the test sheet, the trained artifacts and the output workbook.
test_file_path = '/content/TA-AT-test.xlsx'
trained_model_path = "/content/tamil_classification_model.h5"
label_encoder_path = "/content/tamil_label_encoder.pkl"
output_file_path = './TA-AT-test-predictions.xlsx'
bert_model_name = "bert-base-multilingual-cased"

# Restore the classifier and the label encoder.
# NOTE(review): earlier cells save both artifacts under relative names, so these
# absolute paths presumably rely on the working directory being /content.
trained_model = load_model(trained_model_path)
with open(label_encoder_path, "rb") as f:
    # Unpickling can execute arbitrary code; only load files you produced yourself.
    label_encoder = pickle.load(f)

# Clean and embed the test transcripts.
test_data = pd.read_excel(test_file_path)
test_data['cleaned_transcript'] = test_data['Transcript'].map(preprocess_tamil_text)
test_embeddings = extract_embeddings(bert_model_name, test_data['cleaned_transcript'].tolist())

# Predict, decode the class ids to label names and write the result.
predictions = trained_model.predict(test_embeddings)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))
test_data['Predicted_Class'] = predicted_labels
test_data.to_excel(output_file_path, index=False)
print(f"Predictions saved to: {output_file_path}")

Device: cpu




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Predictions saved to: ./TA-AT-test-predictions.xlsx


TELUGU using *bert-base-multilingual-cased*

In [None]:
# Run BERT on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
def load_dataset(base_dir='/content', lang='tel'):
    """Read the Telugu text training sheet into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TE-AT-train.xlsx`` and
    must contain the columns ``Transcript`` and ``Class Label Short``.

    Args:
        base_dir: Directory that holds the per-language folders.
        lang: Name of the language sub-folder.

    Returns:
        pandas.DataFrame with one row per sheet row and the columns
        ``audio_path`` (always "Nil"), ``transcript``, ``class_label`` and
        ``gender`` (always "Unknown").
    """
    def _as_record(entry):
        # Text-only data: audio path and gender are fixed placeholders.
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    workbook = os.path.join(base_dir, lang, "text", "TE-AT-train.xlsx")
    sheet = pd.read_excel(workbook)
    return pd.DataFrame([_as_record(entry) for _, entry in sheet.iterrows()])
# Load the Telugu training sheet; the arguments spell out the function defaults.
dataset_dft = load_dataset(base_dir='/content', lang='tel')

Using device: cpu


In [None]:
# Inspect the loaded Telugu training frame.
dataset_dft

Unnamed: 0,audio_path,transcript,class_label,gender
0,Nil,ఈ కాలంలో మీరు ఒకసారి ఒబ్సర్వ్ చేయండి మన స్టేటు...,R,Unknown
1,Nil,హిందూ థర్మాన్ని ఎవరేమన్నా కూడా వాడికొచ్చే ఒకేఒ...,R,Unknown
2,Nil,ఒక ముస్లింనిగాని. ఒక్క నిమిషం భార్గవి కల్యాణిగ...,R,Unknown
3,Nil,ఈ సెక్యులర్ ఇండియా డెమోక్రాటిక్ ఇండియా అని మాట...,R,Unknown
4,Nil,"హిందువులున్న ఈ భారతదేశంలో ,ఈ భారతదేశంలో , సనాత...",R,Unknown
...,...,...,...,...
551,Nil,బాగుండటం అంటే బాగా ఉండటం కాదత్తా నలుగురితో ఉండ...,N,Unknown
552,Nil,దేవుడు డెఫినిషన్ అర్ధమయిపోయింది బయ్యా ఆడు అక్క...,N,Unknown
553,Nil,మనిషిని ప్రేమిస్తే అబధం విలువ తెలుస్తుంది కరెక...,N,Unknown
554,Nil,ఇంకో విచిత్రం ఏంటంటే మన లైఫ్ ని ఇంటరెస్టింగ్ గ...,N,Unknown


In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Telugu stopwords shipped with advertools, held as a sorted list.
stopwords = sorted(adv.stopwords['telugu'])


def preprocess_tel_text(text):
    """Normalize, tokenize and stopword-filter one Telugu string.

    Args:
        text: Raw Telugu transcript.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("te").normalize(text)
    kept = (
        word
        for word in indic_tokenize.trivial_tokenize(normalized, lang="te")
        # `stopwords` is read from the module namespace at call time.
        if word not in stopwords
    )
    return ' '.join(kept)
# Fit the label encoder on the short class labels and store the integer ids.
label_encoder = LabelEncoder()
dataset_dft['encoded_label'] = label_encoder.fit_transform(dataset_dft['class_label'])

# Clean every transcript with the Telugu preprocessing function.
dataset_dft['cleaned_transcript'] = dataset_dft['transcript'].map(preprocess_tel_text)

# Persist the fitted encoder so predictions can be mapped back to label names.
label_encoder_path = "tel_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, file=f)
print(f"Label encoder saved to {label_encoder_path}")

Label encoder saved to tel_label_encoder.pkl


In [None]:
# Hold out 20% of the Telugu rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dft['cleaned_transcript'],
    dataset_dft['encoded_label'],
    random_state=42,
    test_size=0.2,
)
def extract_embeddings(model_name, texts):
    """Embed each text as the mean of the encoder's last-layer token vectors.

    Args:
        model_name: Hugging Face model id handed to ``from_pretrained``.
        texts: List of strings to embed.

    Returns:
        numpy array built from one pooled vector per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name).to(device)
    encoder.eval()
    chunk = 16  # texts per forward pass
    pooled_rows = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk):
            tokens = tokenizer(
                texts[start:start + chunk],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt",
            )
            tokens = {name: values.to(device) for name, values in tokens.items()}
            hidden = encoder(**tokens).last_hidden_state
            # NOTE(review): plain mean over the token axis; no attention mask is
            # applied, so padded positions are part of the average.
            pooled_rows.extend(hidden.mean(dim=1).cpu().numpy())
    return np.array(pooled_rows)


# Embed the training split first, then the held-out split.
X_train_embeddings, X_test_embeddings = (
    extract_embeddings("bert-base-multilingual-cased", split.tolist())
    for split in (X_train, X_test)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode with an explicit class count so both label matrices always
# have one column per known class, even if the held-out split misses a class.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Small dense classifier on top of the precomputed BERT embeddings.
model = Sequential([
    Dense(256, input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Stop once val_loss has not improved for 3 epochs and roll back to the best
# weights. The callback only takes effect when it is handed to fit().
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# NOTE(review): the held-out split is also the validation data, so early
# stopping selects the model on the same rows that are reported as "test".
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=30, batch_size=32,
    callbacks=[early_stopping]
)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Pin `labels` to every class id so `target_names` always has a matching length,
# even when a class appears in neither y_test nor the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.2061 - loss: 2.7811 - val_accuracy: 0.3929 - val_loss: 1.4347
Epoch 2/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4898 - loss: 1.5589 - val_accuracy: 0.4464 - val_loss: 1.3518
Epoch 3/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4829 - loss: 1.4684 - val_accuracy: 0.4018 - val_loss: 1.3560
Epoch 4/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5293 - loss: 1.2298 - val_accuracy: 0.4375 - val_loss: 1.3111
Epoch 5/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5928 - loss: 1.1279 - val_accuracy: 0.4286 - val_loss: 1.2913
Epoch 6/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.6197 - loss: 0.9703 - val_accuracy: 0.4286 - val_loss: 1.2620
Epoch 7/30
[1m14/14[0m [32m━━━━

In [None]:
from tensorflow.keras.models import load_model
# Persist the trained classifier to an .h5 file.
model.save("tel_classification_model.h5")
print("Model saved as 'tel_classification_model.h5'")


def test_model(input_text, model_path="tel_classification_model.h5"):
    """Classify one Telugu sentence with the saved classifier.

    The model is reloaded from ``model_path`` on every call; the text is cleaned
    with ``preprocess_tel_text`` and embedded with ``extract_embeddings``.

    Args:
        input_text: Raw Telugu sentence.
        model_path: Path of the saved Keras model.

    Returns:
        The predicted class label, decoded with the global ``label_encoder``.
    """
    classifier = load_model(model_path)
    print("Model loaded successfully.")
    features = extract_embeddings(
        "bert-base-multilingual-cased",
        [preprocess_tel_text(input_text)],
    )
    scores = classifier.predict(features)
    best_index = np.argmax(scores)
    predicted_label = label_encoder.inverse_transform([best_index])[0]
    print(f"Input Text: {input_text}")
    print(f"Predicted Label: {predicted_label}")
    return predicted_label


# Smoke-test the saved model on one hand-written sentence.
new_text = "మతంపై ప్రతికూల వ్యాఖ్యలు చేసిన వ్యక్తులు చట్టం ముందు తప్పించుకోలేరు."
predicted_label = test_model(new_text)




Model saved as 'tel_classification_model.h5'
Model loaded successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Input Text: మతంపై ప్రతికూల వ్యాఖ్యలు చేసిన వ్యక్తులు చట్టం ముందు తప్పించుకోలేరు.
Predicted Label: N


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Pick the GPU when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)
stopwords = list(sorted(adv.stopwords['telugu']))
def preprocess_Tel_text(text):
    """Preprocess Telugu text with the same steps used on the training data.

    Steps: Indic NLP Telugu normalization, trivial Indic tokenization, then
    removal of tokens found in the module-level ``stopwords`` list. The
    normalization step matters: the classifier was trained on normalized
    transcripts, so skipping it here would feed it differently cleaned text.

    Args:
        text: Raw Telugu transcript.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalizer = IndicNormalizerFactory().get_normalizer("te")
    text = normalizer.normalize(text)
    tokens = indic_tokenize.trivial_tokenize(text, lang="te")
    return ' '.join(token for token in tokens if token not in stopwords)
def extract_embeddings(model_name, texts):
    """Embed each text as the mean of the encoder's last-layer token vectors.

    The encoder and every input batch are moved to the module-level ``device``
    so that a GPU is used when one was detected, and results are copied back to
    the CPU before conversion to numpy.

    Args:
        model_name: Hugging Face model id handed to ``from_pretrained``.
        texts: List of strings to embed.

    Returns:
        numpy array built from one pooled vector per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    batch_size = 16
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            # Inputs must live on the same device as the model weights.
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            # .cpu() is required before .numpy() when the tensors are on a GPU.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Locations of the test sheet, the trained artifacts and the output workbook.
test_file_path = '/content/TE-AT-test.xlsx'
trained_model_path = "/content/tel_classification_model.h5"
label_encoder_path = "/content/tel_label_encoder.pkl"
output_file_path = './tel-AT-test-predictions.xlsx'
bert_model_name = "bert-base-multilingual-cased"

# Restore the classifier and the label encoder.
# NOTE(review): earlier cells save both artifacts under relative names, so these
# absolute paths presumably rely on the working directory being /content.
trained_model = load_model(trained_model_path)
with open(label_encoder_path, "rb") as f:
    # Unpickling can execute arbitrary code; only load files you produced yourself.
    label_encoder = pickle.load(f)

# Clean and embed the test transcripts.
test_data = pd.read_excel(test_file_path)
test_data['cleaned_transcript'] = test_data['Transcript'].map(preprocess_Tel_text)
test_embeddings = extract_embeddings(bert_model_name, test_data['cleaned_transcript'].tolist())

# Predict, decode the class ids to label names and write the result.
predictions = trained_model.predict(test_embeddings)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))
test_data['Predicted_Class'] = predicted_labels
test_data.to_excel(output_file_path, index=False)
print(f"Predictions saved to: {output_file_path}")

Device: cpu




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
Predictions saved to: ./tel-AT-test-predictions.xlsx


MALAYALAM using *bert-base-multilingual-cased*

In [None]:
# Run BERT on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
def load_dataset(base_dir='/content', lang='mal'):
    """Read the Malayalam text training sheet into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/ML-AT-train.xlsx`` and
    must contain the columns ``Transcript`` and ``Class Label Short``.

    Args:
        base_dir: Directory that holds the per-language folders.
        lang: Name of the language sub-folder.

    Returns:
        pandas.DataFrame with one row per sheet row and the columns
        ``audio_path`` (always "Nil"), ``transcript``, ``class_label`` and
        ``gender`` (always "Unknown").
    """
    def _as_record(entry):
        # Text-only data: audio path and gender are fixed placeholders.
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    workbook = os.path.join(base_dir, lang, "text", "ML-AT-train.xlsx")
    sheet = pd.read_excel(workbook)
    return pd.DataFrame([_as_record(entry) for _, entry in sheet.iterrows()])
# Load the Malayalam training sheet; the arguments spell out the function defaults.
dataset_dfm = load_dataset(base_dir='/content', lang='mal')

Using device: cpu


In [None]:
# Inspect the loaded Malayalam training frame.
dataset_dfm

Unnamed: 0,audio_path,transcript,class_label,gender
0,Nil,നമസ്കാരം ഒരു ഒരു പരമ ചെറ്റയുടെ കാര്യമാണ് ഞാൻ പ...,C,Unknown
1,Nil,ആദ്യം തന്നെ അവൻറെ ഐഡിയുടെ പേര് വരെ ഞാൻ ഇതിനകത്...,C,Unknown
2,Nil,അവൻറെ ആ ചെറ്റയുടെ ആ പരമനാറിയുടെ പേര്,C,Unknown
3,Nil,അവന്റെ ദുഷിച്ച മനസ്സ് കൊണ്ടുവന്ന് എൻറെ വീഡിയോയ...,C,Unknown
4,Nil,നിൻറെ ദുഷിപ്പ് എല്ലാം എന്തിനാ എന്റെ നേർക്ക് തീ...,C,Unknown
...,...,...,...,...
878,Nil,ഒന്നിനെയും കൂടുതൽ ആശ്രയിക്കാതെ ഇരിക്കുക ഒരിക്ക...,N,Unknown
879,Nil,പിന്തുടർച്ച ഇല്ലാത്ത പ്രവർത്തികൾക്ക് ഒന്നും യാ...,N,Unknown
880,Nil,ഏറ്റവും അടുപ്പമുള്ളവരാണ് അതിസമർഥമായി ചതിക്കുന്...,N,Unknown
881,Nil,സാഹചര്യം ഏതായാലും കീടങ്ങള് വേണ്ടയോ എന്ന് തീരുമ...,N,Unknown


In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Hand-written Malayalam stopword list; preprocess_malayalam_text drops any
# token that exactly matches one of these entries.
stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]
def preprocess_malayalam_text(text):
    """Normalize, tokenize and stopword-filter one Malayalam string.

    Args:
        text: Raw Malayalam transcript.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("ml").normalize(text)
    kept = (
        word
        for word in indic_tokenize.trivial_tokenize(normalized, lang="ml")
        # `stop` is the hand-written Malayalam list defined in this cell.
        if word not in stop
    )
    return ' '.join(kept)
# Clean the Malayalam transcripts with the Malayalam pipeline defined in this
# cell ("ml" normalizer plus the `stop` list), not with the Tamil function.
dataset_dfm['cleaned_transcript'] = dataset_dfm['transcript'].apply(preprocess_malayalam_text)
# Map the short class labels to integer ids.
label_encoder = LabelEncoder()
dataset_dfm['encoded_label'] = label_encoder.fit_transform(dataset_dfm['class_label'])
# Persist the fitted encoder so predictions can be mapped back to label names.
label_encoder_path = "mal_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

Label encoder saved to mal_label_encoder.pkl


In [None]:
# Hold out 20% of the Malayalam rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dfm['cleaned_transcript'],
    dataset_dfm['encoded_label'],
    random_state=42,
    test_size=0.2,
)
def extract_embeddings(model_name, texts):
    """Embed each text as the mean of the encoder's last-layer token vectors.

    Args:
        model_name: Hugging Face model id handed to ``from_pretrained``.
        texts: List of strings to embed.

    Returns:
        numpy array built from one pooled vector per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name).to(device)
    encoder.eval()
    chunk = 16  # texts per forward pass
    pooled_rows = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk):
            tokens = tokenizer(
                texts[start:start + chunk],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt",
            )
            tokens = {name: values.to(device) for name, values in tokens.items()}
            hidden = encoder(**tokens).last_hidden_state
            # NOTE(review): plain mean over the token axis; no attention mask is
            # applied, so padded positions are part of the average.
            pooled_rows.extend(hidden.mean(dim=1).cpu().numpy())
    return np.array(pooled_rows)


# Embed the training split first, then the held-out split.
X_train_embeddings, X_test_embeddings = (
    extract_embeddings("bert-base-multilingual-cased", split.tolist())
    for split in (X_train, X_test)
)

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode with an explicit class count so both label matrices always
# have one column per known class, even if the held-out split misses a class.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Small dense classifier on top of the precomputed BERT embeddings.
model = Sequential([
    Dense(256, input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the held-out split is also passed as validation data, so the
# "test" numbers below come from data that was monitored during training.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=35, batch_size=32)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Pin `labels` to every class id so `target_names` always has a matching length,
# even when a class appears in neither y_test nor the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.3351 - loss: 1.9655 - val_accuracy: 0.5537 - val_loss: 1.1763
Epoch 2/35
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.5937 - loss: 1.2021 - val_accuracy: 0.6158 - val_loss: 1.0844
Epoch 3/35
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.6901 - loss: 0.9284 - val_accuracy: 0.6102 - val_loss: 1.0266
Epoch 4/35
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.6917 - loss: 0.8747 - val_accuracy: 0.6441 - val_loss: 0.9980
Epoch 5/35
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6844 - loss: 0.8500 - val_accuracy: 0.6667 - val_loss: 0.9372
Epoch 6/35
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7612 - loss: 0.6474 - val_accuracy: 0.6836 - val_loss: 0.9334
Epoch 7/35
[1m23/23[0m [32m━━━━

In [None]:
from tensorflow.keras.models import load_model
# Persist the trained classifier to an .h5 file.
model.save("mal_classification_model.h5")
print("Model saved as 'mal_classification_model.h5'")


def test_model(input_text, model_path="mal_classification_model.h5"):
    """Classify one Malayalam sentence with the saved classifier.

    The model is reloaded from ``model_path`` on every call; the text is cleaned
    with ``preprocess_malayalam_text`` and embedded with ``extract_embeddings``.

    Args:
        input_text: Raw Malayalam sentence.
        model_path: Path of the saved Keras model.

    Returns:
        The predicted class label, decoded with the global ``label_encoder``.
    """
    classifier = load_model(model_path)
    print("Model loaded successfully.")
    features = extract_embeddings(
        "bert-base-multilingual-cased",
        [preprocess_malayalam_text(input_text)],
    )
    scores = classifier.predict(features)
    best_index = np.argmax(scores)
    predicted_label = label_encoder.inverse_transform([best_index])[0]
    print(f"Input Text: {input_text}")
    print(f"Predicted Label: {predicted_label}")
    return predicted_label


# Smoke-test the saved model on one hand-written sentence.
new_text = "മതങ്ങൾ ഉൾപ്പെടെ എല്ലാവരും സമാധാനത്തിനായി പ്രവർത്തിക്കണം."
predicted_label = test_model(new_text)



Model saved as 'mal_classification_model.h5'
Model loaded successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
Input Text: മതങ്ങൾ ഉൾപ്പെടെ എല്ലാവരും സമാധാനത്തിനായി പ്രവർത്തിക്കണം.
Predicted Label: N


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Pick the GPU when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)
def preprocess_Mal_text(text):
    """Preprocess Malayalam text with the Malayalam pipeline.

    Steps: Indic NLP Malayalam ("ml") normalization, trivial Indic
    tokenization, then removal of tokens found in the hand-written Malayalam
    ``stop`` list defined in the Malayalam preprocessing cell. Tamil stopwords
    and the "ta" language code do not apply to this data and are not used.

    Args:
        text: Raw Malayalam transcript.

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalizer = IndicNormalizerFactory().get_normalizer("ml")
    text = normalizer.normalize(text)
    tokens = indic_tokenize.trivial_tokenize(text, lang="ml")
    return ' '.join(token for token in tokens if token not in stop)
def extract_embeddings(model_name, texts):
    """Embed each text as the mean of the encoder's last-layer token vectors.

    The encoder and every input batch are moved to the module-level ``device``
    so that a GPU is used when one was detected, and results are copied back to
    the CPU before conversion to numpy.

    Args:
        model_name: Hugging Face model id handed to ``from_pretrained``.
        texts: List of strings to embed.

    Returns:
        numpy array built from one pooled vector per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    batch_size = 16
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            # Inputs must live on the same device as the model weights.
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            # .cpu() is required before .numpy() when the tensors are on a GPU.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Locations of the test sheet, the trained artifacts and the output workbook.
test_file_path = '/content/ML-AT-test.xlsx'
trained_model_path = "/content/mal_classification_model.h5"
label_encoder_path = "/content/mal_label_encoder.pkl"
output_file_path = './ml-AT-test-predictions.xlsx'
bert_model_name = "bert-base-multilingual-cased"

# Restore the classifier and the label encoder.
# NOTE(review): earlier cells save both artifacts under relative names, so these
# absolute paths presumably rely on the working directory being /content.
trained_model = load_model(trained_model_path)
with open(label_encoder_path, "rb") as f:
    # Unpickling can execute arbitrary code; only load files you produced yourself.
    label_encoder = pickle.load(f)

# Clean and embed the test transcripts.
test_data = pd.read_excel(test_file_path)
test_data['cleaned_transcript'] = test_data['Transcript'].map(preprocess_Mal_text)
test_embeddings = extract_embeddings(bert_model_name, test_data['cleaned_transcript'].tolist())

# Predict, decode the class ids to label names and write the result.
predictions = trained_model.predict(test_embeddings)
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))
test_data['Predicted_Class'] = predicted_labels
test_data.to_excel(output_file_path, index=False)
print(f"Predictions saved to: {output_file_path}")


Device: cpu




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Predictions saved to: ./ml-AT-test-predictions.xlsx


TAMIL using *TfidfVectorizer* features


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
def load_dataset(base_dir='/content', lang='tamil'):
    """Read the Tamil text training sheet into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TA-AT-train.xlsx`` and
    must contain the columns ``Transcript`` and ``Class Label Short``.

    Args:
        base_dir: Directory that holds the per-language folders.
        lang: Name of the language sub-folder.

    Returns:
        pandas.DataFrame with one row per sheet row and the columns
        ``audio_path`` (always "Nil"), ``transcript``, ``class_label`` and
        ``gender`` (always "Unknown").
    """
    def _as_record(entry):
        # Text-only data: audio path and gender are fixed placeholders.
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    workbook = os.path.join(base_dir, lang, "text", "TA-AT-train.xlsx")
    sheet = pd.read_excel(workbook)
    return pd.DataFrame([_as_record(entry) for _, entry in sheet.iterrows()])
# Load the Tamil training sheet; the arguments spell out the function defaults.
dataset_df = load_dataset(base_dir='/content', lang='tamil')
stopwords = list(sorted(adv.stopwords['tamil']))
def preprocess_tamil_text(text):
    """Normalize, tokenize and stop-word-filter one Tamil transcript.

    The text goes through the Indic NLP "ta" normalizer, is split with
    ``indic_tokenize.trivial_tokenize``, and every token found in the
    module-level ``stopwords`` collection is dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("ta").normalize(text)
    kept = []
    for word in indic_tokenize.trivial_tokenize(normalized, lang="ta"):
        if word not in stopwords:
            kept.append(word)
    return ' '.join(kept)
# Clean every transcript (normalize, tokenize, drop stop words) before vectorizing.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)

# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])
label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)

# The transcripts are already tokenized and space-joined, so split on whitespace.
# The default token_pattern (\w\w+) does not match Indic vowel signs or virama
# and would cut Tamil words into fragments.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\S+")
# Fit the vocabulary on the training split only, then reuse it for the hold-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()
print(f"TF-IDF Vectorized Train Data Shape: {X_train_tfidf.shape}")
print(f"TF-IDF Vectorized Test Data Shape: {X_test_tfidf.shape}")

# Two-hidden-layer MLP over the TF-IDF features.
model = Sequential([
    Dense(256, input_dim=X_train_tfidf.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_tfidf, y_train_cat,
    validation_data=(X_test_tfidf, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_tfidf, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_tfidf)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Label encoder saved to tamil_label_encoder.pkl
TF-IDF Vectorized Train Data Shape: (411, 670)
TF-IDF Vectorized Test Data Shape: (103, 670)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.1994 - loss: 2.6964 - val_accuracy: 0.5631 - val_loss: 1.5463
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3313 - loss: 1.9941 - val_accuracy: 0.5631 - val_loss: 1.5084
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4344 - loss: 1.6703 - val_accuracy: 0.5631 - val_loss: 1.4702
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4945 - loss: 1.3898 - val_accuracy: 0.5631 - val_loss: 1.4325
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5644 - loss: 1.1251 - val_accuracy: 0.5631 - val_loss: 1.3975
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6137 - loss: 1.0332 - val_accuracy: 0.5631 - val_loss: 1.3597
Epoch 7/35
[1m13/13[0m [32m━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


MALAYALAM Adding TFIDFVectorizer


In [None]:
def load_dataset(base_dir='/content', lang='mal'):
    """Read the Malayalam training workbook into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/ML-AT-train.xlsx``.
    Every spreadsheet row becomes one record holding its "Transcript" and
    "Class Label Short" values, with ``audio_path`` fixed to "Nil" and
    ``gender`` fixed to "Unknown".

    Returns:
        pandas.DataFrame with columns audio_path, transcript, class_label, gender.
    """
    workbook = os.path.join(base_dir, lang, "text", "ML-AT-train.xlsx")
    records = []
    for _, entry in pd.read_excel(workbook).iterrows():
        label, gender = entry["Class Label Short"], "Unknown"
        records.append(dict(
            audio_path="Nil",
            transcript=entry["Transcript"],
            class_label=label,
            gender=gender,
        ))
    return pd.DataFrame(records)
# Load the Malayalam training transcripts into their own frame (a different
# name from the Tamil `dataset_df`).
dataset_dfm = load_dataset()

In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Hand-written Malayalam stop-word list consulted by preprocess_malayalam_text.
# NOTE(review): every entry starts with അ, ആ or ഇ, so the list looks truncated —
# confirm it is complete.
stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]
def preprocess_malayalam_text(text):
    """Normalize, tokenize and stop-word-filter one Malayalam transcript.

    Uses the Indic NLP "ml" normalizer and ``indic_tokenize.trivial_tokenize``,
    then discards every token listed in the module-level ``stop`` list.

    Returns:
        The remaining tokens joined by single spaces.
    """
    ml_normalizer = IndicNormalizerFactory().get_normalizer("ml")
    words = list(indic_tokenize.trivial_tokenize(ml_normalizer.normalize(text), lang="ml"))
    return ' '.join(word for word in words if word not in stop)
# Clean the Malayalam transcripts with the Malayalam pipeline ("ml" normalizer
# plus the `stop` list). The Tamil helper would apply the "ta" normalizer and
# whatever the module-level `stopwords` currently holds.
dataset_dfm['cleaned_transcript'] = dataset_dfm['transcript'].apply(preprocess_malayalam_text)
# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_dfm['encoded_label'] = label_encoder.fit_transform(dataset_dfm['class_label'])
label_encoder_path = "mal_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")


Label encoder saved to mal_label_encoder.pkl


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The transcripts are already tokenized and space-joined, so split on whitespace.
# The default token_pattern (\w\w+) does not match Indic vowel signs or virama
# and would cut Malayalam words into fragments.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\S+")
# Split the Malayalam frame (dataset_dfm). Splitting the Tamil `dataset_df` here
# would train on Tamil rows while reporting Malayalam class names.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dfm['cleaned_transcript'], dataset_dfm['encoded_label'], test_size=0.2, random_state=42
)
# Fit the vocabulary on the training split only, then reuse it for the hold-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()
print(f"TF-IDF Vectorized Train Data Shape: {X_train_tfidf.shape}")
print(f"TF-IDF Vectorized Test Data Shape: {X_test_tfidf.shape}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)
# Two-hidden-layer MLP over the TF-IDF features.
model = Sequential([
    Dense(256, input_dim=X_train_tfidf.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_tfidf, y_train_cat,
    validation_data=(X_test_tfidf, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_tfidf, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_tfidf)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

TF-IDF Vectorized Train Data Shape: (411, 670)
TF-IDF Vectorized Test Data Shape: (103, 670)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.2270 - loss: 2.7338 - val_accuracy: 0.5631 - val_loss: 1.5024
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3527 - loss: 2.2091 - val_accuracy: 0.5631 - val_loss: 1.4106
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4269 - loss: 1.6237 - val_accuracy: 0.5631 - val_loss: 1.3440
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5524 - loss: 1.1859 - val_accuracy: 0.5631 - val_loss: 1.2914
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6036 - loss: 1.1035 - val_accuracy: 0.5631 - val_loss: 1.2607
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6832 - loss: 0.8210 - val_accuracy: 0.5631 - val_loss: 1.2452
Epoch 7/35
[1m13/13[0m [32m━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



TELUGU Adding TFIDFVectorizer


In [None]:
def load_dataset(base_dir='/content', lang='tel'):
    """Read the Telugu training workbook into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TE-AT-train.xlsx``.
    Each spreadsheet row yields one record with its "Transcript" and
    "Class Label Short" values; ``audio_path`` is always "Nil" and
    ``gender`` is always "Unknown".

    Returns:
        pandas.DataFrame with columns audio_path, transcript, class_label, gender.
    """
    def as_record(entry):
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    sheet = pd.read_excel(os.path.join(base_dir, lang, "text", "TE-AT-train.xlsx"))
    return pd.DataFrame([as_record(entry) for _, entry in sheet.iterrows()])
# Load the Telugu training transcripts into their own frame (a different name
# from the Tamil `dataset_df`).
dataset_dft = load_dataset()

In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Telugu stop words shipped with advertools, sorted and materialized as a list.
# This rebinds the module-level `stopwords` name that the Tamil cells also assign.
stopwords = list(sorted(adv.stopwords['telugu']))
def preprocess_tel_text(text):
    """Normalize, tokenize and stop-word-filter one Telugu transcript.

    The text goes through the Indic NLP "te" normalizer, is split with
    ``indic_tokenize.trivial_tokenize``, and every token found in the
    module-level ``stopwords`` collection is dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("te").normalize(text)
    kept = []
    for word in indic_tokenize.trivial_tokenize(normalized, lang="te"):
        if word not in stopwords:
            kept.append(word)
    return ' '.join(kept)
# Clean every Telugu transcript (normalize, tokenize, drop stop words).
dataset_dft['cleaned_transcript'] = dataset_dft['transcript'].apply(preprocess_tel_text)
# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_dft['encoded_label'] = label_encoder.fit_transform(dataset_dft['class_label'])
label_encoder_path = "tel_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")
# Split the Telugu frame (dataset_dft). Splitting the Tamil `dataset_df` here
# would train on Tamil rows while reporting Telugu class names.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dft['cleaned_transcript'], dataset_dft['encoded_label'], test_size=0.2, random_state=42
)

Label encoder saved to tel_label_encoder.pkl


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The transcripts are already tokenized and space-joined, so split on whitespace.
# The default token_pattern (\w\w+) does not match Indic vowel signs or virama
# and would cut Telugu words into fragments.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, token_pattern=r"(?u)\S+")
# Fit the vocabulary on the training split only, then reuse it for the hold-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()
print(f"TF-IDF Vectorized Train Data Shape: {X_train_tfidf.shape}")
print(f"TF-IDF Vectorized Test Data Shape: {X_test_tfidf.shape}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)
# Two-hidden-layer MLP over the TF-IDF features.
model = Sequential([
    Dense(256, input_dim=X_train_tfidf.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_tfidf, y_train_cat,
    validation_data=(X_test_tfidf, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_tfidf, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_tfidf)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

TF-IDF Vectorized Train Data Shape: (411, 670)
TF-IDF Vectorized Test Data Shape: (103, 670)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.2206 - loss: 2.6080 - val_accuracy: 0.5534 - val_loss: 1.5713
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.3192 - loss: 2.0895 - val_accuracy: 0.5631 - val_loss: 1.5296
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.3814 - loss: 1.7429 - val_accuracy: 0.5631 - val_loss: 1.4944
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5303 - loss: 1.3517 - val_accuracy: 0.5631 - val_loss: 1.4597
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5376 - loss: 1.1478 - val_accuracy: 0.5631 - val_loss: 1.4268
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.6762 - loss: 0.9587 - val_accuracy: 0.5631 - val_loss: 1.3898
Epoch 7/35
[1m13/13[0m [32m━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TAMIL Adding COUNT Vectorizer

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
# Select the GPU when torch can see one, otherwise fall back to the CPU.
# NOTE(review): nothing else in this cell reads `device`; it looks like a
# leftover from the transformer cells — confirm before removing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
def load_dataset(base_dir='/content', lang='tamil'):
    """Read the Tamil training workbook into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TA-AT-train.xlsx``.
    Every spreadsheet row becomes one record holding its "Transcript" and
    "Class Label Short" values, with ``audio_path`` fixed to "Nil" and
    ``gender`` fixed to "Unknown".

    Returns:
        pandas.DataFrame with columns audio_path, transcript, class_label, gender.
    """
    workbook = os.path.join(base_dir, lang, "text", "TA-AT-train.xlsx")
    records = []
    for _, entry in pd.read_excel(workbook).iterrows():
        label, gender = entry["Class Label Short"], "Unknown"
        records.append(dict(
            audio_path="Nil",
            transcript=entry["Transcript"],
            class_label=label,
            gender=gender,
        ))
    return pd.DataFrame(records)
# Load the Tamil training transcripts using the loader's default arguments.
dataset_df = load_dataset()
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Tamil stop words shipped with advertools, sorted and materialized as a list;
# preprocess_tamil_text reads this module-level name at call time.
stopwords = list(sorted(adv.stopwords['tamil']))
def preprocess_tamil_text(text):
    """Normalize, tokenize and stop-word-filter one Tamil transcript.

    Uses the Indic NLP "ta" normalizer and ``indic_tokenize.trivial_tokenize``,
    then discards every token listed in the module-level ``stopwords``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    ta_normalizer = IndicNormalizerFactory().get_normalizer("ta")
    words = list(indic_tokenize.trivial_tokenize(ta_normalizer.normalize(text), lang="ta"))
    return ' '.join(word for word in words if word not in stopwords)
# Clean every transcript (normalize, tokenize, drop stop words) before vectorizing.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)

# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])
label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)

# The transcripts are already tokenized and space-joined, so split on whitespace.
# The default token_pattern (\w\w+) does not match Indic vowel signs or virama
# and would cut Tamil words into fragments.
count_vectorizer = CountVectorizer(max_features=5000, token_pattern=r"(?u)\S+")
# Fit the vocabulary on the training split only, then reuse it for the hold-out split.
X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()
print(f"Count Vectorized Train Data Shape: {X_train_count.shape}")
print(f"Count Vectorized Test Data Shape: {X_test_count.shape}")

# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Two-hidden-layer MLP over the raw token counts.
model = Sequential([
    Dense(256, input_dim=X_train_count.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_count, y_train_cat,
    validation_data=(X_test_count, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_count, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_count)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Using device: cpu
Label encoder saved to tamil_label_encoder.pkl
Count Vectorized Train Data Shape: (411, 670)
Count Vectorized Test Data Shape: (103, 670)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.2101 - loss: 2.4980 - val_accuracy: 0.4272 - val_loss: 1.5137
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3183 - loss: 2.0600 - val_accuracy: 0.5534 - val_loss: 1.4021
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3448 - loss: 1.8411 - val_accuracy: 0.5631 - val_loss: 1.3414
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4600 - loss: 1.5176 - val_accuracy: 0.5922 - val_loss: 1.2931
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5260 - loss: 1.2842 - val_accuracy: 0.5922 - val_loss: 1.2459
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5553 - loss: 1.2173 - val_accuracy: 0.5922 - val_loss: 1.2114
Epoch 7/35
[1m13/13[0m [32m━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


MALAYALAM Adding COUNT Vectorizer

In [None]:
def load_dataset(base_dir='/content', lang='mal'):
    """Read the Malayalam training workbook into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/ML-AT-train.xlsx``.
    Each spreadsheet row yields one record with its "Transcript" and
    "Class Label Short" values; ``audio_path`` is always "Nil" and
    ``gender`` is always "Unknown".

    Returns:
        pandas.DataFrame with columns audio_path, transcript, class_label, gender.
    """
    def as_record(entry):
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    sheet = pd.read_excel(os.path.join(base_dir, lang, "text", "ML-AT-train.xlsx"))
    return pd.DataFrame([as_record(entry) for _, entry in sheet.iterrows()])
# Load the Malayalam training transcripts into their own frame (a different
# name from the Tamil `dataset_df`).
dataset_dfm = load_dataset()

In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Hand-written Malayalam stop-word list consulted by preprocess_malayalam_text.
# NOTE(review): every entry starts with അ, ആ or ഇ, so the list looks truncated —
# confirm it is complete.
stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]
def preprocess_malayalam_text(text):
    """Normalize, tokenize and stop-word-filter one Malayalam transcript.

    The text goes through the Indic NLP "ml" normalizer, is split with
    ``indic_tokenize.trivial_tokenize``, and every token found in the
    module-level ``stop`` list is dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("ml").normalize(text)
    kept = []
    for word in indic_tokenize.trivial_tokenize(normalized, lang="ml"):
        if word not in stop:
            kept.append(word)
    return ' '.join(kept)
# Clean the Malayalam transcripts with the Malayalam pipeline ("ml" normalizer
# plus the `stop` list). The Tamil helper would apply the "ta" normalizer and
# whatever the module-level `stopwords` currently holds.
dataset_dfm['cleaned_transcript'] = dataset_dfm['transcript'].apply(preprocess_malayalam_text)
# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_dfm['encoded_label'] = label_encoder.fit_transform(dataset_dfm['class_label'])
label_encoder_path = "mal_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

Label encoder saved to mal_label_encoder.pkl


In [None]:
# 80/20 hold-out split of the Malayalam frame (dataset_dfm). Splitting the Tamil
# `dataset_df` here would train on Tamil rows while reporting Malayalam class names.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dfm['cleaned_transcript'], dataset_dfm['encoded_label'], test_size=0.2, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# The transcripts are already tokenized and space-joined, so split on whitespace.
# The default token_pattern (\w\w+) does not match Indic vowel signs or virama
# and would cut Malayalam words into fragments.
count_vectorizer = CountVectorizer(max_features=5000, token_pattern=r"(?u)\S+")
# Fit the vocabulary on the training split only, then reuse it for the hold-out split.
X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()
print(f"Count Vectorized Train Data Shape: {X_train_count.shape}")
print(f"Count Vectorized Test Data Shape: {X_test_count.shape}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)
# Two-hidden-layer MLP over the raw token counts.
model = Sequential([
    Dense(256, input_dim=X_train_count.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_count, y_train_cat,
    validation_data=(X_test_count, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_count, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_count)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Count Vectorized Train Data Shape: (411, 670)
Count Vectorized Test Data Shape: (103, 670)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.2197 - loss: 2.6895 - val_accuracy: 0.3786 - val_loss: 1.5389
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2872 - loss: 2.2527 - val_accuracy: 0.4660 - val_loss: 1.4912
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3833 - loss: 1.8541 - val_accuracy: 0.4466 - val_loss: 1.4466
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4337 - loss: 1.5306 - val_accuracy: 0.4466 - val_loss: 1.4305
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5087 - loss: 1.3355 - val_accuracy: 0.4369 - val_loss: 1.4039
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5647 - loss: 1.1715 - val_accuracy: 0.4563 - val_loss: 1.3759
Epoch 7/35
[1m13/13[0m [32m━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TELUGU Adding COUNT Vectorizer

In [None]:
def load_dataset(base_dir='/content', lang='tel'):
    """Read the Telugu training workbook into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TE-AT-train.xlsx``.
    Every spreadsheet row becomes one record holding its "Transcript" and
    "Class Label Short" values, with ``audio_path`` fixed to "Nil" and
    ``gender`` fixed to "Unknown".

    Returns:
        pandas.DataFrame with columns audio_path, transcript, class_label, gender.
    """
    workbook = os.path.join(base_dir, lang, "text", "TE-AT-train.xlsx")
    records = []
    for _, entry in pd.read_excel(workbook).iterrows():
        label, gender = entry["Class Label Short"], "Unknown"
        records.append(dict(
            audio_path="Nil",
            transcript=entry["Transcript"],
            class_label=label,
            gender=gender,
        ))
    return pd.DataFrame(records)
# Load the Telugu training transcripts into their own frame (a different name
# from the Tamil `dataset_df`).
dataset_dft = load_dataset()

In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Telugu stop words shipped with advertools, sorted and materialized as a list.
# This rebinds the module-level `stopwords` name that the Tamil cells also assign.
stopwords = list(sorted(adv.stopwords['telugu']))
def preprocess_tel_text(text):
    """Normalize, tokenize and stop-word-filter one Telugu transcript.

    Uses the Indic NLP "te" normalizer and ``indic_tokenize.trivial_tokenize``,
    then discards every token listed in the module-level ``stopwords``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    te_normalizer = IndicNormalizerFactory().get_normalizer("te")
    words = list(indic_tokenize.trivial_tokenize(te_normalizer.normalize(text), lang="te"))
    return ' '.join(word for word in words if word not in stopwords)
# Clean every Telugu transcript (normalize, tokenize, drop stop words).
dataset_dft['cleaned_transcript'] = dataset_dft['transcript'].apply(preprocess_tel_text)
# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_dft['encoded_label'] = label_encoder.fit_transform(dataset_dft['class_label'])
label_encoder_path = "tel_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")
# Split the Telugu frame (dataset_dft). Splitting the Tamil `dataset_df` here
# would train on Tamil rows while reporting Telugu class names.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dft['cleaned_transcript'], dataset_dft['encoded_label'], test_size=0.2, random_state=42
)

Label encoder saved to tel_label_encoder.pkl


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# The transcripts are already tokenized and space-joined, so split on whitespace.
# The default token_pattern (\w\w+) does not match Indic vowel signs or virama
# and would cut Telugu words into fragments.
count_vectorizer = CountVectorizer(max_features=5000, token_pattern=r"(?u)\S+")
# Fit the vocabulary on the training split only, then reuse it for the hold-out split.
X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()
print(f"Count Vectorized Train Data Shape: {X_train_count.shape}")
print(f"Count Vectorized Test Data Shape: {X_test_count.shape}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)
# Two-hidden-layer MLP over the raw token counts.
model = Sequential([
    Dense(256, input_dim=X_train_count.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_count, y_train_cat,
    validation_data=(X_test_count, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_count, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_count)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Count Vectorized Train Data Shape: (411, 670)
Count Vectorized Test Data Shape: (103, 670)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 63ms/step - accuracy: 0.1520 - loss: 2.8512 - val_accuracy: 0.3010 - val_loss: 1.5810
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.2717 - loss: 2.1983 - val_accuracy: 0.3204 - val_loss: 1.5433
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.3469 - loss: 1.8605 - val_accuracy: 0.3495 - val_loss: 1.5197
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.4736 - loss: 1.3941 - val_accuracy: 0.3689 - val_loss: 1.4886
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5136 - loss: 1.3353 - val_accuracy: 0.4272 - val_loss: 1.4647
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.5806 - loss: 1.0836 - val_accuracy: 0.4660 - val_loss: 1.4257
Epoch 7/35
[1m13/13[0m [32m━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TAMIL *xlm-roberta-base*

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
# Select the GPU when torch can see one, otherwise fall back to the CPU;
# extract_embeddings_xlmr moves the model and its input tensors to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
def load_dataset(base_dir='/content', lang='tamil'):
    """Read the Tamil training workbook into a flat DataFrame.

    The workbook is read from ``<base_dir>/<lang>/text/TA-AT-train.xlsx``.
    Each spreadsheet row yields one record with its "Transcript" and
    "Class Label Short" values; ``audio_path`` is always "Nil" and
    ``gender`` is always "Unknown".

    Returns:
        pandas.DataFrame with columns audio_path, transcript, class_label, gender.
    """
    def as_record(entry):
        label = entry["Class Label Short"]
        return {
            "audio_path": "Nil",
            "transcript": entry["Transcript"],
            "class_label": label,
            "gender": "Unknown",
        }

    sheet = pd.read_excel(os.path.join(base_dir, lang, "text", "TA-AT-train.xlsx"))
    return pd.DataFrame([as_record(entry) for _, entry in sheet.iterrows()])
# Load the Tamil training transcripts using the loader's default arguments.
dataset_df = load_dataset()
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
# Tamil stop words shipped with advertools, sorted and materialized as a list;
# preprocess_tamil_text reads this module-level name at call time.
stopwords = list(sorted(adv.stopwords['tamil']))
def preprocess_tamil_text(text):
    """Normalize, tokenize and stop-word-filter one Tamil transcript.

    The text goes through the Indic NLP "ta" normalizer, is split with
    ``indic_tokenize.trivial_tokenize``, and every token found in the
    module-level ``stopwords`` collection is dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    normalized = IndicNormalizerFactory().get_normalizer("ta").normalize(text)
    kept = []
    for word in indic_tokenize.trivial_tokenize(normalized, lang="ta"):
        if word not in stopwords:
            kept.append(word)
    return ' '.join(kept)
# Clean every Tamil transcript (normalize, tokenize, drop stop words).
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)
# Map class-label values to integer ids and persist the fitted encoder so
# predictions can be decoded later.
label_encoder = LabelEncoder()
dataset_df['encoded_label'] = label_encoder.fit_transform(dataset_df['class_label'])
label_encoder_path = "tamil_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_df['cleaned_transcript'], dataset_df['encoded_label'], test_size=0.2, random_state=42
)
def extract_embeddings_xlmr(model_name, texts, batch_size=16, max_length=128):
    """Extract one mean-pooled embedding per text from a pre-trained encoder.

    Args:
        model_name: Checkpoint name passed to ``AutoTokenizer`` / ``AutoModel``.
        texts: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        numpy array with one row per text, built from the model's
        ``last_hidden_state`` averaged over the real (non-padding) tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(
                batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
            )
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # Average only over real tokens. A plain mean over the sequence axis
            # would include padding positions, so a text's embedding would depend
            # on how long the other texts in its batch are.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Embed both splits with the frozen xlm-roberta-base encoder.
X_train_embeddings = extract_embeddings_xlmr("xlm-roberta-base", X_train.tolist())
X_test_embeddings = extract_embeddings_xlmr("xlm-roberta-base", X_test.tolist())
print(f"XLM-RoBERTa Train Embeddings Shape: {X_train_embeddings.shape}")
print(f"XLM-RoBERTa Test Embeddings Shape: {X_test_embeddings.shape}")
# Single source of truth for the softmax width and the one-hot target width.
num_classes = len(label_encoder.classes_)
# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, which can differ from the softmax width.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# Two-hidden-layer MLP over the sentence embeddings.
model = Sequential([
    Dense(256, input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the hold-out split is also used as validation data here, so it is
# not an untouched test set — consider carving out a separate validation split.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing labels keeps target_names aligned even if a class is absent from the
# hold-out split; zero_division=0 reports 0 for never-predicted classes without warnings.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
    zero_division=0,
))

Using device: cpu
Label encoder saved to tamil_label_encoder.pkl


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

XLM-RoBERTa Train Embeddings Shape: (411, 768)
XLM-RoBERTa Test Embeddings Shape: (103, 768)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.2370 - loss: 2.4564 - val_accuracy: 0.0874 - val_loss: 1.7722
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3746 - loss: 1.9316 - val_accuracy: 0.0874 - val_loss: 1.7021
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4618 - loss: 1.6776 - val_accuracy: 0.1068 - val_loss: 1.6203
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5427 - loss: 1.4519 - val_accuracy: 0.4272 - val_loss: 1.5070
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5048 - loss: 1.4641 - val_accuracy: 0.5049 - val_loss: 1.4341
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5973 - loss: 1.3465 - val_accuracy: 0.5437 - val_loss: 1.3742
Epoch 7/35
[1m13/13[0m [32m━━━━━━━━━

TELUGU *xlm-roberta-base*

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
# Pick the torch device once for the whole notebook: CUDA when PyTorch reports
# it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
def load_dataset(base_dir='/content', lang='tel', filename="TE-AT-train.xlsx"):
    """Read a transcript workbook and return it as a tidy DataFrame.

    The workbook is looked up at ``<base_dir>/<lang>/text/<filename>`` and must
    contain the columns ``"Transcript"`` and ``"Class Label Short"``.

    Args:
        base_dir: Root directory that holds the per-language folders.
        lang: Name of the language sub-folder.
        filename: Workbook file name inside ``<lang>/text``. Defaults to the
            Telugu training file, so existing ``load_dataset()`` calls behave
            as before; pass another name to reuse the loader for other splits
            or languages instead of redefining the function.

    Returns:
        pd.DataFrame with columns ``audio_path`` (always ``"Nil"``),
        ``transcript``, ``class_label`` and ``gender`` (always ``"Unknown"``),
        one row per workbook row, indexed 0..n-1.

    Raises:
        KeyError: if one of the required columns is missing from the workbook.
    """
    text_file = os.path.join(base_dir, lang, "text", filename)
    text_df = pd.read_excel(text_file)
    # Column-wise selection replaces the row-by-row iterrows() loop; it also
    # keeps the four output columns when the sheet has no rows.
    dataset = (
        text_df[["Transcript", "Class Label Short"]]
        .rename(columns={"Transcript": "transcript", "Class Label Short": "class_label"})
        .reset_index(drop=True)
    )
    # Constant placeholder columns, in the same positions as before.
    dataset.insert(0, "audio_path", "Nil")
    dataset["gender"] = "Unknown"
    return dataset
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle

# Telugu transcripts, loaded with the loader's default arguments.
dataset_dft = load_dataset()

# Telugu stop words shipped with advertools, kept as a sorted list.
stopwords = sorted(adv.stopwords['telugu'])
def preprocess_tel_text(text):
    """Normalize a Telugu string, tokenize it and drop stop words.

    Args:
        text: Raw transcript string.

    Returns:
        The surviving tokens joined by single spaces (``''`` when nothing is left).
    """
    te_normalizer = IndicNormalizerFactory().get_normalizer("te")
    normalized = te_normalizer.normalize(text)
    kept = []
    for tok in indic_tokenize.trivial_tokenize(normalized, lang="te"):
        # `stopwords` is the module-level Telugu stop-word collection.
        if tok not in stopwords:
            kept.append(tok)
    return ' '.join(kept)
# Clean every Telugu transcript (normalize, tokenize, drop stop words).
dataset_dft['cleaned_transcript'] = dataset_dft['transcript'].apply(preprocess_tel_text)
# Map the string class labels to integer ids and persist the fitted encoder so
# the same mapping can be reloaded later.
label_encoder = LabelEncoder()
dataset_dft['encoded_label'] = label_encoder.fit_transform(dataset_dft['class_label'])
label_encoder_path = "tel_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")
# 80/20 split of the Telugu frame. The split previously read `dataset_df`,
# which is not the frame prepared in this cell, so the model was trained and
# scored on another dataset while the Telugu encoder supplied the class names.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dft['cleaned_transcript'], dataset_dft['encoded_label'], test_size=0.2, random_state=42
)
def extract_embeddings_exbert(model_name, texts, batch_size=16, max_length=128):
    """Embed texts with a pretrained encoder using mask-aware mean pooling.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: List of strings to embed.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        np.ndarray with one row per input text, each row being the mean of the
        model's ``last_hidden_state`` vectors over that text's non-padding
        tokens. An empty ``texts`` yields an empty array.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(
                batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
            )
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # Average only over real tokens. A plain mean(dim=1) also averages
            # the padded positions, so a text's embedding would change with the
            # length of the other texts in its batch.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Embed both splits with XLM-RoBERTa. Call the extractor defined in this cell;
# the previous calls went to `extract_embeddings_xlmr`, which this cell never
# defines, leaving the local helper unused.
X_train_embeddings = extract_embeddings_exbert("xlm-roberta-base", X_train.tolist())
X_test_embeddings = extract_embeddings_exbert("xlm-roberta-base", X_test.tolist())
print(f"XLM-RoBERTa Train Embeddings Shape: {X_train_embeddings.shape}")
print(f"XLM-RoBERTa Test Embeddings Shape: {X_test_embeddings.shape}")
# Fix the one-hot width to the encoder's class count. Without num_classes,
# to_categorical sizes the matrix from the largest id present, so a split that
# lacks the last class would not match the softmax layer.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# Dense classifier head over the fixed sentence embeddings.
model = Sequential([
    Dense(256, input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the test split is also used as validation data, so val_* and
# the final test metrics are computed on the same rows.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Explicit `labels` keeps the report aligned with target_names even when a
# class is absent from both y_test and the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
))

Using device: cpu
Label encoder saved to tel_label_encoder.pkl
XLM-RoBERTa Train Embeddings Shape: (411, 768)
XLM-RoBERTa Test Embeddings Shape: (103, 768)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.3261 - loss: 2.3492 - val_accuracy: 0.1553 - val_loss: 2.3514
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4004 - loss: 1.8629 - val_accuracy: 0.1553 - val_loss: 2.1389
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5182 - loss: 1.5583 - val_accuracy: 0.1553 - val_loss: 1.9581
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5802 - loss: 1.3393 - val_accuracy: 0.1553 - val_loss: 1.7831
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5748 - loss: 1.2853 - val_accuracy: 0.1553 - val_loss: 1.6198
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6159 - loss: 1.1657 - val_accuracy: 0.1650 - val_loss: 1.5153
Epoch 7/35
[1m13/13[0m [32m━━━━━

MALAYALAM using *xlm-roberta-base*

In [None]:
def load_dataset(base_dir='/content', lang='mal', filename="ML-AT-train.xlsx"):
    """Read a transcript workbook and return it as a tidy DataFrame.

    The workbook is looked up at ``<base_dir>/<lang>/text/<filename>`` and must
    contain the columns ``"Transcript"`` and ``"Class Label Short"``.

    Args:
        base_dir: Root directory that holds the per-language folders.
        lang: Name of the language sub-folder.
        filename: Workbook file name inside ``<lang>/text``. Defaults to the
            Malayalam training file, so existing ``load_dataset()`` calls
            behave as before; pass another name to reuse the loader for other
            splits or languages instead of redefining the function.

    Returns:
        pd.DataFrame with columns ``audio_path`` (always ``"Nil"``),
        ``transcript``, ``class_label`` and ``gender`` (always ``"Unknown"``),
        one row per workbook row, indexed 0..n-1.

    Raises:
        KeyError: if one of the required columns is missing from the workbook.
    """
    text_file = os.path.join(base_dir, lang, "text", filename)
    text_df = pd.read_excel(text_file)
    # Column-wise selection replaces the row-by-row iterrows() loop; it also
    # keeps the four output columns when the sheet has no rows.
    dataset = (
        text_df[["Transcript", "Class Label Short"]]
        .rename(columns={"Transcript": "transcript", "Class Label Short": "class_label"})
        .reset_index(drop=True)
    )
    # Constant placeholder columns, in the same positions as before.
    dataset.insert(0, "audio_path", "Nil")
    dataset["gender"] = "Unknown"
    return dataset
# Malayalam transcripts, loaded with the loader's default arguments
# (base_dir='/content', lang='mal').
dataset_dfm = load_dataset()

In [None]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import advertools as adv
import pickle
stop = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]
def preprocess_malayalam_text(text):
    """Normalize a Malayalam string, tokenize it and drop stop words.

    Args:
        text: Raw transcript string.

    Returns:
        The surviving tokens joined by single spaces (``''`` when nothing is left).
    """
    ml_normalizer = IndicNormalizerFactory().get_normalizer("ml")
    normalized = ml_normalizer.normalize(text)
    kept = []
    for tok in indic_tokenize.trivial_tokenize(normalized, lang="ml"):
        # `stop` is the module-level Malayalam stop-word list.
        if tok not in stop:
            kept.append(tok)
    return ' '.join(kept)
# Clean every Malayalam transcript with the Malayalam pipeline defined above.
# The previous call applied `preprocess_tamil_text`, leaving
# `preprocess_malayalam_text` and its stop-word list unused.
dataset_dfm['cleaned_transcript'] = dataset_dfm['transcript'].apply(preprocess_malayalam_text)
# Map the string class labels to integer ids and persist the fitted encoder so
# the same mapping can be reloaded later.
label_encoder = LabelEncoder()
dataset_dfm['encoded_label'] = label_encoder.fit_transform(dataset_dfm['class_label'])
label_encoder_path = "mal_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")

Label encoder saved to mal_label_encoder.pkl


In [None]:
# 80/20 split of the Malayalam frame. The split previously read `dataset_df`,
# which is not the Malayalam frame (`dataset_dfm`) that was cleaned and
# label-encoded above, so the model was trained and scored on another dataset.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_dfm['cleaned_transcript'], dataset_dfm['encoded_label'], test_size=0.2, random_state=42
)
def extract_embeddings_exbert(model_name, texts, batch_size=16, max_length=128):
    """Embed texts with a pretrained encoder using mask-aware mean pooling.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: List of strings to embed.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        np.ndarray with one row per input text, each row being the mean of the
        model's ``last_hidden_state`` vectors over that text's non-padding
        tokens. An empty ``texts`` yields an empty array.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(
                batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
            )
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # Average only over real tokens. A plain mean(dim=1) also averages
            # the padded positions, so a text's embedding would change with the
            # length of the other texts in its batch.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Embed both splits with XLM-RoBERTa. Call the extractor defined in this cell;
# the previous calls went to `extract_embeddings_xlmr`, which this cell never
# defines, leaving the local helper unused.
X_train_embeddings = extract_embeddings_exbert("xlm-roberta-base", X_train.tolist())
X_test_embeddings = extract_embeddings_exbert("xlm-roberta-base", X_test.tolist())
print(f"XLM-RoBERTa Train Embeddings Shape: {X_train_embeddings.shape}")
print(f"XLM-RoBERTa Test Embeddings Shape: {X_test_embeddings.shape}")
# Fix the one-hot width to the encoder's class count. Without num_classes,
# to_categorical sizes the matrix from the largest id present, so a split that
# lacks the last class would not match the softmax layer.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# Dense classifier head over the fixed sentence embeddings.
model = Sequential([
    Dense(256, input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# NOTE(review): the test split is also used as validation data, so val_* and
# the final test metrics are computed on the same rows.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=35, batch_size=32
)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Explicit `labels` keeps the report aligned with target_names even when a
# class is absent from both y_test and the predictions.
print(classification_report(
    y_test, y_pred_labels,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
))

XLM-RoBERTa Train Embeddings Shape: (411, 768)
XLM-RoBERTa Test Embeddings Shape: (103, 768)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.2266 - loss: 2.4651 - val_accuracy: 0.1165 - val_loss: 2.0876
Epoch 2/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3862 - loss: 1.8828 - val_accuracy: 0.1165 - val_loss: 1.9873
Epoch 3/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4516 - loss: 1.6964 - val_accuracy: 0.1165 - val_loss: 1.9304
Epoch 4/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5607 - loss: 1.3897 - val_accuracy: 0.1359 - val_loss: 1.8189
Epoch 5/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5974 - loss: 1.3386 - val_accuracy: 0.1359 - val_loss: 1.6918
Epoch 6/35
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6002 - loss: 1.1645 - val_accuracy: 0.1359 - val_loss: 1.6527
Epoch 7/35
[1m13/13[0m [32m━━━━━━━━━



[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 89ms/step



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
              precision    recall  f1-score   support

           C       0.47      0.56      0.51        16
           G       0.50      0.56      0.53         9
           N       0.82      0.91      0.86        58
           P       1.00      0.25      0.40         8
           R       0.71      0.42      0.53        12

    accuracy                           0.72       103
   macro avg       0.70      0.54      0.57       103
weighted avg       0.74      0.72      0.70       103

