Tamil

In [None]:
import advertools as adv

In [None]:
import os
import pickle

import nlpaug.augmenter.word as naw
import nltk
import numpy as np
import optuna
import pandas as pd
import torch
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from transformers import AutoTokenizer, AutoModel
# Fetch the NLTK "punkt" data; the captured output shows this is a no-op when it is already installed.
nltk.download('punkt')
# Run the transformer encoder on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_dataset(base_dir='/content', lang='tamil', filename="TA-AT-train.xlsx"):
    """Load one language's transcript spreadsheet into a flat DataFrame.

    Args:
        base_dir: Root directory that contains the per-language folders.
        lang: Name of the language folder under ``base_dir``.
        filename: Spreadsheet name inside ``<base_dir>/<lang>/text``.

    Returns:
        DataFrame with the columns ``audio_path``, ``transcript``, ``class_label``
        and ``gender`` (in that order). ``audio_path`` is always ``"Nil"`` and
        ``gender`` always ``"Unknown"``. An empty sheet yields an empty frame
        that still carries all four columns.

    Raises:
        KeyError: If the sheet lacks the "Transcript" or "Class Label Short" column.
    """
    text_file = os.path.join(base_dir, lang, "text", filename)
    text_df = pd.read_excel(text_file)
    # Build the frame column-wise instead of one Python dict per row; .to_numpy()
    # drops the sheet's index so the result always has a fresh 0..n-1 index.
    dataset = pd.DataFrame({
        "transcript": text_df["Transcript"].to_numpy(),
        "class_label": text_df["Class Label Short"].to_numpy(),
    })
    dataset.insert(0, "audio_path", "Nil")
    dataset["gender"] = "Unknown"
    return dataset

# Load the Tamil sheet using load_dataset's defaults (base_dir='/content', lang='tamil').
dataset_df = load_dataset()

In [None]:
# Tamil stop words shipped with advertools, as an alphabetically sorted list.
stopwords = sorted(adv.stopwords['tamil'])

def preprocess_tamil_text(text):
    """Normalise Tamil text, tokenise it and drop stop words.

    The text is run through the Indic NLP normaliser for "ta", split with the
    trivial tokenizer, filtered against the notebook-level ``stopwords``
    collection, and re-joined with single spaces.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript as a single space-separated string.
    """
    ta_normalizer = IndicNormalizerFactory().get_normalizer("ta")
    normalized = ta_normalizer.normalize(text)
    kept = []
    for tok in indic_tokenize.trivial_tokenize(normalized, lang="ta"):
        if tok in stopwords:
            continue
        kept.append(tok)
    return ' '.join(kept)
# Store the normalised, stop-word-filtered text next to the raw transcript.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tamil_text)
def augment_text(text, num_augments=2):
    """Return up to ``num_augments`` synonym-augmented variants of ``text``.

    The WordNet synonym augmenter is built once and reused on later calls
    instead of being re-created for every transcript.

    Args:
        text: Cleaned transcript to augment.
        num_augments: Number of variants to request.

    Returns:
        List of strings. If augmentation raises, the variants produced so far
        are kept and the unmodified ``text`` is appended once as a fallback.
    """
    # NOTE(review): confirm the WordNet source really covers lang='tam'; if it
    # has no entries the "augmented" rows are plain copies of the input.
    aug = getattr(augment_text, "_augmenter", None)
    if aug is None:
        aug = augment_text._augmenter = naw.SynonymAug(aug_src='wordnet', lang='tam')
    augmented_texts = []
    try:
        for _ in range(num_augments):
            variant = aug.augment(text)
            # Some nlpaug versions return a list of strings rather than a bare
            # string; unwrap it so the transcript column only ever holds strings.
            if isinstance(variant, (list, tuple)):
                variant = variant[0] if variant else text
            augmented_texts.append(variant)
    except Exception as exc:
        # Best-effort fallback kept from the original, but no longer silent.
        import warnings
        warnings.warn(f"Synonym augmentation failed ({exc!r}); keeping the original text.")
        augmented_texts.append(text)
    return augmented_texts

In [None]:
# Build the augmented corpus: every cleaned transcript contributes the variants
# returned by augment_text, each inheriting the label of its source utterance.
augmented_data = [
    {"transcript": aug_text, "class_label": class_label}
    for cleaned_text, class_label in zip(dataset_df['cleaned_transcript'], dataset_df['class_label'])
    for aug_text in augment_text(cleaned_text, num_augments=2)
]
augmented_df = pd.DataFrame(augmented_data)
# NOTE(review): rows coming from dataset_df keep the raw text in 'transcript',
# while the augmented rows hold cleaned text — confirm this mix is intended.
full_dataset_df = pd.concat([dataset_df, augmented_df], ignore_index=True)
label_encoder = LabelEncoder()
full_dataset_df['encoded_label'] = label_encoder.fit_transform(full_dataset_df['class_label'])
label_encoder_path = "tamil_label_encoder.pkl"
# `pickle` is provided by the notebook's import cell; without that import this
# cell fails with a NameError on a fresh kernel.
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptro

Label encoder saved to tamil_label_encoder.pkl


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

In [None]:
# Row-wise 80/20 split of the combined (original + augmented) corpus with a fixed seed.
# NOTE(review): full_dataset_df contains each original transcript together with the
# variants derived from it, so variants of one utterance can end up on both sides of
# this split — confirm whether splitting before augmentation was intended.
X_train, X_test, y_train, y_test = train_test_split(
    full_dataset_df['transcript'], full_dataset_df['encoded_label'], test_size=0.2, random_state=42
)
def extract_embeddings(model_name, texts):
    """Embed each text as the attention-masked mean of the encoder's last hidden states.

    The tokenizer and model are loaded from ``model_name`` on every call and run
    on the notebook-level ``device`` in batches of 16 without gradients.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: List of strings to embed.

    Returns:
        ``np.ndarray`` with one row per input text (an empty array for empty input).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    batch_size = 16
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # padding=True pads every text to the longest one in its batch. A plain
            # mean over dim=1 would average the pad positions in as well, making an
            # embedding depend on which other texts share its batch.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Embed both splits with XLM-RoBERTa large. Each call loads the tokenizer and the
# model again (see extract_embeddings), so this cell loads the encoder twice.
X_train_embeddings = extract_embeddings("xlm-roberta-large", X_train.tolist())
X_test_embeddings = extract_embeddings("xlm-roberta-large", X_test.tolist())

In [None]:
def objective(trial):
    """Optuna objective: fit one candidate MLP and return its validation loss.

    Reads the notebook globals ``X_train_embeddings``, ``y_train`` and
    ``label_encoder``.

    Args:
        trial: Optuna trial used to sample the two layer widths, the two dropout
            rates, the batch size and the learning rate.

    Returns:
        Validation loss after the last of 20 epochs, measured on the 20%
        validation split held out from the training embeddings.
    """
    num_units_1 = trial.suggest_int("num_units_1", 128, 512)
    num_units_2 = trial.suggest_int("num_units_2", 64, 256)
    dropout_rate_1 = trial.suggest_float("dropout_rate_1", 0.2, 0.5)
    dropout_rate_2 = trial.suggest_float("dropout_rate_2", 0.1, 0.4)
    batch_size = trial.suggest_int("batch_size", 16, 64)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
    num_classes = len(label_encoder.classes_)
    model = Sequential([
        Dense(num_units_1, input_dim=X_train_embeddings.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate_1),
        Dense(num_units_2, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate_2),
        Dense(num_classes, activation='softmax')
    ])
    # The sampled learning rate must be handed to the optimizer; with the string
    # 'adam' every trial trained at the optimizer's default rate and the sampled
    # value was never used.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
    # Pin the one-hot width so it always matches the softmax layer.
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    history = model.fit(
        X_train_embeddings, y_train_cat,
        validation_split=0.2, epochs=20, batch_size=batch_size, verbose=0
    )
    val_loss = history.history['val_loss'][-1]
    return val_loss
# Search 10 configurations, then rebuild the best one and train it for 100 epochs.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
best_params = study.best_params
print("Best hyperparameters:", best_params)
num_classes = len(label_encoder.classes_)
model = Sequential([
    Dense(best_params['num_units_1'], input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(best_params['dropout_rate_1']),
    Dense(best_params['num_units_2'], activation='relu'),
    BatchNormalization(),
    Dropout(best_params['dropout_rate_2']),
    Dense(num_classes, activation='softmax')
])
# Train the final model with the learning rate the study actually selected.
model.compile(optimizer=Adam(learning_rate=best_params['learning_rate']), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
# NOTE(review): the test split doubles as validation data here; it only feeds the
# per-epoch val_* metrics because no callbacks are attached.
history = model.fit(
    X_train_embeddings, y_train_cat,
    validation_data=(X_test_embeddings, y_test_cat),
    epochs=100, batch_size=best_params['batch_size']
)
loss, accuracy = model.evaluate(X_test_embeddings, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")
y_pred = model.predict(X_test_embeddings)
y_pred_labels = np.argmax(y_pred, axis=1)
# Passing `labels` keeps the report aligned with target_names even when a class
# is absent from the test split.
print(classification_report(y_test, y_pred_labels, labels=np.arange(num_classes), target_names=label_encoder.classes_))

[I 2025-01-09 14:08:52,356] A new study created in memory with name: no-name-4de810ff-54c1-41cc-89b9-715b88921f82
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
[I 2025-01-09 14:09:01,397] Trial 0 finished with value: 0.5255210995674133 and parameters: {'num_units_1': 397, 'num_units_2': 116, 'dropout_rate_1': 0.33223325283250327, 'dropout_rate_2': 0.283030780978207, 'batch_size': 29, 'learning_rate': 0.0026683399277697807}. Best is trial 0 with value: 0.5255210995674133.
[I 2025-01-09 14:09:11,448] Trial 1 finished with value: 1.0691577196121216 and parameters: {'num_units_1': 396, 'num_units_2': 106, 'dropout_rate_1': 0.2048708289980794, 'dropout_rate_2': 0.35825112508917634, 'batch_size': 21, 'learning_rate': 0.0014217226760367182}. Best is trial 0 with value: 0.5255210995674133.
[I 2025-01-09 14:09:17,143] Trial 2 finished with value: 1.1530791521072388 and parameters: {'num_units_1': 266, 'num_units_2': 94, 'dropout_rate_1': 0.2336586719453222, 'dropout_ra

Best hyperparameters: {'num_units_1': 382, 'num_units_2': 158, 'dropout_rate_1': 0.24663513734779172, 'dropout_rate_2': 0.17273137979778136, 'batch_size': 29, 'learning_rate': 0.008934613266542227}


Epoch 1/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.3591 - loss: 1.8794 - val_accuracy: 0.1068 - val_loss: 2.4300
Epoch 2/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.6889 - loss: 0.9103 - val_accuracy: 0.1068 - val_loss: 2.1271
Epoch 3/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.7754 - loss: 0.6675 - val_accuracy: 0.1068 - val_loss: 1.9569
Epoch 4/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8224 - loss: 0.5232 - val_accuracy: 0.1456 - val_loss: 1.5314
Epoch 5/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9012 - loss: 0.3283 - val_accuracy: 0.6456 - val_loss: 1.1042
Epoch 6/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9302 - loss: 0.2256 - val_accuracy: 0.6942 - val_loss: 1.0480
Epoch 7/100
[1m29/29[0m [

In [None]:
# Persist the trained Tamil classifier to an .h5 file; the path is relative, so
# the file is written to the current working directory.
model.save("best_tamil_hyper_classification_model.h5")
print("Model saved as 'best_tamil_hyper_classification_model.h5'")



Model saved as 'best_tamil_hyper_classification_model.h5'


In [None]:
from tensorflow.keras.models import load_model
import pickle
# Reload the saved Tamil classifier and label encoder, then classify one sentence.
# NOTE(review): the model is read from /content while it was saved with a relative
# path — this only matches when the working directory is /content.
model = load_model("/content/best_tamil_hyper_classification_model.h5")
# pickle.load can execute code embedded in the file; only load encoders written by this notebook.
with open("tamil_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)
custom_input = "மற்ற மதங்களை ஆதரிக்கிறவர்கள் எல்லாம் துரோகிகள்."  
# Apply the Tamil preprocessing and the XLM-R encoder before prediction.
custom_input_cleaned = preprocess_tamil_text(custom_input)
custom_input_embedding = extract_embeddings("xlm-roberta-large", [custom_input_cleaned])
# argmax over the softmax output gives the encoded class; map it back to the label string.
predicted_label_index = np.argmax(model.predict(custom_input_embedding), axis=1)[0]
predicted_label = label_encoder.inverse_transform([predicted_label_index])[0]
print(f"Predicted Label: {predicted_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
Predicted Label: R


TELUGU

In [None]:
# Re-select the compute device for the Telugu section (GPU if torch can see one, else CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_dataset(base_dir='/content', lang='tel', filename="TE-AT-train.xlsx"):
    """Load one language's transcript spreadsheet into a flat DataFrame.

    Args:
        base_dir: Root directory that contains the per-language folders.
        lang: Name of the language folder under ``base_dir``.
        filename: Spreadsheet name inside ``<base_dir>/<lang>/text``.

    Returns:
        DataFrame with the columns ``audio_path``, ``transcript``, ``class_label``
        and ``gender`` (in that order). ``audio_path`` is always ``"Nil"`` and
        ``gender`` always ``"Unknown"``. An empty sheet yields an empty frame
        that still carries all four columns.

    Raises:
        KeyError: If the sheet lacks the "Transcript" or "Class Label Short" column.
    """
    text_file = os.path.join(base_dir, lang, "text", filename)
    text_df = pd.read_excel(text_file)
    # Column-wise construction replaces the per-row dict loop; .to_numpy() drops
    # the sheet's index so the result always has a fresh 0..n-1 index.
    dataset = pd.DataFrame({
        "transcript": text_df["Transcript"].to_numpy(),
        "class_label": text_df["Class Label Short"].to_numpy(),
    })
    dataset.insert(0, "audio_path", "Nil")
    dataset["gender"] = "Unknown"
    return dataset

# Load the Telugu sheet using load_dataset's defaults.
dataset_df = load_dataset()
# No Telugu stop-word list is defined, so the filter in preprocess_tel_text removes nothing.
stopwords = []  
def preprocess_tel_text(text):
    """Normalise Telugu text, tokenise it and drop stop words.

    Uses the Indic NLP normaliser and trivial tokenizer for "te" and filters the
    tokens against the notebook-level ``stopwords`` collection.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript as a single space-separated string.
    """
    te_normalizer = IndicNormalizerFactory().get_normalizer("te")
    pieces = indic_tokenize.trivial_tokenize(te_normalizer.normalize(text), lang="te")
    return ' '.join(filter(lambda piece: piece not in stopwords, pieces))
# Store the normalised Telugu text next to the raw transcript.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_tel_text)
def augment_text(text, num_augments=2):
    """Return up to ``num_augments`` synonym-augmented variants of ``text``.

    The WordNet synonym augmenter is built once and reused on later calls
    instead of being re-created for every transcript.

    Args:
        text: Cleaned transcript to augment.
        num_augments: Number of variants to request.

    Returns:
        List of strings. If augmentation raises, the variants produced so far
        are kept and the unmodified ``text`` is appended once as a fallback.
    """
    # NOTE(review): confirm the WordNet source really covers lang='tel'; if it
    # has no entries the "augmented" rows are plain copies of the input.
    aug = getattr(augment_text, "_augmenter", None)
    if aug is None:
        aug = augment_text._augmenter = naw.SynonymAug(aug_src='wordnet', lang='tel')
    augmented_texts = []
    try:
        for _ in range(num_augments):
            variant = aug.augment(text)
            # Some nlpaug versions return a list of strings rather than a bare
            # string; unwrap it so the transcript column only ever holds strings.
            if isinstance(variant, (list, tuple)):
                variant = variant[0] if variant else text
            augmented_texts.append(variant)
    except Exception as exc:
        # Best-effort fallback kept from the original, but no longer silent.
        import warnings
        warnings.warn(f"Synonym augmentation failed ({exc!r}); keeping the original text.")
        augmented_texts.append(text)
    return augmented_texts
# Build the augmented Telugu corpus: every cleaned transcript contributes the
# variants returned by augment_text, each inheriting its source row's label.
augmented_data = []
for _, row in dataset_df.iterrows():
    augmented_transcripts = augment_text(row['cleaned_transcript'], num_augments=2)
    for aug_text in augmented_transcripts:
        augmented_data.append({
            "transcript": aug_text,
            "class_label": row['class_label']
        })
augmented_df = pd.DataFrame(augmented_data)
# NOTE(review): rows from dataset_df keep the raw text in 'transcript', while the
# augmented rows hold cleaned text — confirm this mix is intended.
full_dataset_df = pd.concat([dataset_df, augmented_df], ignore_index=True)
# Encode the class labels as integers and persist the encoder for inference.
label_encoder = LabelEncoder()
full_dataset_df['encoded_label'] = label_encoder.fit_transform(full_dataset_df['class_label'])
label_encoder_path = "tel_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptro

Label encoder saved to tel_label_encoder.pkl


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

In [None]:
# Row-wise 80/20 split of the combined (original + augmented) Telugu corpus with a fixed seed.
# NOTE(review): variants derived from one utterance can land on both sides of this
# split because augmentation happens before it — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    full_dataset_df['transcript'], full_dataset_df['encoded_label'], test_size=0.2, random_state=42
)
def extract_embeddings(model_name, texts):
    """Embed each text as the attention-masked mean of the encoder's last hidden states.

    The tokenizer and model are loaded from ``model_name`` on every call and run
    on the notebook-level ``device`` in batches of 16 without gradients.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: List of strings to embed.

    Returns:
        ``np.ndarray`` with one row per input text (an empty array for empty input).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    batch_size = 16
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # padding=True pads every text to the longest one in its batch. A plain
            # mean over dim=1 would average the pad positions in as well, making an
            # embedding depend on which other texts share its batch.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Embed both Telugu splits with XLM-RoBERTa large; each call reloads the tokenizer and model.
X_train_embeddings = extract_embeddings("xlm-roberta-large", X_train.tolist())
X_test_embeddings = extract_embeddings("xlm-roberta-large", X_test.tolist())
def objective(trial):
    """Optuna objective: fit one candidate MLP and return its best validation loss.

    Reads the notebook globals ``X_train_embeddings``, ``y_train`` and
    ``label_encoder``.

    Args:
        trial: Optuna trial used to sample the two layer widths, the two dropout
            rates, the batch size and the learning rate.

    Returns:
        Lowest validation loss seen over 10 epochs on the 20% validation split
        held out from the training embeddings.
    """
    num_units_1 = trial.suggest_int("num_units_1", 128, 512)
    num_units_2 = trial.suggest_int("num_units_2", 64, 256)
    dropout_rate_1 = trial.suggest_float("dropout_rate_1", 0.2, 0.5)
    dropout_rate_2 = trial.suggest_float("dropout_rate_2", 0.1, 0.4)
    batch_size = trial.suggest_int("batch_size", 16, 64)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
    num_classes = len(label_encoder.classes_)
    model = Sequential([
        Dense(num_units_1, input_dim=X_train_embeddings.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate_1),
        Dense(num_units_2, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate_2),
        Dense(num_classes, activation='softmax')
    ])
    # The sampled learning rate must be handed to the optimizer; with the string
    # 'adam' every trial trained at the optimizer's default rate and the sampled
    # value was never used.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
    # Pin the one-hot width so it always matches the softmax layer.
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    history = model.fit(
        X_train_embeddings, y_train_cat,
        epochs=10, batch_size=batch_size,
        validation_split=0.2, verbose=0
    )
    val_loss = min(history.history['val_loss'])
    return val_loss
# Search 20 configurations, then rebuild the best one and train it for 100 epochs.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
best_params = study.best_params
num_classes = len(label_encoder.classes_)
final_model = Sequential([
    Dense(best_params["num_units_1"], input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(best_params["dropout_rate_1"]),
    Dense(best_params["num_units_2"], activation='relu'),
    BatchNormalization(),
    Dropout(best_params["dropout_rate_2"]),
    Dense(num_classes, activation='softmax')
])
# Train the final model with the learning rate the study actually selected.
final_model.compile(optimizer=Adam(learning_rate=best_params["learning_rate"]), loss='categorical_crossentropy', metrics=['accuracy'])
y_train_cat = to_categorical(y_train, num_classes=num_classes)
final_model.fit(
    X_train_embeddings, y_train_cat,
    epochs=100, batch_size=best_params["batch_size"],
    validation_split=0.2, verbose=1
)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
test_loss, test_accuracy = final_model.evaluate(X_test_embeddings, y_test_cat, verbose=1)
print(f"Test Accuracy: {test_accuracy}")
y_pred = np.argmax(final_model.predict(X_test_embeddings), axis=1)
# Passing `labels` keeps the report aligned with target_names even when a class
# is absent from the test split.
print(classification_report(y_test, y_pred, labels=np.arange(num_classes), target_names=label_encoder.classes_))


[I 2025-01-09 14:36:22,307] A new study created in memory with name: no-name-213a4b38-da9b-4250-bfb7-3c512c659cb7
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
[I 2025-01-09 14:36:27,683] Trial 0 finished with value: 0.807628333568573 and parameters: {'num_units_1': 433, 'num_units_2': 165, 'dropout_rate_1': 0.47996842064226874, 'dropout_rate_2': 0.21204605200199675, 'batch_size': 31, 'learning_rate': 0.00909812291812723}. Best is trial 0 with value: 0.807628333568573.
[I 2025-01-09 14:36:33,590] Trial 1 finished with value: 0.9258256554603577 and parameters: {'num_units_1': 279, 'num_units_2': 197, 'dropout_rate_1': 0.3935509439937255, 'dropout_rate_2': 0.301804598744449, 'batch_size': 36, 'learning_rate': 0.003329627543525666}. Best is trial 0 with value: 0.807628333568573.
[I 2025-01-09 14:36:38,800] Trial 2 finished with value: 1.0021103620529175 and parameters: {'num_units_1': 500, 'num_units_2': 92, 'dropout_rate_1': 0.45669575071925644, 'dropout_rate_2'

Epoch 1/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.3989 - loss: 1.6799 - val_accuracy: 0.3483 - val_loss: 1.4782
Epoch 2/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6955 - loss: 0.8597 - val_accuracy: 0.3596 - val_loss: 1.3611
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8041 - loss: 0.5780 - val_accuracy: 0.3596 - val_loss: 1.4231
Epoch 4/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8456 - loss: 0.4591 - val_accuracy: 0.6348 - val_loss: 1.0642
Epoch 5/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8842 - loss: 0.3905 - val_accuracy: 0.5674 - val_loss: 1.0815
Epoch 6/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8928 - loss: 0.3076 - val_accuracy: 0.8202 - val_loss: 0.7485
Epoch 7/100
[1m45/45[0m [32m━━

In [None]:
# Persist the Telugu classifier trained in this section. The network fitted above
# is `final_model`; `model` is still the classifier loaded from disk in the earlier
# Tamil inference cell, so saving it here would write the wrong weights.
final_model.save("best_tel_hyper_classification_model.h5")
print("Model saved as 'best_tel_hyper_classification_model.h5'")



Model saved as 'best_tel_hyper_classification_model.h5'


In [None]:
from tensorflow.keras.models import load_model
import pickle
# Reload the saved Telugu classifier and label encoder, then classify one sentence.
model = load_model("/content/best_tel_hyper_classification_model.h5")
# pickle.load can execute code embedded in the file; only load encoders written by this notebook.
with open("tel_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)
custom_input = "ప్రతీ మతం శాంతి మరియు ప్రేమను ప్రోత్సహిస్తుంది."
# Use the Telugu preprocessing that produced the training text; the Tamil
# normaliser/tokenizer was applied here before.
custom_input_cleaned = preprocess_tel_text(custom_input)
custom_input_embedding = extract_embeddings("xlm-roberta-large", [custom_input_cleaned])
# argmax over the softmax output gives the encoded class; map it back to the label string.
predicted_label_index = np.argmax(model.predict(custom_input_embedding), axis=1)[0]
predicted_label = label_encoder.inverse_transform([predicted_label_index])[0]
print(f"Predicted Label: {predicted_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
Predicted Label: N


MALAYALAM

In [None]:
# Re-select the compute device for the Malayalam section (GPU if torch can see one, else CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_dataset(base_dir='/content', lang='mal', filename="ML-AT-train.xlsx"):
    """Load one language's transcript spreadsheet into a flat DataFrame.

    Args:
        base_dir: Root directory that contains the per-language folders.
        lang: Name of the language folder under ``base_dir``.
        filename: Spreadsheet name inside ``<base_dir>/<lang>/text``.

    Returns:
        DataFrame with the columns ``audio_path``, ``transcript``, ``class_label``
        and ``gender`` (in that order). ``audio_path`` is always ``"Nil"`` and
        ``gender`` always ``"Unknown"``. An empty sheet yields an empty frame
        that still carries all four columns.

    Raises:
        KeyError: If the sheet lacks the "Transcript" or "Class Label Short" column.
    """
    text_file = os.path.join(base_dir, lang, "text", filename)
    text_df = pd.read_excel(text_file)
    # Column-wise construction replaces the per-row dict loop; .to_numpy() drops
    # the sheet's index so the result always has a fresh 0..n-1 index.
    dataset = pd.DataFrame({
        "transcript": text_df["Transcript"].to_numpy(),
        "class_label": text_df["Class Label Short"].to_numpy(),
    })
    dataset.insert(0, "audio_path", "Nil")
    dataset["gender"] = "Unknown"
    return dataset
# Load the Malayalam sheet using load_dataset's defaults.
dataset_df = load_dataset()
# Hand-written Malayalam stop-word list used by preprocess_malayalam_text.
# NOTE(review): some entries differ only by a trailing invisible joiner character
# and the list covers few word stems — confirm it against a vetted stop-word source.
stopwords = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]
def preprocess_malayalam_text(text):
    """Normalise Malayalam text, tokenise it and drop stop words.

    Uses the Indic NLP normaliser and trivial tokenizer for "ml" and filters the
    tokens against the notebook-level ``stopwords`` collection.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript as a single space-separated string.
    """
    ml_normalizer = IndicNormalizerFactory().get_normalizer("ml")
    normalized = ml_normalizer.normalize(text)
    words = indic_tokenize.trivial_tokenize(normalized, lang="ml")
    return ' '.join(word for word in words if word not in stopwords)
# Store the normalised, stop-word-filtered Malayalam text next to the raw transcript.
dataset_df['cleaned_transcript'] = dataset_df['transcript'].apply(preprocess_malayalam_text)
def augment_text(text, num_augments=2):
    """Return up to ``num_augments`` synonym-augmented variants of ``text``.

    The WordNet synonym augmenter is built once and reused on later calls
    instead of being re-created for every transcript.

    Args:
        text: Cleaned transcript to augment.
        num_augments: Number of variants to request.

    Returns:
        List of strings. If augmentation raises, the variants produced so far
        are kept and the unmodified ``text`` is appended once as a fallback.
    """
    # NOTE(review): no `lang` is passed here, unlike the Tamil/Telugu variants —
    # confirm the augmenter's default language can do anything useful with
    # Malayalam text; otherwise the "augmented" rows are plain copies.
    aug = getattr(augment_text, "_augmenter", None)
    if aug is None:
        aug = augment_text._augmenter = naw.SynonymAug(aug_src='wordnet')
    augmented_texts = []
    try:
        for _ in range(num_augments):
            variant = aug.augment(text)
            # Some nlpaug versions return a list of strings rather than a bare
            # string; unwrap it so the transcript column only ever holds strings.
            if isinstance(variant, (list, tuple)):
                variant = variant[0] if variant else text
            augmented_texts.append(variant)
    except Exception as exc:
        # Best-effort fallback kept from the original, but no longer silent.
        import warnings
        warnings.warn(f"Synonym augmentation failed ({exc!r}); keeping the original text.")
        augmented_texts.append(text)
    return augmented_texts
# Build the augmented Malayalam corpus: every cleaned transcript contributes the
# variants returned by augment_text, each inheriting its source row's label.
augmented_data = []
for _, row in dataset_df.iterrows():
    augmented_transcripts = augment_text(row['cleaned_transcript'], num_augments=2)
    for aug_text in augmented_transcripts:
        augmented_data.append({
            "transcript": aug_text,
            "class_label": row['class_label']
        })
augmented_df = pd.DataFrame(augmented_data)
# NOTE(review): rows from dataset_df keep the raw text in 'transcript', while the
# augmented rows hold cleaned text — confirm this mix is intended.
full_dataset_df = pd.concat([dataset_df, augmented_df], ignore_index=True)
# Encode the class labels as integers and persist the encoder for inference.
label_encoder = LabelEncoder()
full_dataset_df['encoded_label'] = label_encoder.fit_transform(full_dataset_df['class_label'])
label_encoder_path = "malayalam_label_encoder.pkl"
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print(f"Label encoder saved to {label_encoder_path}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptro

Label encoder saved to malayalam_label_encoder.pkl


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger t

In [None]:
# Row-wise 80/20 split of the combined (original + augmented) Malayalam corpus with a fixed seed.
# NOTE(review): variants derived from one utterance can land on both sides of this
# split because augmentation happens before it — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    full_dataset_df['transcript'], full_dataset_df['encoded_label'], test_size=0.2, random_state=42
)
def extract_embeddings(model_name, texts):
    """Embed each text as the attention-masked mean of the encoder's last hidden states.

    The tokenizer and model are loaded from ``model_name`` on every call and run
    on the notebook-level ``device`` in batches of 16 without gradients.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: List of strings to embed.

    Returns:
        ``np.ndarray`` with one row per input text (an empty array for empty input).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    embeddings = []
    batch_size = 16
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
            encoded_inputs = {key: tensor.to(device) for key, tensor in encoded_inputs.items()}
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # padding=True pads every text to the longest one in its batch. A plain
            # mean over dim=1 would average the pad positions in as well, making an
            # embedding depend on which other texts share its batch.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# Embed both Malayalam splits with XLM-RoBERTa large; each call reloads the tokenizer and model.
X_train_embeddings = extract_embeddings("xlm-roberta-large", X_train.tolist())
X_test_embeddings = extract_embeddings("xlm-roberta-large", X_test.tolist())
def objective(trial):
    """Optuna objective: fit one candidate MLP and return its best validation loss.

    Reads the notebook globals ``X_train_embeddings``, ``y_train`` and
    ``label_encoder``.

    Args:
        trial: Optuna trial used to sample the two layer widths, the two dropout
            rates, the batch size and the learning rate.

    Returns:
        Lowest validation loss seen over 10 epochs on the 20% validation split
        held out from the training embeddings.
    """
    num_units_1 = trial.suggest_int("num_units_1", 128, 512)
    num_units_2 = trial.suggest_int("num_units_2", 64, 256)
    dropout_rate_1 = trial.suggest_float("dropout_rate_1", 0.2, 0.5)
    dropout_rate_2 = trial.suggest_float("dropout_rate_2", 0.1, 0.4)
    batch_size = trial.suggest_int("batch_size", 16, 64)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2)
    num_classes = len(label_encoder.classes_)
    model = Sequential([
        Dense(num_units_1, input_dim=X_train_embeddings.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate_1),
        Dense(num_units_2, activation='relu'),
        BatchNormalization(),
        Dropout(dropout_rate_2),
        Dense(num_classes, activation='softmax')
    ])
    # The sampled learning rate must be handed to the optimizer; with the string
    # 'adam' every trial trained at the optimizer's default rate and the sampled
    # value was never used.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
    # Pin the one-hot width so it always matches the softmax layer.
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    history = model.fit(
        X_train_embeddings, y_train_cat,
        epochs=10, batch_size=batch_size,
        validation_split=0.2, verbose=0
    )
    val_loss = min(history.history['val_loss'])
    return val_loss
# Search 20 configurations, then rebuild the best one and train it for 15 epochs.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
best_params = study.best_params
num_classes = len(label_encoder.classes_)
final_model = Sequential([
    Dense(best_params["num_units_1"], input_dim=X_train_embeddings.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(best_params["dropout_rate_1"]),
    Dense(best_params["num_units_2"], activation='relu'),
    BatchNormalization(),
    Dropout(best_params["dropout_rate_2"]),
    Dense(num_classes, activation='softmax')
])
# Train the final model with the learning rate the study actually selected.
final_model.compile(optimizer=Adam(learning_rate=best_params["learning_rate"]), loss='categorical_crossentropy', metrics=['accuracy'])
y_train_cat = to_categorical(y_train, num_classes=num_classes)
final_model.fit(
    X_train_embeddings, y_train_cat,
    epochs=15, batch_size=best_params["batch_size"],
    validation_split=0.2, verbose=1
)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
test_loss, test_accuracy = final_model.evaluate(X_test_embeddings, y_test_cat, verbose=1)
print(f"Test Accuracy: {test_accuracy}")
y_pred = np.argmax(final_model.predict(X_test_embeddings), axis=1)
# Passing `labels` keeps the report aligned with target_names even when a class
# is absent from the test split.
print(classification_report(y_test, y_pred, labels=np.arange(num_classes), target_names=label_encoder.classes_))

[I 2025-01-09 15:04:31,446] A new study created in memory with name: no-name-bd55a923-6c4f-4b4c-87e1-04e4010df31f
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
[I 2025-01-09 15:04:36,618] Trial 0 finished with value: 0.8289971351623535 and parameters: {'num_units_1': 306, 'num_units_2': 138, 'dropout_rate_1': 0.436852566557132, 'dropout_rate_2': 0.12402948610235735, 'batch_size': 55, 'learning_rate': 0.003611425353625867}. Best is trial 0 with value: 0.8289971351623535.
[I 2025-01-09 15:04:42,958] Trial 1 finished with value: 1.2145863771438599 and parameters: {'num_units_1': 136, 'num_units_2': 122, 'dropout_rate_1': 0.46400510309308146, 'dropout_rate_2': 0.3389597201360252, 'batch_size': 60, 'learning_rate': 0.0064531145065171575}. Best is trial 0 with value: 0.8289971351623535.
[I 2025-01-09 15:04:48,231] Trial 2 finished with value: 0.8717195391654968 and parameters: {'num_units_1': 328, 'num_units_2': 183, 'dropout_rate_1': 0.37659768474311794, 'dropout_r

Epoch 1/15
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.4253 - loss: 1.6015 - val_accuracy: 0.2049 - val_loss: 1.6265
Epoch 2/15
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6805 - loss: 0.8624 - val_accuracy: 0.4912 - val_loss: 1.2820
Epoch 3/15
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7975 - loss: 0.5868 - val_accuracy: 0.5618 - val_loss: 1.0441
Epoch 4/15
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8734 - loss: 0.3983 - val_accuracy: 0.7244 - val_loss: 0.7963
Epoch 5/15
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8672 - loss: 0.3943 - val_accuracy: 0.6890 - val_loss: 0.7861
Epoch 6/15
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8815 - loss: 0.3416 - val_accuracy: 0.7880 - val_loss: 0.6272
Epoch 7/15
[1m71/71[0m [32m━━━━

In [None]:
# Persist the Malayalam classifier trained in this section. The network fitted
# above is `final_model`; `model` is still the classifier loaded from disk in an
# earlier inference cell, so saving it here would write the wrong weights.
final_model.save("best_mal_hyper_classification_model.h5")
print("Model saved as 'best_mal_hyper_classification_model.h5'")



Model saved as 'best_mal_hyper_classification_model.h5'


In [None]:
from tensorflow.keras.models import load_model
import pickle
# Smoke test: classify one hand-written sentence with the saved classifier.
model = load_model("/content/best_mal_hyper_classification_model.h5")
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# an encoder file that this notebook itself produced.
# NOTE(review): the encoder is opened relative to the working directory while the
# model is loaded from /content/ -- confirm both resolve to the same folder.
with open("malayalam_label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)
custom_input = "മതം എന്നാൽ വെല്ലുവിളികൾ ഇല്ലാതെ എല്ലാവരും സമാധാനത്തിനായി ഒരുമിച്ച് പ്രവർത്തിക്കണം" 
# NOTE(review): the sentence above is Malayalam and the model/encoder files are the
# Malayalam ones, yet the text is cleaned with preprocess_tamil_text. This looks
# like a copy-paste slip; confirm which preprocessing the Malayalam classifier was
# trained with and use the same function here.
custom_input_cleaned = preprocess_tamil_text(custom_input)
# A one-element list is passed, so a single embedding row is expected back
# (assumes extract_embeddings returns one row per input text -- TODO confirm
# against the definition that is live when this cell runs).
custom_input_embedding = extract_embeddings("xlm-roberta-large", [custom_input_cleaned])
# argmax over axis 1 gives the highest-scoring class index per row; [0] takes the only row.
predicted_label_index = np.argmax(model.predict(custom_input_embedding), axis=1)[0]
# Map the integer class index back to its original label.
predicted_label = label_encoder.inverse_transform([predicted_label_index])[0]
print(f"Predicted Label: {predicted_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
Predicted Label: R


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)
# Hand-written Malayalam stopword list. preprocess_Mal_text drops every token
# that is exactly equal to one of these entries.
# NOTE(review): a few entries look like near-duplicates of their neighbours
# (possibly differing only by an invisible trailing character) -- confirm they
# are intentional.
stopwords = [
    "അവൻ", "അവൾ", "അവർ", "ആ", "ആകാം", "ആകുന്നു", "ആകും", "ആകെയുള്ള", "ആകെയുള്ളത്", "ആകെയുള്ളവ", "ആകെയുള്ളവർ",
    "ആകെയുള്ളവൻ", "ആകെയുള്ളവൾ", "ആകെയുള്ളവൾക്ക്", "ആകുള്ളവൾക്ക്‌", "ഇത്", "ഇതിൽ", "ഇതിന്റെ", "ഇതും", "ഇതെല്ലാം",
    "ഇവ", "ഇവയിൽ", "ഇവയുടെ", "ഇവയും", "ഇവയെല്ലാം", "ഇവൻ", "ഇവൾ", "ഇവർ", "ഇവരുടെ", "ഇവരിൽ", "ഇവരെയും", "ഇവരെയെല്ലാം",
    "ഇവരോട്", "ഇവരോടും", "ഇവരോടുള്ള", "ഇവരോടുള്ളത്", "ഇവരോടുള്ളവ", "ഇവരോടുള്ളവർ", "ഇവരോടുള്ളവൻ", "ഇവരോടുള്ളവൾ",
    "ഇവരോടുള്ളവൾക്ക്", "ഇവരോടുള്ളവൾക്ക്‌"
]
def preprocess_Mal_text(text):
    """Tokenize a Malayalam string and strip stopwords.

    The text is split with ``indic_tokenize.trivial_tokenize`` (``lang="ml"``);
    any token present in the module-level ``stopwords`` collection is discarded
    and the remaining tokens are joined back together with single spaces.

    Args:
        text: The raw transcript string.

    Returns:
        The space-joined tokens that survived stopword filtering.
    """
    kept_tokens = []
    for tok in indic_tokenize.trivial_tokenize(text, lang="ml"):
        if tok in stopwords:
            continue  # stopword: leave it out of the cleaned text
        kept_tokens.append(tok)
    return ' '.join(kept_tokens)
def extract_embeddings(model_name, texts, batch_size=16, max_length=128, device=None):
    """Encode texts into fixed-size vectors with a Hugging Face encoder.

    Each text is tokenized (padded to the longest item of its batch, truncated
    to ``max_length`` tokens), passed through the encoder, and reduced to one
    vector by averaging ``last_hidden_state`` over the sequence dimension.

    Args:
        model_name: Name or path handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: A sequence of strings. A single bare string is treated as one
            text rather than being sliced into 16-character pieces.
        batch_size: Number of texts encoded per forward pass (default 16, the
            value that used to be hard-coded).
        max_length: Truncation length in tokens (default 128, as before).
        device: ``torch.device`` to run on. Defaults to CUDA when available,
            otherwise CPU.

    Returns:
        ``np.ndarray`` with one row per input text; an empty array for empty
        input (the encoder is not loaded in that case).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to encode: skip the (expensive) tokenizer/model load.
        return np.array([])
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The model and its inputs must live on the same device.
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            outputs = model(**encoded_inputs)
            # NOTE(review): the mean runs over every position, padding included
            # (no attention-mask weighting), so a text's vector depends on the
            # longest text in its batch. Left unchanged because the saved
            # classifiers were presumably trained on features pooled this way --
            # confirm before switching to masked pooling.
            # .cpu() is required before .numpy() when the tensor lives on a GPU.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# --- Malayalam test-set inference: load sheet, embed, classify, export ---
test_file_path = '/content/ML-AT-test.xlsx'  
test_data = pd.read_excel(test_file_path)
# Clean every transcript with the Malayalam tokenizer/stopword filter.
test_data['cleaned_transcript'] = test_data['Transcript'].apply(preprocess_Mal_text)
bert_model_name = "xlm-roberta-large"
# One embedding row per cleaned transcript.
test_embeddings = extract_embeddings(bert_model_name, test_data['cleaned_transcript'].tolist())
trained_model_path = "/content/best_mal_hyper_classification_model.h5"  
trained_model = load_model(trained_model_path)
label_encoder_path = "/content/malayalam_label_encoder.pkl"  
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# an encoder file that this notebook itself produced.
with open(label_encoder_path, "rb") as f:
    label_encoder = pickle.load(f)
predictions = trained_model.predict(test_embeddings)
# argmax over axis 1 picks the top-scoring class index per row; the encoder
# maps those indices back to the original labels.
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
test_data['Predicted_Class'] = predicted_labels
output_file_path = './ml-AT-test_using_xlm-roberta-large-predictions.xlsx'
# The exported sheet holds the input columns plus 'cleaned_transcript' and
# 'Predicted_Class'; index=False leaves the DataFrame index out.
test_data.to_excel(output_file_path, index=False)
print(f"Predictions saved to: {output_file_path}")

Device: cpu




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Predictions saved to: ./ml-AT-test_using_xlm-roberta-large-predictions.xlsx


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Use CUDA when torch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)
# Telugu stopwords from advertools, in sorted order (sorted() already returns a list).
stopwords = sorted(adv.stopwords['telugu'])
def preprocess_Tel_text(text):
    """Tokenize a Telugu string and strip stopwords.

    Splits ``text`` with ``indic_tokenize.trivial_tokenize`` (``lang="te"``),
    discards tokens found in the module-level ``stopwords`` collection, and
    returns the remaining tokens joined by single spaces.

    Args:
        text: The raw transcript string.

    Returns:
        The cleaned, space-joined string.
    """
    raw_tokens = indic_tokenize.trivial_tokenize(text, lang="te")
    return ' '.join(tok for tok in raw_tokens if tok not in stopwords)
def extract_embeddings(model_name, texts, batch_size=16, max_length=128, device=None):
    """Encode texts into fixed-size vectors with a Hugging Face encoder.

    Each text is tokenized (padded to the longest item of its batch, truncated
    to ``max_length`` tokens), passed through the encoder, and reduced to one
    vector by averaging ``last_hidden_state`` over the sequence dimension.

    Args:
        model_name: Name or path handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: A sequence of strings. A single bare string is treated as one
            text rather than being sliced into 16-character pieces.
        batch_size: Number of texts encoded per forward pass (default 16, the
            value that used to be hard-coded).
        max_length: Truncation length in tokens (default 128, as before).
        device: ``torch.device`` to run on. Defaults to CUDA when available,
            otherwise CPU.

    Returns:
        ``np.ndarray`` with one row per input text; an empty array for empty
        input (the encoder is not loaded in that case).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to encode: skip the (expensive) tokenizer/model load.
        return np.array([])
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The model and its inputs must live on the same device.
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            outputs = model(**encoded_inputs)
            # NOTE(review): the mean runs over every position, padding included
            # (no attention-mask weighting), so a text's vector depends on the
            # longest text in its batch. Left unchanged because the saved
            # classifiers were presumably trained on features pooled this way --
            # confirm before switching to masked pooling.
            # .cpu() is required before .numpy() when the tensor lives on a GPU.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# --- Telugu test-set inference: load sheet, embed, classify, export ---
test_file_path = '/content/TE-AT-test.xlsx'  
test_data = pd.read_excel(test_file_path)
# Clean every transcript with the Telugu tokenizer/stopword filter.
test_data['cleaned_transcript'] = test_data['Transcript'].apply(preprocess_Tel_text)
bert_model_name = "xlm-roberta-large"
# One embedding row per cleaned transcript.
test_embeddings = extract_embeddings(bert_model_name, test_data['cleaned_transcript'].tolist())
trained_model_path = "/content/best_tel_hyper_classification_model.h5"  
trained_model = load_model(trained_model_path)
label_encoder_path = "/content/tel_label_encoder.pkl"  
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# an encoder file that this notebook itself produced.
with open(label_encoder_path, "rb") as f:
    label_encoder = pickle.load(f)
predictions = trained_model.predict(test_embeddings)
# argmax over axis 1 picks the top-scoring class index per row; the encoder
# maps those indices back to the original labels.
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
test_data['Predicted_Class'] = predicted_labels
output_file_path = './tel-AT-test_using_xlm-roberta-large-predictions.xlsx'
# The exported sheet holds the input columns plus 'cleaned_transcript' and
# 'Predicted_Class'; index=False leaves the DataFrame index out.
test_data.to_excel(output_file_path, index=False)
print(f"Predictions saved to: {output_file_path}")


Device: cpu




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
Predictions saved to: ./tel-AT-test_using_xlm-roberta-large-predictions.xlsx


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import pickle
# Run on the GPU if torch can reach one; otherwise use the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device:", device)
# Tamil stopwords from advertools, in sorted order (sorted() already returns a list).
stopwords = sorted(adv.stopwords['tamil'])
def preprocess_tamil_text(text):
    """Normalize, tokenize and stopword-filter one Tamil transcript.

    This redefinition previously skipped Unicode normalization, although the
    definition of the same function at the top of the notebook (the one applied
    to the training transcripts) normalizes first. The step is restored so that
    test transcripts are cleaned exactly like the training data.

    Args:
        text: The raw transcript string.

    Returns:
        The normalized text with tokens found in the module-level ``stopwords``
        collection removed, joined by single spaces.
    """
    # Same order as the training-time definition: normalize -> tokenize -> filter.
    normalizer = IndicNormalizerFactory().get_normalizer("ta")
    text = normalizer.normalize(text)
    tokens = indic_tokenize.trivial_tokenize(text, lang="ta")
    return ' '.join(token for token in tokens if token not in stopwords)
def extract_embeddings(model_name, texts, batch_size=16, max_length=128, device=None):
    """Encode texts into fixed-size vectors with a Hugging Face encoder.

    Each text is tokenized (padded to the longest item of its batch, truncated
    to ``max_length`` tokens), passed through the encoder, and reduced to one
    vector by averaging ``last_hidden_state`` over the sequence dimension.

    Args:
        model_name: Name or path handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        texts: A sequence of strings. A single bare string is treated as one
            text rather than being sliced into 16-character pieces.
        batch_size: Number of texts encoded per forward pass (default 16, the
            value that used to be hard-coded).
        max_length: Truncation length in tokens (default 128, as before).
        device: ``torch.device`` to run on. Defaults to CUDA when available,
            otherwise CPU.

    Returns:
        ``np.ndarray`` with one row per input text; an empty array for empty
        input (the encoder is not loaded in that case).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to encode: skip the (expensive) tokenizer/model load.
        return np.array([])
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The model and its inputs must live on the same device.
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            outputs = model(**encoded_inputs)
            # NOTE(review): the mean runs over every position, padding included
            # (no attention-mask weighting), so a text's vector depends on the
            # longest text in its batch. Left unchanged because the saved
            # classifiers were presumably trained on features pooled this way --
            # confirm before switching to masked pooling.
            # .cpu() is required before .numpy() when the tensor lives on a GPU.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return np.array(embeddings)
# --- Tamil test-set inference: load sheet, embed, classify, export ---
test_file_path = '/content/TA-AT-test.xlsx'  
test_data = pd.read_excel(test_file_path)
# Clean every transcript with the Tamil preprocessing function.
test_data['cleaned_transcript'] = test_data['Transcript'].apply(preprocess_tamil_text)
bert_model_name = "xlm-roberta-large"
# One embedding row per cleaned transcript.
test_embeddings = extract_embeddings(bert_model_name, test_data['cleaned_transcript'].tolist())
trained_model_path = "/content/best_tamil_hyper_classification_model.h5"  
trained_model = load_model(trained_model_path)
label_encoder_path = "/content/tamil_label_encoder.pkl"  
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# an encoder file that this notebook itself produced.
with open(label_encoder_path, "rb") as f:
    label_encoder = pickle.load(f)
predictions = trained_model.predict(test_embeddings)
# argmax over axis 1 picks the top-scoring class index per row; the encoder
# maps those indices back to the original labels.
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
test_data['Predicted_Class'] = predicted_labels
output_file_path = './TA-AT-test-using_xlm-roberta-large-predictions.xlsx'
# The exported sheet holds the input columns plus 'cleaned_transcript' and
# 'Predicted_Class'; index=False leaves the DataFrame index out.
test_data.to_excel(output_file_path, index=False)
print(f"Predictions saved to: {output_file_path}")


Device: cpu




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Predictions saved to: ./TA-AT-test-using_xlm-roberta-large-predictions.xlsx
