<a href="https://colab.research.google.com/github/yelagampragathi/NLP_16/blob/main/CRAZYCATS_ASS_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split


In [None]:
# Toy parallel corpus: the same five questions written in four languages.
# Row i of every column is a translation of the same sentence.
english_sentences = [
    "What is your name?",
    "Where do you live?",
    "How old are you?",
    "What do you do for a living?",
    "What is your favorite color?",
]
telugu_sentences = [
    "నీ పేరు ఏమిటి?",
    "నువ్వు ఎక్కడ నివసిస్తున్నావు?",
    "మీరు ఎంత వయస్సు ఉన్నారు?",
    "మీరు జీవించడానికి ఏమి చేస్తున్నారు?",
    "మీ ఇష్ట రంగు ఏది?",
]
hindi_sentences = [
    "आपका नाम क्या है?",
    "आप कहाँ रहते हैं?",
    "आपकी उम्र क्या है?",
    "आप क्या करते हैं?",
    "आपका पसंदीदा रंग क्या है?",
]
tamil_sentences = [
    "உங்கள் பெயர் என்ன?",
    "நீங்கள் எங்கு வாழ்கிறீர்கள்?",
    "உங்கள் வயது என்ன?",
    "நீங்கள் என்ன வேலை செய்கிறீர்கள்?",
    "உங்கள் விருப்பமான நிறம் என்ன?",
]

# Column order (English, Telugu, Hindi, Tamil) follows dict insertion order.
data = {
    'English': english_sentences,
    'Telugu': telugu_sentences,
    'Hindi': hindi_sentences,
    'Tamil': tamil_sentences,
}

# One column per language, one row per sentence.
df = pd.DataFrame(data)


In [None]:
# Normalise every column to plain text.
# Missing values have to be replaced BEFORE the string cast: astype(str)
# turns NaN into the literal "nan", after which fillna() has nothing to fill.
for lang in df.columns:
    df[lang] = df[lang].fillna("unknown").astype(str)

# Build ONE vocabulary that covers every language.
# The rest of the notebook encodes all sentences (whatever their language)
# with tokenizers['English'] and sizes the embedding table from it. With a
# separate vocabulary per language, every non-English sentence is encoded as
# an empty sequence, because the Tokenizer silently drops words it has not
# seen, leaving the classifier nothing to learn from for three of the four
# classes.
all_texts = [text for lang in df.columns for text in df[lang]]
shared_tokenizer = keras.preprocessing.text.Tokenizer()
shared_tokenizer.fit_on_texts(all_texts)

# Keep the per-language lookup interface, but point every key at the same
# shared tokenizer so token ids are consistent across languages.
tokenizers = {lang: shared_tokenizer for lang in df.columns}


In [None]:
# Flatten the table into (sentence, language) pairs, walking the columns in
# order and, within each column, the rows top to bottom.
labelled_sentences = [
    (sentence, language)
    for language in df.columns
    for sentence in df[language]
]

# Model inputs are the raw sentences; targets are the language names.
X_data = [sentence for sentence, _ in labelled_sentences]
y_data = [language for _, language in labelled_sentences]

# One-hot encode the language names for multi-class training.
# get_dummies produces one indicator column per distinct label.
y_data_encoded = pd.get_dummies(y_data).values


In [None]:
# Train/Test Split
# Hold out 20% of the sentences for evaluation. The split is stratified on the
# language label so that every language appears in both the training and the
# test portion; with this few samples a purely random split can easily leave
# a language out of one side. random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)


In [None]:
# Turn sentences into fixed-width integer matrices for the network.
# NOTE: every sentence, whatever its language, is encoded with the tokenizer
# registered under 'English'; words missing from that tokenizer's vocabulary
# are dropped from the sequence.
max_len = 20  # every sequence is padded or truncated to this many tokens
text_encoder = tokenizers['English']
pad_to_length = keras.preprocessing.sequence.pad_sequences

# Training split
X_train_seq = pad_to_length(text_encoder.texts_to_sequences(X_train), maxlen=max_len)

# Test split, encoded exactly the same way
X_test_seq = pad_to_length(text_encoder.texts_to_sequences(X_test), maxlen=max_len)


In [None]:
# Define a simple model architecture
def create_model(vocab_size=None, num_classes=None, embedding_dim=128, lstm_units=64):
    """Build and compile an Embedding -> LSTM -> softmax text classifier.

    Args:
        vocab_size: Number of rows in the embedding table; must be larger than
            the highest token id fed to the model. Defaults to the vocabulary
            size of ``tokenizers['English']`` plus one (id 0 is used for
            padding).
        num_classes: Number of output classes. Defaults to the width of a row
            of ``y_data_encoded``.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A compiled ``keras.Sequential`` model using categorical cross-entropy
        loss, the Adam optimizer and an accuracy metric.
    """
    # Resolve the defaults at call time so they reflect the current notebook
    # state rather than whatever existed when the function was defined.
    if vocab_size is None:
        vocab_size = len(tokenizers['English'].word_index) + 1
    if num_classes is None:
        num_classes = len(y_data_encoded[0])

    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(layers.LSTM(lstm_units))
    # Softmax over the classes pairs with the one-hot targets and
    # categorical cross-entropy below.
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
# Create and train the model
# Seed Python, NumPy and the backend before the model is built, so that weight
# initialisation and batch shuffling are the same on every run.
keras.utils.set_random_seed(42)

model = create_model()

# Keep the History object so the per-epoch loss/accuracy can be inspected
# later, instead of letting the cell echo an opaque object repr.
history = model.fit(X_train_seq, y_train, epochs=5, batch_size=32)



Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.1250 - loss: 1.3938
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step - accuracy: 0.3750 - loss: 1.3724
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.3125 - loss: 1.3530
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.3125 - loss: 1.3351
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.3125 - loss: 1.3182


<keras.src.callbacks.history.History at 0x7a2fd2abdd20>

In [None]:
# Score the trained model on the held-out sentences.
# With the loss and single accuracy metric configured in create_model,
# evaluate() returns [loss, accuracy]; the accuracy is therefore at index 1.
lstm_accuracy = model.evaluate(X_test_seq, y_test)
held_out_accuracy = lstm_accuracy[1]
print(f'Model Accuracy: {held_out_accuracy}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step - accuracy: 0.0000e+00 - loss: 1.4790
Model Accuracy: 0.0
