<a href="https://colab.research.google.com/github/yelagampragathi/NLP_16/blob/main/CRAZYCATS_ASS_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset

In [None]:
# Location of the transcribed dataset; change this to wherever the CSV
# actually lives in your environment.
dataset_path = "/content/Indian language community chatbot.csv"

# Read the pipe-delimited CSV, skipping the spaces that follow each delimiter

In [None]:
# Fields in the file are separated by '|'; skipinitialspace discards the
# spaces that follow each separator.
df = pd.read_csv(
    dataset_path,
    sep='|',
    skipinitialspace=True,
)

# Clean up column names

In [None]:
# Normalise the header labels: trim surrounding whitespace, remove any stray
# pipe characters, then trim again.
# regex=False is passed explicitly because '|' is a regex metacharacter and
# the default value of `regex` differs between pandas versions; this keeps
# the replacement a literal one everywhere.
df.columns = (
    df.columns
    .str.strip()
    .str.replace('|', '', regex=False)
    .str.strip()
)

# Display cleaned column names

In [None]:
# Show the header labels after clean-up so the expected columns can be checked.
print(f"Cleaned columns: {df.columns}")

Cleaned columns: Index(['Unnamed: 0', 'ID', 'English Text', 'Telugu Answer', 'Hindi Answer',
       'Tamil Answer', 'Malayalam Answer', ',,,,,,,,'],
      dtype='object')


# Fill missing values (if any) with an empty string or appropriate value

In [None]:
# Replace every missing cell with an empty string so the text-cleaning step
# only ever receives strings from previously empty cells.
# The result is re-bound rather than using inplace=True, which keeps the
# cell a plain "input -> output" assignment.
df = df.fillna('')

# Basic text cleaning function

In [None]:
def clean_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Args:
        text: The cell value to clean. Normally a string, but any value is
            accepted so the function can be applied to a whole column.

    Returns:
        str: The stripped string. ``None`` and float NaN become ``''``;
        any other non-string value is converted with ``str()`` before
        stripping.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        # Numbers and other scalars have no .strip(); convert them first
        # instead of raising AttributeError.
        text = str(text)
    return text.strip()

# Apply text cleaning to relevant columns

In [None]:
# Run clean_text over the question column and each translated answer column.
for text_column in (
    'English Text',
    'Telugu Answer',
    'Hindi Answer',
    'Tamil Answer',
    'Malayalam Answer',
):
    df[text_column] = df[text_column].apply(clean_text)

# Function to get translations

In [None]:
def get_translations(question, data=None):
    """Look up the translated answers for an English question.

    The question is stripped of surrounding whitespace before matching,
    because the 'English Text' column is stripped in the same way. An exact
    match is tried first; if none is found, a case-insensitive match is
    used as a fallback. When several rows match, the first one is used.

    Args:
        question: The English question to look up.
        data: Optional DataFrame to search. It must contain the columns
            'English Text', 'Telugu Answer', 'Hindi Answer', 'Tamil Answer'
            and 'Malayalam Answer'. Defaults to the notebook-level ``df``.

    Returns:
        dict | str: A dict keyed by 'Telugu', 'Hindi', 'Tamil' and
        'Malayalam' holding the matching answers, or the string
        'Question not found' when no row matches.
    """
    # Fall back to the notebook-level frame only when none was supplied.
    frame = df if data is None else data

    if isinstance(question, str):
        question = question.strip()

    matches = frame[frame['English Text'] == question]

    if matches.empty and isinstance(question, str):
        # Exact match failed: retry ignoring case. astype(str) keeps the
        # .str accessor usable even if the column is not of string dtype.
        folded = frame['English Text'].astype(str).str.casefold()
        matches = frame[folded == question.casefold()]

    if matches.empty:
        return 'Question not found'

    answer_columns = {
        'Telugu': 'Telugu Answer',
        'Hindi': 'Hindi Answer',
        'Tamil': 'Tamil Answer',
        'Malayalam': 'Malayalam Answer',
    }
    return {
        language: matches[column].values[0]
        for language, column in answer_columns.items()
    }

# Example usage: Get translations

In [None]:
# Demonstrate the lookup with a sample question.
question = 'What is your name?'
translations = get_translations(question)
print("Translations for '{}': {}".format(question, translations))

Translations for 'What is your name?': {'Telugu': 'నా పేరు రామ్.', 'Hindi': 'मेरा नाम राम है।', 'Tamil': 'என் பெயர் ராம்.', 'Malayalam': 'എന്റെ പേര് രാം.'}


# Implementing text processing tasks as per assignment

# 1. Tokenization

In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases load it
# from the 'punkt_tab' package while older ones use 'punkt', so request both.
# nltk.download returns False (it does not raise) for a package that the
# installed version does not know about.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize example text

In [None]:
# Use the first English question in the dataset as the text to tokenize.
example_text = df['English Text'].iat[0]
tokens = word_tokenize(example_text)
print("Tokens: {}".format(tokens))

Tokens: ['What', 'is', 'your', 'name', '?']


# 2. Stemming
# Stem words from an example text

In [None]:
# Tokenize the first English question, then stem each token with Porter's
# algorithm.
stemmer = PorterStemmer()
example_text = df['English Text'].iat[0]
tokens = word_tokenize(example_text)
stems = list(map(stemmer.stem, tokens))
print("Stems: {}".format(stems))

Stems: ['what', 'is', 'your', 'name', '?']


# 3. Word Representation
# Represent words in the English Text column

In [None]:
# Build a bag-of-words representation of the English questions.
texts = df['English Text'].to_list()

# Learn the vocabulary first, then encode the same texts with it.
vectorizer = CountVectorizer()
vectorizer.fit(texts)
word_vectors = vectorizer.transform(texts).toarray()  # dense count matrix
feature_names = vectorizer.get_feature_names_out()  # vocabulary terms

# Display the results

In [None]:
# Print the first rows of every cleaned text column. Each heading after the
# first is preceded by a blank line.
sample_columns = [
    'English Text',
    'Telugu Answer',
    'Hindi Answer',
    'Tamil Answer',
    'Malayalam Answer',
]
for position, column_name in enumerate(sample_columns):
    heading = f"Sample of cleaned {column_name} data:"
    print(heading if position == 0 else "\n" + heading)
    print(df[column_name].head())


Sample of cleaned English Text data:
0              What is your name?
1              Where do you live?
2                How old are you?
3    What do you do for a living?
4    What is your favorite color?
Name: English Text, dtype: object

Sample of cleaned Telugu Answer data:
0                          నా పేరు రామ్.
1              నేను హైదరాబాద్‌లో ఉంటాను.
2            నేను 25 సంవత్సరాలు ఉన్నాను.
3    నేను ఒక ఇంజనీర్‌గానుగా పనిచేస్తాను.
4                 నా ప్రియమైన రంగు నీలం.
Name: Telugu Answer, dtype: object

Sample of cleaned Hindi Answer data:
0              मेरा नाम राम है।
1    मैं हैदराबाद में रहता हूँ।
2            मैं 25 साल का हूँ।
3          मैं एक इंजीनियर हूँ।
4     मेरा पसंदीदा रंग नीला है।
Name: Hindi Answer, dtype: object

Sample of cleaned Tamil Answer data:
0                    என் பெயர் ராம்.
1    நான் ஹைதராபாத்தில் வசிக்கிறேன்.
2                      நான் 25 வயது.
3                நான் ஒரு பொறியாளர்.
4        எனக்கு பிடித்த நிறம் நீலம்.
Name: Tamil Answer, dtype

# Example: Word Representation using the 'English Text' column

# Initialize CountVectorizer with its default settings (pass extra parameters here if needed)

In [None]:
# Vectorize the English questions and show both the count matrix and the
# vocabulary that its columns correspond to.
texts = df['English Text'].to_list()
vectorizer = CountVectorizer().fit(texts)
word_vectors = vectorizer.transform(texts).toarray()
feature_names = vectorizer.get_feature_names_out()
print("Word Vectors:", word_vectors, sep="\n")
print("Feature Names:", feature_names, sep="\n")

Word Vectors:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
Feature Names:
['activity' 'adventures' 'animal' 'animals' 'any' 'app' 'are' 'beach'
 'best' 'book' 'books' 'can' 'car' 'city' 'coffee' 'color' 'cook'
 'cooking' 'countryside' 'cream' 'cuisine' 'dancing' 'day' 'dessert'
 'destination' 'did' 'do' 'dream' 'drink' 'drive' 'drives' 'english'
 'enjoy' 'exercise' 'favorite' 'festival' 'flower' 'food' 'for' 'friend'
 'fruit' 'gadget' 'games' 'gardening' 'genre' 'going' 'have' 'hobby'
 'holiday' 'hometown' 'how' 'ice' 'in' 'indoor' 'instruments' 'is' 'job'
 'languages' 'like' 'listening' 'live' 'living' 'long' 'married' 'media'
 'meditate' 'morning' 'movie' 'movies' 'music' 'name' 'of' 'old' 'or'
 'out' 'outdoor' 'outfit' 'painting' 'person' 'pet' 'pets' 'photography'
 'picnics' 'place' 'platform' 'play' 'playing' 'prefer' 'profession'
 'reading' 'regularly' 'season' 'selfies' 'shows' 'siblings' 'snack'
 'snacks

# Sentence Representation (each English sentence in the dataset as a bag-of-words count vector)

In [None]:
# Encode the sentences with the vectorizer that is already fitted: transform
# (not fit_transform) reuses its existing vocabulary.
sentence_vectors = vectorizer.transform(texts).toarray()
print("Sentence Vectors:", sentence_vectors, sep="\n")
print("Feature Names:", feature_names, sep="\n")

Sentence Vectors:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Feature Names:
['activity' 'adventures' 'animal' 'animals' 'any' 'app' 'are' 'beach'
 'best' 'book' 'books' 'can' 'car' 'city' 'coffee' 'color' 'cook'
 'cooking' 'countryside' 'cream' 'cuisine' 'dancing' 'day' 'dessert'
 'destination' 'did' 'do' 'dream' 'drink' 'drive' 'drives' 'english'
 'enjoy' 'exercise' 'favorite' 'festival' 'flower' 'food' 'for' 'friend'
 'fruit' 'gadget' 'games' 'gardening' 'genre' 'going' 'have' 'hobby'
 'holiday' 'hometown' 'how' 'ice' 'in' 'indoor' 'instruments' 'is' 'job'
 'languages' 'like' 'listening' 'live' 'living' 'long' 'married' 'media'
 'meditate' 'morning' 'movie' 'movies' 'music' 'name' 'of' 'old' 'or'
 'out' 'outdoor' 'outfit' 'painting' 'person' 'pet' 'pets' 'photography'
 'picnics' 'place' 'platform' 'play' 'playing' 'prefer' 'profession'
 'reading' 'regularly' 'season' 'selfies' 'shows' 'siblings' 'snack'
 'sn

# Save the cleaned dataset

In [None]:
# Write the cleaned data to its own file. Saving to the same path as the raw
# input would replace the pipe-delimited source with a comma-delimited file,
# so re-running the notebook would no longer parse the expected columns.
cleaned_dataset_path = '/content/Indian language community chatbot_cleaned.csv'  # Update this path as needed
df.to_csv(cleaned_dataset_path, index=False)  # index=False: do not write the row index
print(f"Cleaned dataset saved to {cleaned_dataset_path}")

Cleaned dataset saved to /content/Indian language community chatbot.csv
