In [None]:
import pandas as pd
from langdetect import detect, LangDetectException
from googletrans import Translator
import numpy as np
import re
import nltk

1. **For each review**:

   * Detect language using `langdetect`.
   * If detected language is not English → translate to English using Google Translate.

2. **Clean text**:

   * Remove numbers.
   * Remove **non-alphabetic characters** (keep only letters and spaces).
   * Convert text to **lowercase**.
   * Tokenize text into individual words.
   * Remove **stopwords** (common irrelevant words like “the”, “is”, “and”).
   * Apply **lemmatization** (reduce words to base form, e.g., “running” → “run”).

3. **Handle missing values**:

   * For numeric columns → fill with column mean.
   * For non-numeric columns → drop rows that have a missing value.

4. **Remove duplicate rows**.

In [None]:
# Download the NLTK resources used by the cleaning functions. Re-running this
# cell is safe: packages that are already present are not downloaded again.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept for older releases.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

from langdetect import DetectorFactory
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# langdetect is non-deterministic on short or ambiguous text unless seeded;
# fixing the seed makes "Restart & Run All" reproduce the same detections.
DetectorFactory.seed = 0

# Shared, module-level helpers reused by every row to avoid rebuilding them.
translator = Translator()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def detect_and_translate(text, verbose=True):
    """Return `text` in English, translating it if another language is detected.

    Parameters
    ----------
    text : str or any
        The review text. Non-string values (e.g. NaN/None for a missing
        review) are returned unchanged instead of raising.
    verbose : bool, default True
        When True, print the detected language and a preview of the
        translation for each call. Pass False to avoid flooding the output
        when applying this to a whole column.

    Returns
    -------
    str or any
        The stripped original text when it is empty, already English, or when
        detection/translation fails; otherwise the translated text.
    """
    # Missing values and other non-strings have no .strip(); pass them through
    # untouched, the same way clean_text does.
    if not isinstance(text, str):
        return text

    text = text.strip()
    if not text:
        return text

    try:
        lang = detect(text)
        if verbose:
            print(f"Detected language: {lang} for text: {text[:30]}...")

        if lang == 'en':
            return text

        # NOTE(review): some newer googletrans releases expose translate() as
        # a coroutine; in that case `.text` below fails and the generic handler
        # returns the untranslated text. Confirm the installed version is
        # synchronous.
        translated = translator.translate(text, dest='en')
        if verbose:
            print(f"Translated text: {translated.text[:30]}...")
        return translated.text

    except LangDetectException:
        # Language detection failed for this text; keep the original.
        return text
    except Exception as e:
        # Best effort: a translation/network failure must not abort the whole
        # cleaning run, but it is always reported.
        print(f"Error: {e}. Text: {text}")
        return text

In [None]:
def clean_text(text):
    """Normalise a review into lowercase, stopword-free, lemmatized words.

    Digits and every character that is not an ASCII letter or whitespace are
    removed (not replaced), the result is lowercased and tokenized, English
    stopwords are dropped, and each remaining token is lemmatized. Tokens are
    joined back with single spaces.

    Non-string inputs are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    digits_removed = re.sub(r'\d+', '', text)
    letters_only = re.sub(r'[^A-Za-z\s]', '', digits_removed).lower()

    kept_lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )
    return ' '.join(kept_lemmas)

In [None]:
def clean_data(file_path):
    """Load a CSV of product reviews and return a cleaned DataFrame.

    Steps, in order:
      1. Translate each non-missing 'Review' to English (detect_and_translate).
      2. Normalise the review text (clean_text).
      3. Walk the columns left to right: numeric columns have missing values
         filled with the column mean; for every other column, rows with a
         missing value are dropped.
      4. Drop duplicate rows.

    Parameters
    ----------
    file_path : str or path-like
        Location of the raw CSV; it must contain a 'Review' column.

    Returns
    -------
    pandas.DataFrame
        The cleaned data.

    Raises
    ------
    KeyError
        If the CSV has no 'Review' column.
    """
    df = pd.read_csv(file_path)

    if 'Review' not in df.columns:
        raise KeyError(
            f"Expected a 'Review' column in {file_path!r}; found {list(df.columns)}"
        )

    # na_action='ignore' keeps missing reviews as NaN instead of handing them
    # to the translator, which expects a string. Those rows are removed by the
    # missing-value pass below.
    df['Review'] = df['Review'].map(detect_and_translate, na_action='ignore')
    df['Review'] = df['Review'].apply(clean_text)

    # Columns are processed in order, so a row dropped for one column no
    # longer contributes to the mean of numeric columns that come after it.
    for column in df.columns:
        series = df[column]
        # pandas' dtype helpers also understand extension dtypes (nullable
        # Int64, string, category), which np.issubdtype cannot interpret.
        # Booleans are excluded so they keep going down the "drop" branch.
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            df[column] = series.fillna(series.mean())
        else:
            # .copy() makes the filtered frame independent so later column
            # assignments do not trigger SettingWithCopyWarning.
            df = df[series.notna()].copy()

    df = df.drop_duplicates()

    return df

In [None]:
# Input and output locations, relative to the notebook's working directory.
file_path = '../data/products-unclean.csv'
output_path = '../data/products.csv'

cleaned_df = clean_data(file_path)

cleaned_df.to_csv(output_path, index=False)
# Build the message from output_path so it cannot drift from the real
# destination (it previously reported '../dat/products.csv').
print(f"Data cleaning complete. Saved to '{output_path}'.")