In [None]:
# One-time environment setup (run in a terminal, or from a notebook cell via %pip / !python):
# pip install pandas textblob openpyxl deep-translator
# python -m textblob.download_corpora

In [20]:
import pandas as pd
from textblob import TextBlob
import numpy as np
import sys
from deep_translator import GoogleTranslator

In [25]:
# --- CONFIGURATION ---
# IMPORTANT: Use the exact name of your original uploaded file here.
# Input workbook; it is read with pd.read_excel in the data-loading step.
file_path_in = "KBZ_Pay.xlsx"
# Output CSV that receives the translated reviews and their sentiment scores.
file_path_out = "KBZ_Pay_Translated_with_Sentiment.csv"

In [29]:
# --- 1. DATA LOADING ---
# The input is an Excel workbook, so it is read with pd.read_excel. CSV-only
# options (text encodings, on_bad_lines, engine='python') do not apply here,
# and retrying the identical call after a failure cannot change the outcome.
print(f"Attempting to load data from: {file_path_in}")
try:
    df = pd.read_excel(file_path_in)
except Exception as e:
    # Print a readable summary, then re-raise so the cell fails visibly and
    # later cells never run against an undefined or stale `df`.
    print(f"!!! ERROR: Final failure to load the file. Details: {e}")
    raise
print(f"-> Successfully loaded {len(df)} rows from '{file_path_in}'.")

Attempting to load data from: KBZ_Pay.xlsx
-> Successfully loaded with 'utf-8-sig' and skipped bad lines.


In [30]:
# --- 2. DATA PREPARATION ---
# Coerce ratings to numbers (unparseable values become NaN), then keep only
# the rows that have both a usable rating and a review text.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = rating_numeric
required_columns = ['Rating', 'Review']
df = df.dropna(subset=required_columns)

# Generate the 'Is_Burmese' flag (1 = review contains Burmese script); it is used later to target translation
def is_burmese(text):
    """Return 1 if ``text`` contains a character in U+1000..U+109F, else 0.

    Missing values (as judged by ``pd.isna``) yield 0. Any other input is
    converted with ``str()`` before its characters are scanned.
    """
    if pd.isna(text):
        return 0
    for ch in str(text):
        # One character inside the range is enough to flag the whole text.
        if '\u1000' <= ch <= '\u109F':
            return 1
    return 0
# Element-wise flag: 1 for reviews containing Burmese script, 0 otherwise.
df['Is_Burmese'] = df['Review'].map(is_burmese)

In [35]:
# --- 3. TRANSLATION AND SENTIMENT FUNCTIONS ---

# Function to translate text only if it's Burmese
def translate_burmese_reviews(review_series, is_burmese_series, frame=None):
    """Translate the Burmese-flagged reviews and return a frame with the result.

    Parameters
    ----------
    review_series : pd.Series
        Review texts.
    is_burmese_series : pd.Series
        Flags sharing ``review_series``' index; rows equal to 1 are translated.
    frame : pd.DataFrame, optional
        Frame that receives the new 'Translated_Review' column. Defaults to
        the notebook-level ``df`` so existing two-argument calls keep working.
        Its index is expected to match the two series.

    Returns
    -------
    pd.DataFrame
        A copy of ``frame`` with a 'Translated_Review' column: the translated
        text for flagged rows and the original review text for all others.
        Rows whose 'Translated_Review' is missing are dropped. The input
        frame itself is not modified.
    """
    if frame is None:
        # Backward-compatible default: fall back to the notebook's DataFrame.
        frame = df

    # Only rows flagged as Burmese (1) are sent to the translator.
    burmese_mask = is_burmese_series == 1
    burmese_reviews_to_translate = review_series[burmese_mask]

    print(f"\nFound {len(burmese_reviews_to_translate)} Burmese reviews to translate...")

    # Create the translator once and reuse it for every review.
    translator = GoogleTranslator(source='my', target='en')

    def safe_translate(text):
        """Translate one review; on any failure report it and keep the original text."""
        try:
            return translator.translate(text)
        except Exception as e:
            # Best effort: one failed request (e.g. API limits) must not abort the run.
            print(f"Translation failed for a review. Error: {e}")
            return text

    # Sequential, one request per review.
    translated_burmese = burmese_reviews_to_translate.apply(safe_translate)

    # Work on a copy so re-running the cell never mutates the caller's frame.
    result = frame.copy()
    # Start from the original text, then overwrite only the flagged rows;
    # .loc aligns the translated values to rows by index.
    result['Translated_Review'] = review_series
    result.loc[burmese_mask, 'Translated_Review'] = translated_burmese

    # Drop rows where no translated/original text is available.
    return result.dropna(subset=['Translated_Review'])

# Function to calculate sentiment on the translated text
def get_sentiment_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Returns ``np.nan`` when ``text`` is missing (``None`` or NaN) or when the
    analysis raises any exception.
    """
    is_missing = text is None or pd.isna(text)
    if not is_missing:
        try:
            return TextBlob(str(text)).sentiment.polarity
        except Exception:
            # Fall through to the NaN result below.
            pass
    return np.nan

In [36]:
# --- 4. EXECUTION ---
reviews, burmese_flags = df['Review'], df['Is_Burmese']
df = translate_burmese_reviews(reviews, burmese_flags)

print("Calculating Sentiment Score on Translated/English Text...")
translated_text = df['Translated_Review']
df['Sentiment_Score_Translated'] = translated_text.apply(get_sentiment_polarity)

# --- 5. SAVE THE FINAL UPDATED DATASET ---
# 'utf-8-sig' writes a byte-order mark at the start of the file; it is used
# here so Burmese text opens correctly in Excel rather than showing "?".
df.to_csv(file_path_out, encoding='utf-8-sig', index=False)

# One print call, one line per message; the output matches three separate prints.
print(
    "\n--- SUCCESS! ---",
    f"The new file, '{file_path_out}', has been created.",
    "This file contains the new 'Translated_Review' and the accurate 'Sentiment_Score_Translated' columns.",
    sep="\n",
)


Found 156 Burmese reviews to translate...
Calculating Sentiment Score on Translated/English Text...

--- SUCCESS! ---
The new file, 'KBZ_Pay_Translated_with_Sentiment.csv', has been created.
This file contains the new 'Translated_Review' and the accurate 'Sentiment_Score_Translated' columns.
