In [4]:
import pandas as pd
import numpy as np
import re
from math import ceil
import sys
from textblob import TextBlob

In [5]:
# Define the Unicode ranges for common emojis
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U0001FA70-\U0001FAFF"
    "\U00002702-\U000027B0"
    "\U00002600-\U000026FF"
    "]+", flags=re.UNICODE
)

In [6]:
# --- Feature Engineering Functions ---

def count_emojis(text):
    """Counts the number of emojis in a text string."""
    if pd.isna(text) or text is None: return 0
    return len(EMOJI_PATTERN.findall(str(text)))

def calculate_month_diff(date, start_date):
    """Calculates the total number of months between two datetime objects."""
    return (date.year - start_date.year) * 12 + (date.month - start_date.month)

def get_language_type(text):
    """Categorizes review text into Burmese, English, or Mixed."""
    text = str(text)
    # Check for Burmese characters (Unicode range U+1000 to U+109F)
    is_burmese_present = any('\u1000' <= char <= '\u109F' for char in text)
    # Check for English characters (ASCII letters a-z, A-Z)
    is_english_present = any('a' <= char <= 'z' or 'A' <= char <= 'Z' for char in text)

    if is_burmese_present and is_english_present:
        return 'Mixed'
    elif is_burmese_present:
        return 'Burmese'
    elif is_english_present:
        return 'English'
    else:
        # Punctuation/numbers onlyâ€”default to English as it often represents short English reviews
        return 'English'

# --- Start of the actual analysis code ---
file_path = "KBZ_Pay_Translated_with_Sentiment.csv"
target_variable = 'Rating'
sentiment_variable = 'Sentiment_Score_Translated'

In [7]:
# Step 1: Load Data
try:
    # Use robust loading parameters for the user's uploaded CSV
    df = pd.read_csv(file_path, encoding='utf-8-sig', on_bad_lines='skip', engine='python')
except Exception as e:
    # Handle error if file fails to load
    sys.exit()

In [8]:
# Data Preparation
df = df.dropna(subset=[target_variable, 'Review', sentiment_variable])
df[target_variable] = pd.to_numeric(df[target_variable], errors='coerce')
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date', 'Review'])

In [9]:
# --- FEATURE ENGINEERING ---

# 1. Language Categorization and OHE (For Lang_Burmese, Lang_English, Lang_Mixed)
df['Language_Type'] = df['Review'].apply(get_language_type)
language_dummies = pd.get_dummies(df['Language_Type'], prefix='Lang', drop_first=False)
df = pd.concat([df, language_dummies], axis=1).drop(columns=['Language_Type'])

In [10]:
# 2. 2-Month Interval (Ordinal Time Feature)
start_date = df['Date'].min()
df['Month_Difference'] = df['Date'].apply(lambda x: calculate_month_diff(x, start_date))
df['Date_Interval'] = (df['Month_Difference'] / 2).apply(ceil).astype(int)

# 3. Emoji Count (Continuous Feature)
df['Emoji_Count'] = df['Review'].apply(count_emojis)

In [11]:
# 4. Device Type OHE
if 'Device Type' in df.columns:
    device_dummies = pd.get_dummies(df['Device Type'], prefix='Device', drop_first=False)
    df = pd.concat([df, device_dummies], axis=1).drop(columns=['Device Type'])

In [12]:
# --- Step 4: Identify Final Variables for Correlation ---
independent_vars = [
    sentiment_variable,
    'Date_Interval',
    'Emoji_Count',
]
# Dynamically add all OHE columns
independent_vars.extend([col for col in df.columns if col.startswith('Lang_')])
independent_vars.extend([col for col in df.columns if col.startswith('Device_')])

In [13]:
# --- Step 5: Calculate the Pearson Correlation Matrix ---
correlation_data = df[[target_variable] + independent_vars].dropna()
correlation_matrix = correlation_data.corr(method='pearson')

# --- Step 6: Extract and Present Results ---
pearson_results = correlation_matrix.loc[target_variable, independent_vars].sort_values(ascending=False)

In [14]:
output_file = 'KBZ_Pay_Final_Correlation_Data_Ver2.csv'
df.to_csv(output_file, index=False, encoding='utf-8-sig')

In [15]:
# This print statement is optimized for clean, formatted output in a notebook cell
print("\n" + "="*50)
print("        FINAL PEARSON CORRELATION RESULTS (r)")
print("="*50)
print(pearson_results.to_string(header=True, float_format="{:.4f}".format))
print("="*50)


        FINAL PEARSON CORRELATION RESULTS (r)
Sentiment_Score_Translated    0.4166
Lang_English                  0.2390
Emoji_Count                   0.0147
Device_Tablet                 0.0032
Device_Phone                 -0.0032
Date_Interval                -0.0330
Lang_Burmese                 -0.0730
Lang_Mixed                   -0.2811
