In [2]:
import pandas as pd
import numpy as np
import sys

In [5]:
# --- CONFIGURATION ---
# Input CSV: must already hold the translated text and its sentiment score.
file_path = 'KBZ_Pay_Translated_with_Sentiment.csv'
# Column that every other variable is correlated against.
target_variable = "Rating"

In [6]:
# --- Step 1: Load the Cleaned and Translated Data ---
# Reads the CSV named by file_path into df. The load is a hard prerequisite
# for every later step, so any failure stops execution here.
print(f"Loading data from: {file_path}")
try:
    # Assuming the file was correctly saved with UTF-8-SIG in the last step
    df = pd.read_csv(file_path, encoding='utf-8-sig')
except Exception as e:
    print(f"!!! ERROR: Failed to load the file. Check file name and path. Details: {e}")
    # Exit with a non-zero status: a bare sys.exit() reports status 0
    # (success), which hides the failure when this is run as a script.
    sys.exit(1)

Loading data from: KBZ_Pay_Translated_with_Sentiment.csv


In [7]:
# Initial Data Cleanup and Preparation
# Coerce the target to numeric FIRST: errors='coerce' turns non-numeric
# entries into NaN, and those rows must then be removed by the dropna below.
# (Dropping before coercing would leave the freshly created NaN ratings in df.)
df[target_variable] = pd.to_numeric(df[target_variable], errors='coerce')
df = df.dropna(subset=[target_variable, 'Sentiment_Score_Translated'])

In [8]:
# --- Step 2: Feature Engineering for Time (Date -> Days_Since_Start) ---
# Converts 'Date' to datetime and derives 'Days_Since_Start', the whole number
# of days between each row's date and the earliest date in the data.
print("Feature Engineering: Converting Date to continuous 'Days_Since_Start'...")
try:
    # errors='coerce' maps individual unparseable values to NaT instead of
    # raising, so one malformed date no longer discards the whole variable.
    parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
    unparseable = int((parsed_dates.isna() & df['Date'].notna()).sum())
    if unparseable:
        print(f"Warning: {unparseable} unparseable 'Date' value(s) were set to missing.")
    df['Date'] = parsed_dates
    # Calculate a continuous variable: Days Since Start of the review period
    # (rows whose date is NaT get NaN here)
    df['Days_Since_Start'] = (df['Date'] - df['Date'].min()).dt.days
except Exception as e:
    # Deliberate best-effort: e.g. the 'Date' column is absent altogether.
    print(f"Error processing 'Date' column: {e}. Skipping this variable.")
    # Keep the column present but empty. NOTE: an all-NaN column must be left
    # out of any later row-wise dropna(), otherwise every row is discarded.
    df['Days_Since_Start'] = np.nan

Feature Engineering: Converting Date to continuous 'Days_Since_Start'...


In [9]:
# --- Step 3: Feature Engineering for Device Type (One-Hot Encoding) ---
# Replaces the categorical 'Device Type' column with one 'Device_<value>'
# indicator column per category (no category is dropped).
print("Feature Engineering: Applying One-Hot Encoding to 'Device Type'...")
if 'Device Type' not in df.columns:
    print("Warning: 'Device Type' column not found.")
else:
    device_dummies = pd.get_dummies(df['Device Type'], prefix='Device', drop_first=False)
    # Remove the original column and append the indicator columns in one step
    df = pd.concat([df.drop(columns=['Device Type']), device_dummies], axis=1)

Feature Engineering: Applying One-Hot Encoding to 'Device Type'...


In [10]:
# --- Step 4: Identify Final Variables for Correlation ---
# Collect the names of all continuous and dummy independent variables.
#
# A candidate is kept only if it exists in df AND has at least one non-missing
# value. Step 2 fills 'Days_Since_Start' entirely with NaN when date processing
# fails; passing such a column on would make the row-wise dropna() in Step 5
# discard every observation and yield an all-NaN correlation table.

candidate_vars = [
    'Sentiment_Score_Translated',
    'Days_Since_Start',
]

# Add the 'Language' variable (Is_Burmese) if it exists from the translation step
if 'Is_Burmese' in df.columns:
    candidate_vars.append('Is_Burmese')
else:
    print("Warning: 'Is_Burmese' column not found. Language variable omitted.")

# Dynamically add all OHE columns for Device Type
candidate_vars.extend([col for col in df.columns if str(col).startswith('Device_')])

independent_vars = []
for var in candidate_vars:
    if var in df.columns and df[var].notna().any():
        independent_vars.append(var)
    else:
        print(f"Warning: '{var}' is missing or has no valid values. Variable omitted.")

In [11]:
# --- Step 5: Calculate the Pearson Correlation Matrix ---
# Restrict to the target plus the selected predictors and keep complete rows only
analysis_columns = [target_variable] + independent_vars
correlation_data = df[analysis_columns].dropna()
correlation_matrix = correlation_data.corr(method='pearson')

# --- Step 6: Extract and Present Results ---
# Take the target's row of the matrix (one r per predictor), highest r first
target_row = correlation_matrix.loc[target_variable, independent_vars]
pearson_results = target_row.sort_values(ascending=False)

# Horizontal rule reused above and below the report
banner = "="*50
print("\n" + banner)
print("        FINAL PEARSON CORRELATION RESULTS (r)")
print(f"           Target Variable: {target_variable}")
print(banner)
# One predictor per line, coefficients shown to four decimal places
print(pearson_results.to_string(header=True, float_format="{:.4f}".format))
print(banner)


        FINAL PEARSON CORRELATION RESULTS (r)
           Target Variable: Rating
Sentiment_Score_Translated    0.4166
Device_Tablet                 0.0032
Device_Phone                 -0.0032
Days_Since_Start             -0.0252
Is_Burmese                   -0.2390


In [12]:
# Optional: persist the fully processed frame (engineered columns included).
# The row index is not written; UTF-8 with BOM matches the encoding used when loading.
df.to_csv(
    'KBZ_Pay_Fully_Processed_Data.csv',
    encoding='utf-8-sig',
    index=False,
)
print("\nFully processed dataset saved to: KBZ_Pay_Fully_Processed_Data.csv")


Fully processed dataset saved to: KBZ_Pay_Fully_Processed_Data.csv
