In [2]:
import os
from typing import Dict, List

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import chi2_contingency

In [None]:
# Load the raw train and test splits and stack them into one frame so the
# exploratory analysis below covers the whole dataset.
data_train = pd.read_csv(os.path.join("data", "raw", "train.csv"))
data_test = pd.read_csv(os.path.join("data", "raw", "test.csv"))
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the default row labels of the two files would be repeated.
data = pd.concat([data_train, data_test], sort=False, ignore_index=True)
display(data.head())
# DataFrame.info() prints its summary and returns None, so it is called
# directly instead of being wrapped in display(), which would render "None".
data.info()

In [None]:
# Columns that are not categorical, or not needed for this analysis, are
# listed once and removed from the column index in a single step.
excluded_columns = [
    "issue_category_sub_category",
    "agent_experience_level_desc",
    "conversation",
    "issue_sub_category",
]
columns = data.columns
categorical_columns = columns.drop(excluded_columns)
print(f"Categorcal Colmuns are: {categorical_columns.values}")

In [None]:
# Bar chart of the value counts of every categorical column.
for col in categorical_columns:
    # Draw on an explicitly created Axes so each chart lives on its own
    # figure instead of relying on pyplot's implicit "current axes" state.
    fig, ax = plt.subplots()
    data[col].value_counts().plot(kind="bar", ax=ax)
    ax.set_title(f"Count plot for {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()
    # Release the figure once shown so figures do not pile up across the loop.
    plt.close(fig)

In [9]:
def perform_chi_square_tests(
    df: pd.DataFrame,
    target_col: str,
    categorical_cols: List[str],
    alpha: float = 0.05,
) -> pd.DataFrame:
    """
    Performs Chi-Square test of independence between each categorical column and the target column.

    Args:
        df (pd.DataFrame): Input DataFrame containing categorical columns.
        target_col (str): Name of the target categorical column.
        categorical_cols (List[str]): Categorical columns to test against the
            target. The target column itself is skipped if it appears here.
        alpha (float): Significance level; the null hypothesis of independence
            is flagged as rejected when ``p_value < alpha``. Defaults to 0.05.

    Returns:
        pd.DataFrame: One row per tested feature (indexed by column name),
        sorted by ascending p-value, with the columns ``p_value``,
        ``is_rejected``, ``chi2`` (test statistic) and ``dof`` (degrees of
        freedom). An empty frame with these columns is returned when there is
        nothing to test.
    """
    result_columns = ["p_value", "is_rejected", "chi2", "dof"]
    results: Dict[str, Dict[str, float]] = {}

    for col in categorical_cols:
        if col == target_col:
            continue

        contingency_table = pd.crosstab(df[col], df[target_col])
        if contingency_table.size == 0:
            # chi2_contingency cannot process an empty table (e.g. a column
            # with no usable observations alongside the target); record NaN
            # for that feature instead of aborting the whole run.
            chi2, p, dof = float("nan"), float("nan"), 0
        else:
            chi2, p, dof, _ = chi2_contingency(contingency_table)

        results[col] = {
            "p_value": float(p),
            # NaN compares False, so an untestable feature is never "rejected".
            "is_rejected": bool(p < alpha),
            "chi2": float(chi2),
            "dof": int(dof),
        }

    # Building from row records (rather than transposing a dict-of-dicts frame)
    # keeps a proper dtype per column and also works when `results` is empty.
    summary = pd.DataFrame(
        list(results.values()),
        index=list(results.keys()),
        columns=result_columns,
    )
    return summary.sort_values(by="p_value")

In [None]:
# Test every categorical column for independence from customer_sentiment.
chi_results = perform_chi_square_tests(
    df=data,
    target_col="customer_sentiment",
    categorical_cols=categorical_columns,
)
display(chi_results)

In [None]:
# Stacked bar charts: for each categorical column, the count of rows per
# category value, split by customer_sentiment.
for col in categorical_columns:
    if col == "customer_sentiment":
        # The target is not compared against itself.
        continue
    sentiment_counts = data.groupby([col, "customer_sentiment"]).size().unstack()
    sentiment_counts.plot.bar(stacked=True)
    plt.title(f"Count plot for {col} and customer_sentiment")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()

# Comments
Here, we see that agent_experience_level, issue_category and issue_sub_category are significantly associated with customer_sentiment.
- For instance, Accessing Warrant Details, Book Pricing Discrepancy, Cash On Delivery Refunds, etc. have only neutral sentiment. Likewise, the Pickup and Shipping, Product information and tags categories have only positive sentiment. So, we can say that these columns are significant for customer_sentiment (p < 0.05).
- We can infer that positive comments are associated with the "order" issue area
- We can also conclude from the distribution of customer_sentiment that the data is imbalanced: there are more neutral and negative sentiments than positive ones. So, we must be careful while training the model not to overfit it to the majority class.