Chi-square and Hypothesis testing

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, ttest_1samp, norm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
df = pd.read_csv("/content/sample_data/sales_data_with_discounts.csv")

In [3]:
print("Dataset loaded. Shape:", df.shape)
print("\nColumns:", list(df.columns))

Dataset loaded. Shape: (450, 13)

Columns: ['Date', 'Day', 'SKU', 'City', 'Volume', 'BU', 'Brand', 'Model', 'Avg Price', 'Total Sales Value', 'Discount Rate (%)', 'Discount Amount', 'Net Sales Value']


1. CHI-SQUARE TEST FIRST
Test: Association between BU (Business Unit) and City

In [4]:
contingency_table = pd.crosstab(df['BU'], df['City'])
print("\n=== CHI-SQUARE TEST: BU vs City ===")
print("Contingency table:")
print(contingency_table)


=== CHI-SQUARE TEST: BU vs City ===
Contingency table:
City         C
BU            
FMCG       150
Lifestyle  150
Mobiles    150


In [5]:
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

In [6]:
print(f"\nChi-square statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Degrees of freedom: {dof}")


Chi-square statistic: 0.0000
P-value: 1.0000
Degrees of freedom: 0


In [7]:
alpha = 0.05
if p_value < alpha:
    print("RESULT: Reject H0 - Significant association between BU and City")
else:
    print("RESULT: Fail to reject H0 - No significant association")

RESULT: Fail to reject H0 - No significant association


2. HYPOTHESIS TESTING
Test if average discount rate > 10%(one-sample t-test)

In [8]:
discounts = df['Discount Rate (%)'].dropna()
sample_mean = discounts.mean()
sample_std = discounts.std()
n = len(discounts)

In [9]:
print("\n=== HYPOTHESIS TEST: Avg Discount Rate > 10% ===")
print(f"Sample size: {n}")
print(f"Sample mean discount: {sample_mean:.2f}%")
print(f"Sample std dev: {sample_std:.2f}%")


=== HYPOTHESIS TEST: Avg Discount Rate > 10% ===
Sample size: 450
Sample mean discount: 15.16%
Sample std dev: 4.22%


In [10]:
# H0: mu <= 10%,H1: mu > 10%(right-tailed)
t_stat, p_value_t = ttest_1samp(discounts, 10, alternative='greater')

In [11]:
print(f"\nT-statistic: {t_stat:.4f}")
print(f"P-value (right-tailed): {p_value_t:.4f}")


T-statistic: 25.9108
P-value (right-tailed): 0.0000


In [12]:
if p_value_t < alpha:
    print("RESULT: Reject H0 - Average discount rate is significantly > 10%")
else:
    print("RESULT: Fail to reject H0 - Not enough evidence discount > 10%")

RESULT: Reject H0 - Average discount rate is significantly > 10%


In [13]:
# Bonus: Quick summary stats
print("\n DATASET SUMMARY ")
print(df.groupby('BU')['Discount Rate (%)'].agg(['mean', 'count']).round(2))


 DATASET SUMMARY 
            mean  count
BU                     
FMCG       17.43    150
Lifestyle  17.47    150
Mobiles    10.56    150
