In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [49]:
df=pd.read_csv('Customer Purchasing Behaviors.csv')
grouped = df.drop(columns=['user_id']).groupby(df['region'].map(lambda x: 'North-East' if x in ['North', 'East'] else 'South-West'))

In [50]:
# Box M test 
df1 = grouped.get_group('North-East')
df2 = grouped.get_group('South-West')

# Calculate covariance matrices
S1 = df1[['age', 'annual_income', 'purchase_amount', 'loyalty_score', 'purchase_frequency']].cov().to_numpy()
S2 = df2[['age', 'annual_income', 'purchase_amount', 'loyalty_score', 'purchase_frequency']].cov().to_numpy()

# Calculate the pooled covariance matrix
n1 = len(df1)
n2 = len(df2)

S_pooled = ((n1-1)*S1 + (n2-1)*S2) / (n1 + n2 - 2)

# Calculate the Box M statistic
M = (n1 + n2 -2 ) * np.log(np.linalg.det(S_pooled)) - (n1 - 1) * np.log(np.linalg.det(S1)) - (n2 - 1) * np.log(np.linalg.det(S2))

# Correction factor
p = 5 # number of variables
k = 2 # number of groups

N = n1 + n2

correction_factor = 1 - ((2*p**2 + 3*p - 1) / (6*(p+1)*(k-1))) * (1/(n1-1) + 1/(n2-1) - 1/(n1+n2-2))
dof = ((p*(p+1))//2) * (k-1)

M = M*correction_factor

# Calculate the critical value
from scipy.stats import chi2
p_value = 1 - chi2.cdf(M, dof)

print(f'Box M statistic: {M}')
print(f'Degrees of freedom: {dof}')
print(f'p-value: {p_value}')

import pingouin as pg

# Box M test
df1['group'] = 'A'
df2['group'] = 'B'

# Combine both dataframes
df_combined = pd.concat([df1, df2], ignore_index=True)
numeric_columns = ['age', 'annual_income', 'purchase_amount', 'loyalty_score', 'purchase_frequency']
# Perform Box's M test on the combined dataframe
result = pg.box_m(data=df_combined, dvs=numeric_columns, group='group')
print(result)
# print attributes of the result object
# print(dir(result))
# print(result.__dict__)


Box M statistic: 100.77601236430358
Degrees of freedom: 15
p-value: 9.325873406851315e-15
           Chi2    df          pval  equal_cov
box  100.776012  15.0  9.297145e-15      False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['group'] = 'A'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['group'] = 'B'


In [51]:
# Likelihood ratio test using Wilks' Lambda
# print(1/np.linalg.det(S1))
# print(S1)
det_S1 = np.linalg.det(S1)
det_S2 = np.linalg.det(S2)
det_S_pooled = np.linalg.det(S_pooled)

Lambda = (det_S1 / det_S_pooled)**((n1 - 1) / 2) * (det_S2 / det_S_pooled)**((n2 - 1) / 2)


# Test statistic
chi2_stat = -2 * np.log(Lambda)

# Degrees of freedom
p = 5  # Number of variables
df = (p * (p + 1) / 2) - 2  # Two groups

# p-value
p_value = 1 - chi2.cdf(chi2_stat, df)

print(f'Wilks\' Lambda: {Lambda}')
print(f'Chi2 statistic: {chi2_stat}')
print(f'Degrees of freedom: {df}')
print(f'p-value: {p_value}')

Wilks' Lambda: 3.499576610159108e-23
Chi2 statistic: 103.41363029242187
Degrees of freedom: 13.0
p-value: 3.3306690738754696e-16


M 103.41363029242166
