In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from scipy.stats import shapiro, chi2
from scipy.spatial.distance import mahalanobis

In [2]:
df=pd.read_csv('Customer Purchasing Behaviors.csv')
grouped = df.drop(columns=['user_id']).groupby(df['region'].map(lambda x: 'North-East' if x in ['North', 'East'] else 'South-West'))

In [3]:
df1 = grouped.get_group('North-East')
df2 = grouped.get_group('South-West')

numeric_columns = ['age', 'annual_income', 'purchase_amount', 'loyalty_score', 'purchase_frequency']

mean1 = df1[numeric_columns].mean()
mean2 = df2[numeric_columns].mean()

print("Mean values for North-East region:", mean1)
print("Mean values for South-West region:", mean2)
# Perform Welch's t-test for each numeric attribute
results = {}
for column in numeric_columns:
    # Calculate Welch's t-test
    t_stat, p_value = stats.ttest_ind(df1[column], df2[column], equal_var=False)
    
    # Store results
    results[column] = {
        't-statistic': t_stat,
        'p-value': p_value
    }

# Print results
for column, result in results.items():
    print(f"Attribute: {column}")
    print(f"  t-statistic: {result['t-statistic']:.4f}")
    print(f"  p-value: {result['p-value']:.4f}")
    if result['p-value'] < 0.05:
        print("  Result: Reject the null hypothesis (means are significantly different)\n")
    else:
        print("  Result: Fail to reject the null hypothesis (means are not significantly different)\n")

Mean values for North-East region: age                      34.166667
annual_income         52750.000000
purchase_amount         355.833333
loyalty_score             5.832143
purchase_frequency       17.583333
dtype: float64
Mean values for South-West region: age                      41.136364
annual_income         59948.051948
purchase_amount         463.701299
loyalty_score             7.318831
purchase_frequency       21.006494
dtype: float64
Attribute: age
  t-statistic: -6.0904
  p-value: 0.0000
  Result: Reject the null hypothesis (means are significantly different)

Attribute: annual_income
  t-statistic: -5.1589
  p-value: 0.0000
  Result: Reject the null hypothesis (means are significantly different)

Attribute: purchase_amount
  t-statistic: -6.2384
  p-value: 0.0000
  Result: Reject the null hypothesis (means are significantly different)

Attribute: loyalty_score
  t-statistic: -6.2901
  p-value: 0.0000
  Result: Reject the null hypothesis (means are significantly different)

In [5]:
import numpy as np
from scipy.stats import f

# Assuming df1 and df2 are your two dataframes with the same numeric attributes
# Extract the numeric columns
numeric_columns = ['age', 'annual_income', 'purchase_amount', 'loyalty_score', 'purchase_frequency']
X1 = df1[numeric_columns].to_numpy()
X2 = df2[numeric_columns].to_numpy()

# Calculate means and covariance matrices
mean1 = np.mean(X1, axis=0)
mean2 = np.mean(X2, axis=0)
S1 = np.cov(X1, rowvar=False)
S2 = np.cov(X2, rowvar=False)
n1, n2 = X1.shape[0], X2.shape[0]
p = X1.shape[1]

# Calculate the difference in means and pooled covariance matrix
mean_diff = mean1 - mean2
pooled_cov = S1 / n1 + S2 / n2

# Invert pooled covariance, handling potential singular matrix
try:
    inv_pooled_cov = np.linalg.inv(pooled_cov)
except np.linalg.LinAlgError:
    inv_pooled_cov = np.linalg.pinv(pooled_cov)  # Use pseudo-inverse if singular

# Calculate James's Test statistic
J = mean_diff.T @ inv_pooled_cov @ mean_diff

# Degrees of freedom for F-distribution approximation
df_num = p  # Numerator degrees of freedom
df_den = ((n1 + n2 - 2) * (p + 1)) / (2 * (n1 + n2 - p - 1))  # Denominator degrees of freedom

# Calculate critical F-value for a 95% confidence level
F_critical = f.ppf(0.95, df_num, df_den)
p_value = 1 - f.cdf(J, df_num, df_den)

# Print results
print("James's Test Statistic:", J)
print("Critical F-value:", F_critical)
print("p-value:", p_value)
print("Result:", "Reject null hypothesis" if J > F_critical else "Fail to reject null hypothesis")


James's Test Statistic: 78.57581647093185
Critical F-value: 8.7901260689275
p-value: 0.0020406128044438976
Result: Reject null hypothesis
