# Lipstick Index Analysis: Hypothesis Testing and Machine Learning

In [None]:

# Lipstick Index Analysis: EDA, Hypothesis Test, and ML

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- LOAD DATA ---
# Input files, resolved relative to the current working directory.
SHOPPING_CSV = 'customer_shopping_data.csv'
GDP_CSV = 'API_NY.GDP.MKTP.CD_DS2_en_csv_v2_19294.csv'
CPI_CSV = 'TURCPALTT01IXNBM.csv'

# Shopping records; narrowed to the 'Cosmetics' category further down.
cosmetic_data = pd.read_csv(SHOPPING_CSV)
# skiprows=4 drops the first four lines of the file before the header is read
# (presumably a metadata preamble -- confirm against the file).
gdp_data = pd.read_csv(GDP_CSV, skiprows=4)
# CPI series; its columns are renamed to DATE / CPI in the cleaning step.
cpi_data = pd.read_csv(CPI_CSV)

# --- CLEAN & TRANSFORM ---
# Cosmetics transactions: parse invoice dates as day-first, keep only the
# 'Cosmetics' category and derive the revenue of each row.
cosmetic_data['invoice_date'] = pd.to_datetime(cosmetic_data['invoice_date'], dayfirst=True)
# .copy() makes the filtered frame independent of the original, so adding the
# 'Sales' column below is not a write into a slice (no SettingWithCopyWarning).
cosmetic_data = cosmetic_data[cosmetic_data['category'] == 'Cosmetics'].copy()
cosmetic_data['Sales'] = cosmetic_data['quantity'] * cosmetic_data['price']

# GDP: the file is wide (one column per year). Keep the columns whose name is
# all digits, take the 'Turkiye' row and transpose it into long Year/GDP form.
# NOTE(review): assigning exactly two column names assumes the country filter
# matches exactly one row -- confirm the spelling used in the downloaded file.
year_cols = [col for col in gdp_data.columns if col.isdigit()]
gdp_turkey = gdp_data[gdp_data['Country Name'] == 'Turkiye'][year_cols].T.reset_index()
gdp_turkey.columns = ['Year', 'GDP']
gdp_turkey['Year'] = gdp_turkey['Year'].astype(int)
# Non-numeric GDP cells become NaN; such years are dropped after the merge.
gdp_turkey['GDP'] = pd.to_numeric(gdp_turkey['GDP'], errors='coerce')

# CPI: rename the two columns, derive the calendar year, and average per year.
cpi_data.columns = ['DATE', 'CPI']
cpi_data['DATE'] = pd.to_datetime(cpi_data['DATE'])
cpi_data['Year'] = cpi_data['DATE'].dt.year
# Despite its name, `monthly_cpi` holds one mean CPI value per year.
# Only 'CPI' is averaged: taking the mean of the whole frame would also
# average the datetime 'DATE' column (behaviour that differs between pandas
# versions) and carry a meaningless 'DATE' column into the merged frame.
monthly_cpi = cpi_data.groupby('Year')['CPI'].mean().reset_index()

# Aggregate cosmetic sales
# Month-end totals of 'Sales'. The column is selected BEFORE summing so that
# only 'Sales' is aggregated; summing the whole frame first would also "sum"
# every other column (string columns get concatenated on recent pandas) just
# to throw the result away.
# NOTE: the 'M' alias is deprecated in pandas 2.2 in favour of 'ME'; it is
# kept here so the script still runs on older pandas versions.
monthly_sales = (
    cosmetic_data.resample('M', on='invoice_date')['Sales']
    .sum()
    .reset_index()
)
# Calendar year of each month, used as the join key for GDP and CPI.
monthly_sales['Year'] = monthly_sales['invoice_date'].dt.year

# --- MERGE ---
# Attach the per-year GDP and CPI values to every monthly sales row via left
# joins on 'Year', then drop the rows for which either indicator is missing.
df = (
    monthly_sales
    .merge(gdp_turkey, how='left', on='Year')
    .merge(monthly_cpi, how='left', on='Year')
    .dropna(subset=['GDP', 'CPI'])
)

# --- EXPLORATORY DATA ANALYSIS ---
# One 12x5 line chart per series, drawn from the merged frame `df`.
# Each entry: (x column, y column, title, x-axis label, y-axis label).
line_charts = (
    ('invoice_date', 'Sales', 'Cosmetic Sales Over Time', 'Date', 'Sales'),
    ('Year', 'GDP', 'Turkey GDP Over Time', 'Year', 'GDP'),
    ('Year', 'CPI', 'Turkey CPI Over Time', 'Year', 'CPI'),
)

for x_col, y_col, chart_title, x_label, y_label in line_charts:
    plt.figure(figsize=(12, 5))
    plt.plot(df[x_col], df[y_col])
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid()
    plt.show()

# Correlation Matrix
# Pairwise correlations between sales and the two macro indicators, shown as
# an annotated heatmap.
corr_columns = ['Sales', 'GDP', 'CPI']
corr_matrix = df.loc[:, corr_columns].corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

# --- HYPOTHESIS TESTING ---
# Pearson correlation between GDP and Sales, judged against a fixed
# significance threshold.
ALPHA = 0.05

corr, p_val = pearsonr(df['GDP'], df['Sales'])
print(f"Correlation between GDP and Sales: {corr:.3f}")
print(f"P-value: {p_val:.4f}")

verdict = (
    "Statistically significant relationship"
    if p_val < ALPHA
    else "No statistically significant relationship"
)
print(verdict)

# --- ML METHODS ---
# Linear regression of Sales on standardised GDP and CPI, evaluated on a
# held-out 20% split.

# z-score both predictors (subtract the mean, divide by the standard deviation).
# NOTE(review): the mean/std are computed over the full frame, i.e. before the
# train/test split, so test rows contribute to the scaling. For ordinary least
# squares with an intercept this should not alter the predictions, but the
# statistics must come from the training rows only if a scale-sensitive model
# (regularised, distance-based, ...) is ever substituted here.
df['CPI_scaled'] = (df['CPI'] - df['CPI'].mean()) / df['CPI'].std()
df['GDP_scaled'] = (df['GDP'] - df['GDP'].mean()) / df['GDP'].std()

X = df[['GDP_scaled', 'CPI_scaled']]
y = df['Sales']
# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE is taken as the square root of the MSE. The former `squared=False`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where passing
# it raises a TypeError; this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"\nModel RMSE: {rmse:.2f}")
print(f"Model R^2 Score: {r2:.2f}")

# --- INSIGHTS ---
# EDA shows an overall increase in cosmetic sales with economic fluctuation.
# Correlation test: weak-to-moderate negative correlation between GDP and Sales (r = -0.353),
# but it is not statistically significant at the 0.05 level (p = 0.071).
# ML regression: poor fit (R^2 = -0.14, i.e. worse on the test set than always predicting the mean), high RMSE (~19594).
# This indicates that GDP and CPI alone are not good predictors of cosmetic sales.


## Summary of Results

- Correlation (GDP vs Sales): -0.353
- P-value: 0.071 → Not statistically significant at the 0.05 level
- RMSE: ~19,594
- R² Score: -0.14

GDP and CPI alone are weak predictors of cosmetic sales.
