# Store Performance Trends Analysis (Absolute & Relative)

This notebook analyzes store performance using:
1.  **Absolute Metrics** (e.g., Sales per GLA): To gauge actual volume and efficiency.
2.  **Relative Indices** (Store / Mall Average): To gauge outperformance relative to the specific mall context.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import statsmodels.formula.api as smf

# Make "plotly_white" the default template for every Plotly figure created in this notebook
pio.templates.default = "plotly_white"

## 1. Load and Prepare Data

In [2]:
file_path = r'cleaned_data/fact_stores_with_info.csv'
MALL_ID = 22        # the single mall analysed in this notebook
PERIOD_FREQ = 'W'   # pandas period alias used for all time aggregation ('W' = weekly)

print(f"Loading data from {file_path}...")
df = pd.read_csv(file_path)

# Keep one mall only. `.copy()` detaches the filtered rows from the raw frame so the
# column assignments below do not raise SettingWithCopyWarning.
df = df[df['mall_id'] == MALL_ID].copy()

# Parse dates; dayfirst=True makes ambiguous values resolve as day/month.
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
# Aggregation period. NOTE: the column keeps the name `month_period` because later
# cells reference it, but with PERIOD_FREQ = 'W' it holds *weekly* periods.
df['month_period'] = df['date'].dt.to_period(PERIOD_FREQ)

print("Data loaded. Rows:", len(df))

Loading data from cleaned_data/fact_stores_with_info.csv...
Data loaded. Rows: 31184


## 2. Data Cleaning and Metric Calculation

**Methodology:**
1.  **Absolute Metrics**: Standard calculations (e.g., Sales / GLA).
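2.  **Relative Indices**: Ratio of Store Density to Mall Average Density, where the mall average is the GLA-weighted density of all stores in the same mall and aggregation period.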
    *   `Index > 1.0`: Outperforming Mall Avg.
    *   `Index < 1.0`: Underperforming Mall Avg.

In [3]:
# 1. Basic cleaning: every density below divides by GLA, so rows need a sales value
#    and a strictly positive GLA. `.copy()` avoids SettingWithCopyWarning on the
#    column assignments that follow.
df = df.dropna(subset=['sales_eur', 'gla'])
df = df[df['gla'] > 0].copy()

# Re-run safety: if this cell already ran, `df` still carries the merged mall totals.
# Merging again would suffix them (`_x` / `_y`) and the density lookups below would
# raise KeyError, so drop any previous copies first.
mall_total_cols = ['mall_total_sales', 'mall_total_footfall', 'mall_total_margin', 'mall_total_gla']
df = df.drop(columns=mall_total_cols, errors='ignore')

# Store-level margin
df['margin_eur'] = df['sales_eur'] - df['costs_eur']

# --- Absolute Metrics (per unit of GLA) ---
df['sales_per_gla'] = df['sales_eur'] / df['gla']
df['people_in_per_gla'] = df['people_in'] / df['gla']
df['margin_per_gla'] = df['margin_eur'] / df['gla']
# Capture rate = people entering / people passing the window.
# Vectorised: rows whose flow is missing or not > 0 become NaN instead of dividing.
window_flow = df['people_window_flow']
df['capture_rate'] = df['people_in'] / window_flow.where(window_flow > 0)


# --- Relative Metrics Calculation ---

# 2. Mall-level aggregation per period (`month_period` is whatever period the load
#    cell assigned). GLA is summed over every store row in the period, so each mall
#    density below is the GLA-weighted mean of the row-level densities.
print("Calculating Mall Aggregates...")
mall_stats = df.groupby(['mall_id', 'month_period']).agg(
    mall_total_sales=('sales_eur', 'sum'),
    mall_total_footfall=('people_in', 'sum'),
    mall_total_margin=('margin_eur', 'sum'),
    mall_total_gla=('gla', 'sum')
).reset_index()

# 3. Attach the mall totals to every store row of the same mall and period
df = df.merge(mall_stats, on=['mall_id', 'month_period'], how='left')

# 4. Relative indices = store density / mall density.
#    A zero mall density yields +/-inf here; the plotting helper filters those out.
mall_sales_density = df['mall_total_sales'] / df['mall_total_gla']
mall_footfall_density = df['mall_total_footfall'] / df['mall_total_gla']
mall_margin_density = df['mall_total_margin'] / df['mall_total_gla']

df['relative_sales_index'] = df['sales_per_gla'] / mall_sales_density
df['relative_footfall_index'] = df['people_in_per_gla'] / mall_footfall_density
df['relative_margin_index'] = df['margin_per_gla'] / mall_margin_density
# Dwell index: store dwell time relative to the `shopping_average_dwell_time` column
df['relative_dwell_time_index'] = df['store_average_dwell_time'] / df['shopping_average_dwell_time']

print("Metrics calculated. Sample rows:")
cols_to_show = ['store_code', 'mall_id', 'sales_per_gla', 'relative_sales_index']
display(df[cols_to_show].head())

Calculating Mall Aggregates...
Metrics calculated. Sample rows:


Unnamed: 0,store_code,mall_id,sales_per_gla,relative_sales_index
0,1092470,22,535.64635,0.778485
1,1088229,22,1672.6825,2.431005
2,1046340,22,1451.24577,2.109178
3,1095669,22,1131.104,1.643898
4,1095953,22,2037.57188,2.961319


## 3. Descriptive Statistics

In [4]:
def generate_stats(group_col, dataframe=None):
    """Summarise store count and GLA per value of `group_col`.

    Each store is counted once: the frame is de-duplicated on `store_code`
    (first row kept), so the GLA used for a store is the one on its first row.

    Parameters
    ----------
    group_col : str
        Column to group the unique stores by.
    dataframe : pandas.DataFrame, optional
        Frame to summarise. Defaults to the notebook-level `df`, which keeps
        existing `generate_stats(col)` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per group with `store_count`, `total_gla` and
        `avg_gla_per_store`. Stores whose `group_col` is missing are left out,
        because `groupby` drops NaN keys by default.
    """
    source = df if dataframe is None else dataframe
    unique_stores = source.drop_duplicates(subset=['store_code'])

    stats = unique_stores.groupby(group_col).agg(
        store_count=('store_code', 'count'),
        total_gla=('gla', 'sum'),
        avg_gla_per_store=('gla', 'mean')
    ).reset_index()

    return stats

# (printed heading, grouping column) for each descriptive table, in display order
stats_sections = [
    ("### Stats by Category (bl2_label)", 'bl2_label'),
    ("### Stats by Block Type", 'block_type'),
    ("### Stats by GLA Category", 'gla_category'),
]

for heading, stats_col in stats_sections:
    print(heading)
    display(generate_stats(stats_col))

### Stats by Category (bl2_label)


Unnamed: 0,bl2_label,store_count,total_gla,avg_gla_per_store
0,Accessories,3,275.0,91.666667
1,"Bars, Coffee and Tea breaks",4,300.0,75.0
2,Computer Products & Electronics games,1,181.0,181.0
3,Electronics and household appliances,1,63.0,63.0
4,Family Fashion,11,17705.0,1609.545455
5,Fast Meal,14,3272.0,233.714286
6,Furniture & Interior Design,2,854.0,427.0
7,Gourmet Food Products,2,78.0,39.0
8,Gym,1,1974.0,1974.0
9,Hair & Body,2,161.0,80.5


### Stats by Block Type


Unnamed: 0,block_type,store_count,total_gla,avg_gla_per_store
0,CELL,94,53175.0,565.691489
1,KIOSK,11,585.5,53.227273


### Stats by GLA Category


Unnamed: 0,gla_category,store_count,total_gla,avg_gla_per_store
0,LARGE UNITS,4,26699.0,6674.75
1,MSU,13,13273.0,1021.0
2,SMALL UNITS,88,13788.5,156.6875


## 4. Trend Analysis (Absolute & Relative)
Plots are grouped by Category Dimension. For each dimension, we see both Absolute and Relative metrics.

In [5]:
def plot_interactive_trend(dataframe, metric, metric_name, group_col):
    """Show an interactive line chart of the per-period mean of `metric`.

    Rows are averaged per (`month_period`, `group_col`) pair and drawn as one
    line per `group_col` value. The figure is displayed with `fig.show()`;
    nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain `month_period` (a Period column), `metric` and `group_col`.
    metric : str
        Column to average and plot on the y-axis.
    metric_name : str
        Human-readable label used in the title and y-axis. If it contains the
        substring "Index", a dotted reference line is drawn at y = 1.0.
    group_col : str
        Column whose values become the separate coloured lines.
    """
    # Drop rows that cannot contribute to a mean: missing metric/group, or +/-inf
    # (inf appears when a ratio's denominator is zero).
    df_plot = dataframe.dropna(subset=[metric, group_col])
    df_plot = df_plot[~df_plot[metric].isin([np.inf, -np.inf])]

    # observed=True: when `group_col` is categorical (e.g. a pd.cut bucket), only
    # emit period/category pairs that actually occur, instead of NaN rows for every
    # unobserved combination. It has no effect on non-categorical columns.
    df_agg = (
        df_plot
        .groupby(['month_period', group_col], observed=True)[metric]
        .mean()
        .reset_index()
    )
    # Plotly needs real timestamps on the x-axis, not Period objects
    df_agg['date'] = df_agg['month_period'].dt.to_timestamp()

    # Stable sort keeps the groupby's per-period group order, so legend order is deterministic
    df_agg = df_agg.sort_values('date', kind='mergesort')

    fig = px.line(df_agg, x='date', y=metric, color=group_col, markers=True,
                  title=f'{metric_name} by {group_col}',
                  labels={metric: metric_name, 'date': 'Date', group_col: group_col})

    if 'Index' in metric_name:
        fig.add_hline(y=1.0, line_dash="dot", annotation_text="Mall Average (1.0)", annotation_position="bottom right")

    fig.update_layout(hovermode="x unified")
    fig.show()

# COMBINED Metrics List
# (column, chart label) pairs. Each absolute metric is followed by its relative index;
# capture rate is listed on its own as an absolute-only metric.
metrics = [
    ('sales_per_gla', 'Sales (EUR) per GLA (Abs)'),
    ('relative_sales_index', 'Relative Sales Index (Store/Mall)'),

    ('people_in_per_gla', 'Footfall per GLA (Abs)'),
    ('relative_footfall_index', 'Relative Footfall Index (Store/Mall)'),

    ('margin_per_gla', 'Margin (EUR) per GLA (Abs)'),
    ('relative_margin_index', 'Relative Margin Index (Store/Mall)'),

    ('store_average_dwell_time', 'Average Dwell Time (min) (Abs)'),
    ('relative_dwell_time_index', 'Relative Dwell Time Index'),

    ('capture_rate', 'Capture Rate (People In / Flow)'),
]

# (grouping column, display name) pairs; one batch of charts is drawn per entry
dimensions = [
    ('bl2_label', 'Store Category'),
    ('block_type', 'Block Type'),
    ('gla_category', 'GLA Category'),
]

for dim_col, dim_name in dimensions:
    # Banner separating the chart batches in the cell output
    section_banner = f"\n{'#'*20} TRENDS BY {dim_name.upper()} ({dim_col}) {'#'*20}\n"
    print(section_banner)
    for metric, metric_name in metrics:
        plot_interactive_trend(
            dataframe=df,
            metric=metric,
            metric_name=metric_name,
            group_col=dim_col,
        )


#################### TRENDS BY STORE CATEGORY (bl2_label) ####################




#################### TRENDS BY BLOCK TYPE (block_type) ####################




#################### TRENDS BY GLA CATEGORY (gla_category) ####################



## 5. Impact of SRI Score Over Time (Absolute & Relative)
Analyzing performance trends by `sri_bucket` for both absolute and relative metrics.

In [6]:
# Filter for SRI Analysis
# Keep only rows that have an SRI score, then bucket the score into five 20-point bands.
df_sri = df.dropna(subset=['sri_score']).copy()

sri_bin_edges = [0, 20, 40, 60, 80, 100]
sri_bin_labels = ['0-20', '20-40', '40-60', '60-80', '80-100']
# include_lowest=True closes the first band on the left, so a score of exactly 0 lands
# in '0-20'. With the default it maps to NaN and the row is silently dropped from
# every chart below.
df_sri['sri_bucket'] = pd.cut(
    df_sri['sri_score'],
    bins=sri_bin_edges,
    labels=sri_bin_labels,
    include_lowest=True,
)

# One trend chart per metric, with one line per SRI bucket
for metric, label in metrics:
    plot_interactive_trend(df_sri, metric, label, 'sri_bucket')





































## 6. Statistical Analysis: SRI Impact (Absolute & Relative)
Regression to check if SRI impacts Absolute Performance levels/trends AND Relative Outperformance.

In [7]:
# NOTE(review): every line of this cell is commented out, so this section currently
# produces no regression output. If re-enabled, it needs `df_sri` and `metrics` from
# the earlier cells and the top-level `smf` import. The formula controls for
# `bl1_label`, whereas the rest of the notebook groups by `bl2_label` — confirm which
# column is intended before uncommenting.

# df_sri['date_numeric'] = (df_sri['date'] - df_sri['date'].min()).dt.days

# def run_regression_advanced(metric, metric_name):
#     print(f"--- Regression for {metric_name} ---")
#     import numpy as np
#     data_reg = df_sri.dropna(subset=[metric, 'bl1_label'])
#     data_reg = data_reg[~data_reg[metric].isin([np.inf, -np.inf])]
    
#     model = smf.ols(f'{metric} ~ sri_score * date_numeric + C(bl1_label)', data=data_reg).fit()
    
#     print(model.summary())
#     print("\n" + "="*80 + "\n")

# for metric, label in metrics:
#     run_regression_advanced(metric, label)

## 7. Statistical Analysis: Unified Regression of Categories
Independent impact of Store Categories on both Absolute Values and Relative Indices.

In [8]:
# NOTE(review): every line of this cell is commented out, so this section currently
# produces no regression output. If re-enabled, it needs `df` and `metrics` from the
# earlier cells and the top-level `smf` import. The formula uses `bl1_label` as the
# store-category term, whereas the trend and descriptive sections use `bl2_label` —
# confirm which column is intended before uncommenting.

# df['date_numeric'] = (df['date'] - df['date'].min()).dt.days

# def run_unified_regression(metric, metric_name):
#     print(f"--- Unified Regression for {metric_name} ---")
#     import numpy as np
#     data_reg = df.dropna(subset=[metric, 'bl1_label', 'block_type', 'gla_category'])
#     data_reg = data_reg[~data_reg[metric].isin([np.inf, -np.inf])]

#     formula = f'{metric} ~ date_numeric * (C(bl1_label) + C(block_type) + C(gla_category))'
    
#     model = smf.ols(formula, data=data_reg).fit()
    
#     print(model.summary())
#     print("\n" + "="*80 + "\n")

# print(f"\n{'#'*20} UNIFIED REGRESSION ANALYSIS {'#'*20}\n")
# for metric, label in metrics:
#     run_unified_regression(metric, label)