# Impact of SRI Score on Store Performance

This notebook investigates the relationship between the Socially Responsible Investment (SRI) score and store performance metrics (Sales per GLA, Footfall per GLA).

## 1. Setup and Data Loading

In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import statsmodels.formula.api as smf
import numpy as np

# Set the default Plotly template used by every figure in this notebook
pio.templates.default = "plotly_white"

# --- Configuration ---
# Relative path: resolved against the current working directory of the kernel.
file_path = r'cleaned_data/fact_stores_with_info.csv'
# Mall analysed in this notebook; drives both the row filter and the log message.
MALL_ID = 22

# --- Load ---
print(f"Loading data from {file_path}...")
df = pd.read_csv(file_path)

# Keep only the selected mall. `.copy()` makes the subset independent of `df`,
# so the column assignments below write to this frame only.
df_mall22 = df[df['mall_id'] == MALL_ID].copy()

# Fail fast: an empty subset would otherwise surface later as empty plots or
# obscure regression errors.
if df_mall22.empty:
    raise ValueError(f"No rows found for mall_id == {MALL_ID} in {file_path}")

# --- Date conversion ---
# NOTE(review): dayfirst=True only matters for ambiguous day/month strings —
# confirm it matches the raw date format in the CSV.
df_mall22['date'] = pd.to_datetime(df_mall22['date'], dayfirst=True)
df_mall22['month_period'] = df_mall22['date'].dt.to_period('M')

print(f"Data filtered for Mall {MALL_ID}. Rows: {len(df_mall22)}")
df_mall22.head()

Loading data from cleaned_data/fact_stores_with_info.csv...
Data filtered for Mall 22. Rows: 31184


Unnamed: 0,date,mall_id,block_id,store_code,retailer_id,people_in,people_window_flow,store_average_dwell_time,store_median_dwell_time,shopping_average_dwell_time,...,store_name,cur_code,sales_r12m,total_costs_r12m,is_subsidized,fx_rate,sales_eur,costs_eur,sri_score,month_period
38,2024-07-01,22,6681,1092470,350.0,1190,12294,2.467935,2.122608,50.322049,...,ONLY,EUR,107129.27,35134.58683,0.0,1.0,107129.27,35134.58683,36.0,2024-07
99,2024-07-01,22,16723,1088229,81603.0,204,16140,,,,...,MANGOS JUICE & FOOD,EUR,70252.665,12659.7204,0.0,1.0,70252.665,12659.7204,,2024-07
118,2024-07-01,22,6687,1046340,513.0,3132,37769,2.415688,2.077671,56.615361,...,FOOT LOCKER,EUR,354103.968,52602.68163,0.0,1.0,354103.968,52602.68163,32.3,2024-07
124,2024-07-01,22,16716,1095669,8636.0,170,11994,,,,...,STARBUCKS COFFEE,EUR,27146.496,3987.09395,0.0,1.0,27146.496,3987.09395,,2024-07
126,2024-07-01,22,16721,1095953,55270.0,430,17994,,,,...,YOGURT FACTORY,EUR,50939.297,8856.81888,0.0,1.0,50939.297,8856.81888,,2024-07


## 2. Data Cleaning & Metric Calculation

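We filter out rows with missing footfall (`people_in`), GLA, or SRI scores, as well as rows with non-positive GLA or a missing average store dwell time, to ensure valid analysis. Rows with missing sales are not filtered out at this step. We calculate standard densities (per GLA) and the capture rate (`people_in` / `people_window_flow`), and group SRI scores into 20-point buckets.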

In [2]:
# Keep only rows where footfall, GLA and SRI score are all present and GLA is
# strictly positive, so the per-GLA ratios below are well defined.
# `.copy()` yields an independent frame, so the column assignments that follow
# cannot raise SettingWithCopyWarning.
required_cols = ['people_in', 'gla', 'sri_score']
df_clean = df_mall22.dropna(subset=required_cols)
df_clean = df_clean[df_clean['gla'] > 0].copy()

# Performance metrics (densities per unit of GLA)
df_clean['sales_per_gla'] = df_clean['sales_eur'] / df_clean['gla']
df_clean['footfall_per_gla'] = df_clean['people_in'] / df_clean['gla']

# Capture rate = people_in / people_window_flow.
# Vectorised: a missing or non-positive window flow is masked to NaN first, so
# the ratio is NaN for those rows and the column always has a float dtype.
window_flow = df_clean['people_window_flow']
df_clean['capture_rate'] = df_clean['people_in'] / window_flow.where(window_flow > 0)

# Rows without a store dwell time are removed from ALL downstream analyses
# (including sales and footfall), not only from the dwell-time ones.
df_clean = df_clean.dropna(subset=['store_average_dwell_time'])

# SRI buckets for grouping. `include_lowest=True` closes the first interval on
# the left ([0, 20]) so a score of exactly 0 is bucketed instead of becoming NaN
# and being silently dropped by the group-bys that key on `sri_bucket`.
sri_bin_edges = [0, 20, 40, 60, 80, 100]
sri_bin_labels = ['0-20', '20-40', '40-60', '60-80', '80-100']
df_clean['sri_bucket'] = pd.cut(
    df_clean['sri_score'],
    bins=sri_bin_edges,
    labels=sri_bin_labels,
    include_lowest=True,
)

print(f"Cleaned data rows: {len(df_clean)}")
df_clean[['store_code', 'sri_score', 'sales_per_gla', 'footfall_per_gla', 'capture_rate', 'store_average_dwell_time']].head()

Cleaned data rows: 17373


Unnamed: 0,store_code,sri_score,sales_per_gla,footfall_per_gla,capture_rate,store_average_dwell_time
38,1092470,36.0,535.64635,5.95,0.096795,2.467935
118,1046340,32.3,1451.24577,12.836066,0.082925,2.415688
199,1311890,24.0,357.335783,1.564498,0.1458,9.271159
249,3089,30.9,400.456066,1.584362,0.040957,5.832578
256,1068192,30.1,752.732534,6.642241,0.041494,3.520554


## 3. Visual Analysis

### 3.1 Scatter Plots: SRI Score vs Performance
We plot the relationship directly. We aggregate to store level (one row per store, sector, and SRI bucket) to get an average view per store, reducing noise from weekly fluctuations.

In [3]:
# Collapse the cleaned rows to one row per (store, sector, SRI bucket) and
# average every metric, so each point in the scatters below is a store-level mean.
store_group_keys = ['store_code', 'bl1_label', 'sri_bucket']
store_agg = (
    df_clean
    .groupby(store_group_keys, observed=True)
    .agg(
        avg_sri_score=pd.NamedAgg(column='sri_score', aggfunc='mean'),
        avg_sales_per_gla=pd.NamedAgg(column='sales_per_gla', aggfunc='mean'),
        avg_footfall_per_gla=pd.NamedAgg(column='footfall_per_gla', aggfunc='mean'),
        avg_capture_rate=pd.NamedAgg(column='capture_rate', aggfunc='mean'),
        avg_dwell_time=pd.NamedAgg(column='store_average_dwell_time', aggfunc='mean'),
    )
    .reset_index()
)

# Arguments shared by all four scatters: SRI on the x-axis, one colour (and one
# OLS trendline) per sector.
sri_axis_label = 'Average SRI Score'
scatter_common = dict(
    data_frame=store_agg,
    x='avg_sri_score',
    color='bl1_label',
    trendline='ols',
)

# 1. SRI vs Sales Density
fig_sales = px.scatter(
    y='avg_sales_per_gla',
    title='SRI Score vs Sales Density (per Store)',
    labels={'avg_sri_score': sri_axis_label, 'avg_sales_per_gla': 'Avg Sales / GLA (EUR)'},
    **scatter_common,
)
fig_sales.show()

# 2. SRI vs Footfall Density
fig_footfall = px.scatter(
    y='avg_footfall_per_gla',
    title='SRI Score vs Footfall Density (per Store)',
    labels={'avg_sri_score': sri_axis_label, 'avg_footfall_per_gla': 'Avg Footfall / GLA'},
    **scatter_common,
)
fig_footfall.show()

# 3. SRI vs Capture Rate
fig_capture = px.scatter(
    y='avg_capture_rate',
    title='SRI Score vs Capture Rate (per Store)',
    labels={'avg_sri_score': sri_axis_label, 'avg_capture_rate': 'Capture Rate'},
    **scatter_common,
)
fig_capture.show()

# 4. SRI vs Dwell Time
fig_dwell = px.scatter(
    y='avg_dwell_time',
    title='SRI Score vs Average Dwell Time (per Store)',
    labels={'avg_sri_score': sri_axis_label, 'avg_dwell_time': 'Avg Dwell Time (min)'},
    **scatter_common,
)
fig_dwell.show()

In [4]:
# Store-level rows for the two sectors examined individually
is_luxury = store_agg['bl1_label'].eq('Luxury')
is_sport = store_agg['bl1_label'].eq('Sport')
luxury_data = store_agg.loc[is_luxury]
sport_data = store_agg.loc[is_sport]

# Both charts plot SRI against sales density with an OLS trendline and share
# the same axis labels and wide, flattened canvas; only data, title and colour differ.
sector_scatter_common = dict(
    x='avg_sri_score',
    y='avg_sales_per_gla',
    trendline='ols',
    labels={'avg_sri_score': 'Average SRI Score', 'avg_sales_per_gla': 'Avg Sales / GLA (EUR)'},
    width=1200,
    height=400,
)

# 1. Luxury: SRI vs Sales Density (gold markers)
fig_luxury = px.scatter(
    luxury_data,
    title='Luxury: SRI Score vs Sales Density',
    color_discrete_sequence=['gold'],
    **sector_scatter_common,
)
fig_luxury.show()

# 2. Sport: SRI vs Sales Density (blue markers)
fig_sport = px.scatter(
    sport_data,
    title='Sport: SRI Score vs Sales Density',
    color_discrete_sequence=['blue'],
    **sector_scatter_common,
)
fig_sport.show()

### 3.2 SRI Buckets Distribution
Analyzing the distribution of performance metrics across SRI buckets.

In [5]:
# SRI buckets in ascending order; these are the labels used when `sri_bucket`
# was created. Plotly Express otherwise orders categories by first appearance
# in the data, which can scramble the buckets on the x-axis and in the legend.
sri_bucket_order = ['0-20', '20-40', '40-60', '60-80', '80-100']

# Arguments shared by every box plot: one box per SRI bucket, coloured by
# bucket, with all individual store points overlaid.
box_common = dict(
    data_frame=store_agg,
    x='sri_bucket',
    color='sri_bucket',
    points="all",
    category_orders={'sri_bucket': sri_bucket_order},
)

# Box Plot for Sales Density by SRI Bucket
# (title fixed: it previously read "Distribution either by SRI Bucket")
fig_bp_sales = px.box(
    y='avg_sales_per_gla',
    title='Sales Density Distribution by SRI Bucket',
    **box_common,
)
fig_bp_sales.show()

# Box Plot for Footfall Density by SRI Bucket
fig_bp_ff = px.box(
    y='avg_footfall_per_gla',
    title='Footfall Density Distribution by SRI Bucket',
    **box_common,
)
fig_bp_ff.show()

# Box Plot for Capture Rate by SRI Bucket
fig_bp_cr = px.box(
    y='avg_capture_rate',
    title='Capture Rate Distribution by SRI Bucket',
    **box_common,
)
fig_bp_cr.show()

# Box Plot for Dwell Time by SRI Bucket
fig_bp_dt = px.box(
    y='avg_dwell_time',
    title='Dwell Time Distribution by SRI Bucket',
    **box_common,
)
fig_bp_dt.show()

## 4. Statistical Impact Analysis

### 4.1 Correlation Matrix
What is the linear correlation between SRI Score and our key metrics?

In [6]:
# Pairwise correlations between the store-level averages
corr_columns = [
    'avg_sri_score',
    'avg_sales_per_gla',
    'avg_footfall_per_gla',
    'avg_capture_rate',
    'avg_dwell_time',
]
correlation_matrix = store_agg.loc[:, corr_columns].corr()
print("Correlation Matrix (Store Level averages):")
display(correlation_matrix)

# Heatmap for slides: diverging red/blue scale pinned to [-1, 1] so that zero
# correlation sits at the neutral midpoint; each cell is annotated to 2 decimals.
fig_corr = px.imshow(
    correlation_matrix,
    text_auto='.2f',
    title="Correlation Matrix: SRI vs Performance Metrics",
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)

# Drop the axis titles (the tick labels already name the metrics) and size the
# figure for presentation.
heatmap_layout = dict(
    xaxis_title=None,
    yaxis_title=None,
    title_font_size=20,
    width=800,
    height=600,
)
fig_corr.update_layout(**heatmap_layout)
fig_corr.show()

Correlation Matrix (Store Level averages):


Unnamed: 0,avg_sri_score,avg_sales_per_gla,avg_footfall_per_gla,avg_capture_rate,avg_dwell_time
avg_sri_score,1.0,0.239578,-0.164489,0.129599,0.180046
avg_sales_per_gla,0.239578,1.0,0.628267,-0.033474,-0.001966
avg_footfall_per_gla,-0.164489,0.628267,1.0,-0.114123,-0.167623
avg_capture_rate,0.129599,-0.033474,-0.114123,1.0,0.634221
avg_dwell_time,0.180046,-0.001966,-0.167623,0.634221,1.0


### 4.2 Regression Analysis
Using OLS regression to control for Category (Sector) effects, as different sectors naturally have different sales/footfall densities and different typical SRI scores.

Model: `Performance ~ SRI_Score + Category`

In [7]:
# Continuous variables to convert to z-scores: (value - mean) / std
cols_to_standardize = ['avg_sri_score', 'avg_sales_per_gla', 'avg_footfall_per_gla', 'avg_capture_rate', 'avg_dwell_time']

# Add one `z_<name>` column to `store_agg` per source column, remembering the
# new names for the report below.
z_columns = []
for source_col in cols_to_standardize:
    values = store_agg[source_col]
    z_name = f'z_{source_col}'
    store_agg[z_name] = (values - values.mean()) / values.std()
    z_columns.append(z_name)

print("Standardized columns added:")
print(z_columns)
store_agg[z_columns].head()

Standardized columns added:
['z_avg_sri_score', 'z_avg_sales_per_gla', 'z_avg_footfall_per_gla', 'z_avg_capture_rate', 'z_avg_dwell_time']


Unnamed: 0,z_avg_sri_score,z_avg_sales_per_gla,z_avg_footfall_per_gla,z_avg_capture_rate,z_avg_dwell_time
0,0.732971,-0.731376,0.463004,-0.199576,0.141006
1,1.847483,1.150844,-0.041699,0.325188,1.597632
2,1.385509,-0.886744,-0.59013,0.206408,-0.067611
3,-0.56633,0.373823,0.531315,-0.532036,0.206711
4,0.086208,-0.887529,-1.062493,-0.535823,-0.434246


In [8]:
def _fit_and_report(header, formula):
    """Print `header`, fit the OLS `formula` on `store_agg`, print and return the fit."""
    print(header)
    fitted = smf.ols(formula, data=store_agg).fit()
    print(fitted.summary())
    return fitted


# Visual separator printed between consecutive regression summaries
_summary_rule = "\n" + "="*80 + "\n"

# Regression for Sales Density
model_sales = _fit_and_report(
    "--- OLS Regression Results: Sales per GLA (Standardized) ---",
    'z_avg_sales_per_gla ~ z_avg_sri_score + C(bl1_label)',
)

print(_summary_rule)

# Regression for Footfall Density
model_footfall = _fit_and_report(
    "--- OLS Regression Results: Footfall per GLA (Standardized) ---",
    'z_avg_footfall_per_gla ~ z_avg_sri_score + C(bl1_label)',
)

print(_summary_rule)

# Regression for Capture Rate
model_capture = _fit_and_report(
    "--- OLS Regression Results: Capture Rate (Standardized) ---",
    'z_avg_capture_rate ~ z_avg_sri_score + C(bl1_label)',
)

print(_summary_rule)

# Regression for Dwell Time
model_dwell = _fit_and_report(
    "--- OLS Regression Results: Dwell Time (Standardized) ---",
    'z_avg_dwell_time ~ z_avg_sri_score + C(bl1_label)',
)

--- OLS Regression Results: Sales per GLA (Standardized) ---
                             OLS Regression Results                            
Dep. Variable:     z_avg_sales_per_gla   R-squared:                       0.384
Model:                             OLS   Adj. R-squared:                  0.308
Method:                  Least Squares   F-statistic:                     5.086
Date:                 Sun, 18 Jan 2026   Prob (F-statistic):           0.000398
Time:                         18:55:08   Log-Likelihood:                -65.400
No. Observations:                   56   AIC:                             144.8
Df Residuals:                       49   BIC:                             159.0
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------

## 5. Temporal Trend Analysis by Sector & SRI
Is a high SRI score helping performance over time? We analyze this by looking at weekly trends for each sector, segmented by SRI bucket.

In [9]:
# 1. Tag every cleaned row with the weekly period its date falls in
df_clean['week_period'] = df_clean['date'].dt.to_period('W')

# 2. One row per (week, sector, SRI bucket): mean of each metric plus the
#    number of distinct stores contributing to that row
weekly_group_keys = ['week_period', 'bl1_label', 'sri_bucket']
weekly_aggregations = {
    'avg_footfall_per_gla': ('footfall_per_gla', 'mean'),
    'avg_capture_rate': ('capture_rate', 'mean'),
    'avg_dwell_time': ('store_average_dwell_time', 'mean'),
    'store_count': ('store_code', 'nunique'),
}
weekly_sector_trends = (
    df_clean
    .groupby(weekly_group_keys, observed=True)
    .agg(**weekly_aggregations)
    .reset_index()
)

# Plotting needs real timestamps rather than Period objects
weekly_sector_trends['date'] = weekly_sector_trends['week_period'].dt.to_timestamp()

# 3. Plotting Function
def plot_sector_trends(metric, metric_name, data=None):
    """Show one line chart per sector, with one line per SRI bucket.

    Parameters
    ----------
    metric : str
        Name of the column to plot on the y-axis.
    metric_name : str
        Human-readable metric name used in the chart title and y-axis label.
    data : pandas.DataFrame, optional
        Frame with 'date', 'bl1_label', 'sri_bucket' and `metric` columns.
        Defaults to the notebook-level `weekly_sector_trends`, which keeps
        existing two-argument calls working unchanged.

    Returns
    -------
    None
        Each figure is displayed with `fig.show()`.
    """
    trends = weekly_sector_trends if data is None else data

    # Fix the bucket order for the legend. Plotly Express otherwise orders
    # categories by first appearance, which varies from sector to sector.
    buckets = trends['sri_bucket']
    if isinstance(buckets.dtype, pd.CategoricalDtype):
        bucket_order = list(buckets.cat.categories)
    else:
        bucket_order = sorted(buckets.dropna().unique())

    # dropna(): a NaN sector can never match the equality filter below, so it
    # would only ever yield an empty frame; skipping it up front replaces the
    # former `len(data_sector) == 0` guard.
    for sector in trends['bl1_label'].dropna().unique():
        data_sector = trends[trends['bl1_label'] == sector]

        fig = px.line(data_sector, x='date', y=metric, color='sri_bucket',
                      title=f'{metric_name} Trend by SRI Bucket - Sector: {sector}',
                      labels={metric: metric_name, 'date': 'Date', 'sri_bucket': 'SRI Score Bucket'},
                      category_orders={'sri_bucket': bucket_order},
                      markers=True)
        # One shared hover box listing every bucket's value at the hovered date
        fig.update_layout(hovermode="x unified")
        fig.show()

# (printed heading, metric column, display name) for each trend report, in the
# order they are shown: footfall density, capture rate, dwell time
trend_reports = (
    ("### Footfall Density Trends by Sector & SRI Bucket", 'avg_footfall_per_gla', 'Avg Footfall / GLA'),
    ("### Capture Rate Trends by Sector & SRI Bucket", 'avg_capture_rate', 'Avg Capture Rate'),
    ("### Dwell Time Trends by Sector & SRI Bucket", 'avg_dwell_time', 'Avg Dwell Time'),
)

for heading, metric_column, metric_label in trend_reports:
    print(heading)
    plot_sector_trends(metric_column, metric_label)

### Footfall Density Trends by Sector & SRI Bucket


### Capture Rate Trends by Sector & SRI Bucket


### Dwell Time Trends by Sector & SRI Bucket


## 6. Stability Analysis
Do stores with higher SRI scores display more stable performance? We calculate the Coefficient of Variation (CV) for Footfall for each store and compare it against their SRI Score.

$$ CV = \frac{\sigma}{\mu} $$
_Lower CV indicates higher stability._

In [10]:
# Per (store, sector, SRI bucket): mean and standard deviation of footfall
# density, plus the mean SRI score
stability_keys = ['store_code', 'bl1_label', 'sri_bucket']
store_stability = (
    df_clean
    .groupby(stability_keys, observed=True)
    .agg(
        avg_footfall_per_gla=pd.NamedAgg(column='footfall_per_gla', aggfunc='mean'),
        std_footfall_per_gla=pd.NamedAgg(column='footfall_per_gla', aggfunc='std'),
        avg_sri_score=pd.NamedAgg(column='sri_score', aggfunc='mean'),
    )
    .reset_index()
)

# Coefficient of variation = std / mean (lower means more stable footfall)
footfall_std = store_stability['std_footfall_per_gla']
footfall_mean = store_stability['avg_footfall_per_gla']
store_stability['cv_footfall'] = footfall_std / footfall_mean

# Plot Stability vs SRI, coloured by sector with an OLS trendline
fig_stability = px.scatter(
    store_stability,
    x='avg_sri_score',
    y='cv_footfall',
    color='bl1_label',
    title='Stability Analysis: Footfall Volatility (CV) vs SRI Score',
    labels={'avg_sri_score': 'SRI Score', 'cv_footfall': 'Coefficient of Variation (Footfall)'},
    trendline='ols',
)
fig_stability.show()

# Correlation across all rows of `store_stability`, not split by sector
sri_vs_cv_corr = store_stability['avg_sri_score'].corr(store_stability['cv_footfall'])
print("Correlation between SRI and Footfall Volatility:", sri_vs_cv_corr)

Correlation between SRI and Footfall Volatility: -0.0054266016485828746


## 7. Sector Interaction Analysis
Does the impact of SRI score depend on the sector? We run an interaction regression to find out.

Model: `Capture_Rate ~ SRI_Score * Sector` (Focusing on Capture Rate as a key efficiency metric)

In [11]:
# Interaction model: `*` expands to both main effects plus the
# SRI-by-sector interaction terms, fitted on the unstandardized store averages.
interaction_formula = 'avg_capture_rate ~ avg_sri_score * C(bl1_label)'

print("--- OLS Regression with Interaction: SRI * Sector (on Capture Rate) ---")
model_interaction = smf.ols(interaction_formula, data=store_agg).fit()
interaction_summary = model_interaction.summary()
print(interaction_summary)

--- OLS Regression with Interaction: SRI * Sector (on Capture Rate) ---
                            OLS Regression Results                            
Dep. Variable:       avg_capture_rate   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.117
Method:                 Least Squares   F-statistic:                     1.730
Date:                Sun, 18 Jan 2026   Prob (F-statistic):              0.103
Time:                        18:55:11   Log-Likelihood:                 67.465
No. Observations:                  56   AIC:                            -112.9
Df Residuals:                      45   BIC:                            -90.65
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------