## Panel Regression Analysis using PanelOLS
Here we conduct a panel regression analysis using the `PanelOLS` estimator from the `linearmodels` package. We examine the relationships between the different SDGs and Environmental Intensity (Sales).

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from linearmodels.panel import RandomEffects
from linearmodels.panel import PanelOLS

## Import the data (CSV)

In [4]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Sales_no_outliers_brewery_only.csv")
# Show (rows, columns) as a quick sanity check on the load.
print(df.shape)
# Preview the first rows; as the cell's last expression it is rendered as the cell output.
df.head()

(91, 37)


Unnamed: 0.1,Unnamed: 0,Year,Company Name,Country,GICS Sub-Industry,Industry (Exiobase),Environmental Intensity (Sales),Environmental Intensity (Op Inc),Revenue,Operating Income,...,SDG 6,SDG 12.2,SDG 14.1,SDG 14.2,SDG 14.3,SDG 14.c,SDG 15.1,SDG 15.2,SDG 15.5,% Imputed
0,516,2019,BUDWEISER BREWING CO,CAYMAN ISLANDS,Brewers,Manufacture of beverages,-7.019612,-37.638941,6582921000.0,1227706000.0,...,-248494000.0,-21847.771988,-140.450129,-28.052546,-25278.227923,-167.406661,1886.414782,1886.414782,-8773.42972,0.01749
1,517,2019,MOLSON COORS BEVERAGE CO,UNITED STATES OF AMERICA,Brewers,Manufacture of beverages,-6.596233,-46.522794,10579400000.0,1500000000.0,...,-397774500.0,-77571.792798,-1113.20836,-326.988191,-37024.568588,-1326.865949,15176.840942,15176.840942,-12261.768244,0.059202
2,519,2019,COMPANIA CERVECERIAS UNIDAS,CHILE,Brewers,Manufacture of beverages,-6.415653,-54.146661,2423979000.0,287209000.0,...,-94855710.0,-65916.007408,-104.142317,-467.109898,-5751.475432,-124.130306,-1430.679992,-55213.362037,-2224.738635,0.084882
3,523,2019,ANHEUSER-BUSCH INBEV,BELGIUM,Brewers,Manufacture of beverages,-3.388736,-11.492977,52484640000.0,15475240000.0,...,-190914100.0,-222519.248882,-3914.291085,-325.78914,-190769.424529,-4665.559245,56659.806086,56659.806086,-64862.799133,0.080199
4,526,2019,CARLSBERG A/S,DENMARK,Brewers,Manufacture of beverages,-2.414563,-16.054846,9890572000.0,1487489000.0,...,-39026790.0,-366663.271634,-1480.4494,-288.844325,-27951.8689,-1764.5914,19253.589023,19132.282417,-8148.334758,0.029599


## Helper Functions
### Constant variables
Constants holding the column names of the independent-variable sets and of the dependent variables, so that each name is defined in one place.

In [3]:
# --- Candidate regressor sets (dataframe column names) ---------------------

# Harvard targets.
harvard_set = [
    "Working Capacity",
    "Fish Production Capacity",
    "Crop Production Capacity",
    "Meat Production Capacity",
    "Biodiversity",
    "Abiotic Resources",
    "Water production capacity (Drinking water & Irrigation Water)",
    "Wood Production Capacity",
]

# Every SDG target column.
sdg_set = [
    "SDG 1.5",
    "SDG 2.1",
    "SDG 2.2",
    "SDG 2.3",
    "SDG 2.4",
    "SDG 3.3",
    "SDG 3.4",
    "SDG 3.9",
    "SDG 6",
    "SDG 12.2",
    "SDG 14.1",
    "SDG 14.2",
    "SDG 14.3",
    "SDG 14.c",
    "SDG 15.1",
    "SDG 15.2",
    "SDG 15.5",
]

# Shortlisted subset of the SDG target columns.
sdg_finalset = [
    "SDG 1.5",
    "SDG 2.1",
    "SDG 2.2",
    "SDG 3.3",
    "SDG 3.4",
    "SDG 6",
    "SDG 15.2",
]

# --- Dependent-variable column names ---------------------------------------
depVar_EISales = "Environmental Intensity (Sales)"
depVar_EIOpInc = "Environmental Intensity (Op Inc)"
depVar_Revenue = "Revenue"
depVar_OpInc = "Operating Income"

### getXYVal()
Gets the independent (X) and dependent (y) variables to be used by the regressions.

In [4]:
def getXYVal(selectedSet, dependent_variable, df_source):
    """Set up the independent (X) and dependent (y) variables used by the regressions.

    Args:
        selectedSet (list<string> | string): column name(s) of the independent
            variables, e.g. the harvard targets or SDG targets. A single column
            name given as a plain string is treated as a one-element list.
        dependent_variable (string): column name of the dependent variable, one of:
            revenue, operating income, Environmental Intensity (Sales), or
            Environmental Intensity (Op Inc)
        df_source (dataframe): Contains the dataframe used to extract from.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the selected columns of ``df_source``
        passed through ``sm.add_constant`` and ``y`` is the
        ``dependent_variable`` column of ``df_source``.
    """
    # Normalise a bare column name so X is always selected as a DataFrame,
    # never as a Series.
    columns = [selectedSet] if isinstance(selectedSet, str) else selectedSet

    X = sm.add_constant(df_source[columns])
    y = df_source[dependent_variable]

    return X, y

### runPanelOLS()
Runs the panel regression with the settings passed as parameters.

In [5]:
def runPanelOLS(selectedSet, dependent_variable, df_source):
    """Runs the panel regression with the settings passed as parameters and prints the result.

    Args:
        selectedSet (list<string>): contains either the harvard targets or SDG targets
        dependent_variable (string): contains either the following: revenue, operating income,
            Environmental Intensity (Sales), or Environmental Intensity (Op Inc)
        df_source (dataframe): Contains the dataframe used to extract from. It must hold
            'Company Name' and 'Year' as regular columns and is left unmodified.

    Returns:
        None. The fitted results are printed.
    """
    # Index the observations by (Company Name, Year). This is done on a new frame
    # instead of in place, so the caller's dataframe keeps its columns and the
    # function can be called repeatedly on the same frame.
    panel_df = df_source.set_index(['Company Name', 'Year'])

    X, y = getXYVal(selectedSet=selectedSet, dependent_variable=dependent_variable, df_source=panel_df)

    # The model is fitted on every observation; no hold-out split is made.
    # NOTE(review): no entity_effects/time_effects are requested and check_rank=False
    # disables the rank check on the regressors -- confirm both are intended.
    model = PanelOLS(y, X, check_rank=False)

    # Fit the model
    results = model.fit()

    # Print summary statistics
    print(results)

## Panel OLS
### All SDG

In [16]:
# Regress Environmental Intensity (Sales) on every column in sdg_set; a copy is passed so `df` itself stays untouched.
runPanelOLS(selectedSet=sdg_set, dependent_variable=depVar_EISales, df_source=df.copy())

                                 PanelOLS Estimation Summary                                 
Dep. Variable:     Environmental Intensity (Sales)   R-squared:                        0.9047
Estimator:                                PanelOLS   R-squared (Between):              0.9766
No. Observations:                               91   R-squared (Within):               0.3370
Date:                             Sat, Oct 07 2023   R-squared (Overall):              0.9047
Time:                                     17:17:13   Log-likelihood                   -65.114
Cov. Estimator:                         Unadjusted                                           
                                                     F-statistic:                      40.783
Entities:                                       12   P-value                           0.0000
Avg Obs:                                    7.5833   Distribution:                   F(17,73)
Min Obs:                                    1.0000          

### Selected SDG: "SDG 1.5", "SDG 2.1", "SDG 2.2", "SDG 3.3", "SDG 3.4", "SDG 6", "SDG 15.2"

In [17]:
# Same regression restricted to the shortlisted columns in sdg_finalset; a copy is passed so `df` itself stays untouched.
runPanelOLS(selectedSet=sdg_finalset, dependent_variable=depVar_EISales, df_source=df.copy())

                                 PanelOLS Estimation Summary                                 
Dep. Variable:     Environmental Intensity (Sales)   R-squared:                        0.7983
Estimator:                                PanelOLS   R-squared (Between):              0.9199
No. Observations:                               91   R-squared (Within):              -0.1198
Date:                             Sat, Oct 07 2023   R-squared (Overall):              0.7983
Time:                                     17:17:19   Log-likelihood                   -99.247
Cov. Estimator:                         Unadjusted                                           
                                                     F-statistic:                      46.928
Entities:                                       12   P-value                           0.0000
Avg Obs:                                    7.5833   Distribution:                    F(7,83)
Min Obs:                                    1.0000          

### Run once for each SDG: "SDG 1.5", "SDG 2.1", "SDG 2.2", "SDG 3.3", "SDG 3.4", "SDG 6", "SDG 15.2"

In [8]:
# One single-regressor model per shortlisted SDG. The column name is wrapped in a
# list because the helpers document `selectedSet` as a list of column names.
for sdg in sdg_finalset:
    runPanelOLS(selectedSet=[sdg], dependent_variable=depVar_EISales, df_source=df.copy())

                                 PanelOLS Estimation Summary                                 
Dep. Variable:     Environmental Intensity (Sales)   R-squared:                        0.0391
Estimator:                                PanelOLS   R-squared (Between):             -0.1131
No. Observations:                               91   R-squared (Within):               0.0457
Date:                             Sat, Oct 07 2023   R-squared (Overall):              0.0391
Time:                                     16:33:15   Log-likelihood                   -170.27
Cov. Estimator:                         Unadjusted                                           
                                                     F-statistic:                      3.6251
Entities:                                       12   P-value                           0.0601
Avg Obs:                                    7.5833   Distribution:                    F(1,89)
Min Obs:                                    1.0000          

### SDG with non-significant p-values: "SDG 1.5", "SDG 2.1", "SDG 2.2", "SDG 3.3"

In [9]:
# Joint model using only the four SDG columns that the section heading lists as non-significant; fitted on a copy of `df`.
runPanelOLS(selectedSet=["SDG 1.5", "SDG 2.1", "SDG 2.2", "SDG 3.3"], dependent_variable=depVar_EISales, df_source=df.copy())

                                 PanelOLS Estimation Summary                                 
Dep. Variable:     Environmental Intensity (Sales)   R-squared:                        0.1681
Estimator:                                PanelOLS   R-squared (Between):              0.1154
No. Observations:                               91   R-squared (Within):              -1.1444
Date:                             Sat, Oct 07 2023   R-squared (Overall):              0.1681
Time:                                     16:35:49   Log-likelihood                   -163.72
Cov. Estimator:                         Unadjusted                                           
                                                     F-statistic:                      4.3452
Entities:                                       12   P-value                           0.0030
Avg Obs:                                    7.5833   Distribution:                    F(4,86)
Min Obs:                                    1.0000          

### SDG with Significant p-values: "SDG 3.4", "SDG 6", "SDG 15.2"

In [12]:
# Joint model using only the three SDG columns that the section heading lists as significant; fitted on a copy of `df`.
runPanelOLS(selectedSet=["SDG 3.4", "SDG 6", "SDG 15.2"], dependent_variable=depVar_EISales, df_source=df.copy())

                                 PanelOLS Estimation Summary                                 
Dep. Variable:     Environmental Intensity (Sales)   R-squared:                        0.6463
Estimator:                                PanelOLS   R-squared (Between):              0.8015
No. Observations:                               91   R-squared (Within):              -0.3304
Date:                             Sat, Oct 07 2023   R-squared (Overall):              0.6463
Time:                                     17:16:17   Log-likelihood                   -124.80
Cov. Estimator:                         Unadjusted                                           
                                                     F-statistic:                      52.992
Entities:                                       12   P-value                           0.0000
Avg Obs:                                    7.5833   Distribution:                    F(3,87)
Min Obs:                                    1.0000          