In [108]:
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns
import numpy as np
import statsmodels.api as sm


In [4]:
# Read the four input CSV files (paths are relative to the working directory),
# in the same order as the names on the left-hand side.
reviews, findata_a, findata_b, crawler = (
    pd.read_csv(csv_path)
    for csv_path in (
        "reviews.csv",
        "findata_Total_Score_plus.csv",
        "findata_Reduced_Score.csv",
        "crawler.csv",
    )
)

In [25]:
# Derive an integer "year" column from the first four characters of "createdAt"
reviews['year'] = reviews["createdAt"].str[:4].astype(int)

# Keep only the rows whose year lies in 2014..2022 (both bounds included)
reviews_2014 = reviews.loc[reviews['year'].between(2014, 2022)]

In [116]:
# List the distinct country codes present in the filtered reviews
reviews_2014.loc[:, "common_countryCode"].unique()

array(['de', 'ch', 'at'], dtype=object)

In [147]:
# set inputs
# Pull the three score columns out once as NumPy arrays.
total_raw = reviews_2014["total_score"].values
salary_raw = reviews_2014["salary_score"].values
leadership_raw = reviews_2014["leadership_score"].values

# Dependent variable: mean-centred only (NaN-aware mean), not rescaled.
score = total_raw - np.nanmean(total_raw)
# Predictors: z-scored with NaN-aware mean and standard deviation.
salary = (salary_raw - np.nanmean(salary_raw)) / np.nanstd(salary_raw)
leadership = (leadership_raw - np.nanmean(leadership_raw)) / np.nanstd(leadership_raw)
# Year parsed from the first four characters of "createdAt".
year = pd.to_numeric(reviews_2014["createdAt"].str[:4]).values
# Country stays a pandas Series (it keeps the index of reviews_2014).
country = reviews_2014["common_countryCode"]

# Replace missing values by 0; since the variables are centred, this puts
# every missing observation at the variable's mean.
for _arr in (score, salary, leadership):
    _arr[np.isnan(_arr)] = 0


#### Pooled Regression: time as a continuous variable, with and without moderation

In [148]:
def PooledReg_cont_time_moderation(y, x, t, m):
    """
    Fit a pooled OLS regression with continuous time and a moderator.

    The fitted model is ``y ~ x + t + m + x:m + x:t`` (plus intercept): the
    main effects of ``x``, ``t`` and ``m`` and the interactions of ``x`` with
    the moderator and with time.

    Parameters:
    y (array-like): Dependent variable.
    x (array-like): Independent variable.
    t (array-like): Time variable, used as a continuous regressor.
    m (array-like): Moderator variable.

    All four inputs must be one-dimensional and of equal length; they are
    combined position by position.

    Returns:
    statsmodels.regression.linear_model.RegressionResultsWrapper: Regression results.
    """
    # Build the frame from the ARGUMENTS (not from notebook globals), so the
    # function fits whatever data the caller passes in. np.asarray drops any
    # pandas index so that the columns are matched by position.
    df = pd.DataFrame({
        'y': np.asarray(y),
        'x': np.asarray(x),
        't': np.asarray(t),
        'm': np.asarray(m),
    })

    # `x*m` and `x*t` already expand to the main effects plus the interaction;
    # the explicit main effects are redundant but harmless and keep the
    # coefficient order of the summary table unchanged.
    formula = 'y ~ x + t + m + x*m + x*t'

    # Fit the pooled OLS model
    model = sm.OLS.from_formula(formula, data=df).fit()

    return model

def PooledReg_cont_time(y, x, t):
    """
    Fit a pooled OLS regression with continuous time and no moderator.

    The fitted model is ``y ~ x + t + x:t`` (plus intercept).

    Parameters:
    y (array-like): Dependent variable.
    x (array-like): Independent variable.
    t (array-like): Time variable, used as a continuous regressor.

    All three inputs must be one-dimensional and of equal length; they are
    combined position by position.

    Returns:
    statsmodels.regression.linear_model.RegressionResultsWrapper: Regression results.
    """
    # Build the frame from the ARGUMENTS (not from notebook globals), so the
    # function fits whatever data the caller passes in. np.asarray drops any
    # pandas index so that the columns are matched by position.
    df = pd.DataFrame({
        'y': np.asarray(y),
        'x': np.asarray(x),
        't': np.asarray(t),
    })

    # Main effects of x and t plus their interaction; `x*t` alone would expand
    # to the same terms, the explicit form keeps the summary order unchanged.
    formula = 'y ~ x + t + x*t'

    # Fit the pooled OLS model
    model = sm.OLS.from_formula(formula, data=df).fit()

    return model


In [149]:
# Pass the prepared inputs explicitly: `y`, `x`, `t` and `m` are not defined
# anywhere in this notebook, so the previous call only worked with leftover
# kernel state and fails on Restart & Run All.
print(PooledReg_cont_time_moderation(score, salary, year, leadership).summary())
print(PooledReg_cont_time(score, salary, year).summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.221
Model:                            OLS   Adj. R-squared:                  0.221
Method:                 Least Squares   F-statistic:                     1001.
Date:                Tue, 26 Sep 2023   Prob (F-statistic):               0.00
Time:                        01:11:05   Log-Likelihood:                -14982.
No. Observations:               17604   AIC:                         2.998e+04
Df Residuals:                   17598   BIC:                         3.002e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.4402      3.876     -0.630      0.5

#### Panel Regression: time as a categorical variable, with and without moderation

In [152]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Assemble the modelling frame from the prepared inputs
data = pd.DataFrame(
    {
        'score': score,
        'salary': salary,
        'year': year,
        'leadership': leadership,
        'country': country,
    }
)

# Treat "year" as a categorical variable instead of a number
data['year'] = data['year'].astype('category')

# Append one indicator column per country (prefix "country_") to the frame
country_dummies = pd.get_dummies(data['country'], prefix='country')
data = pd.concat([data, country_dummies], axis=1)

# Regression formula: main effects, year effects, the salary x leadership
# interaction, its country-specific variants, and the country indicators.
# The first indicator column is left out of the formula.
base_terms = [
    'salary',
    'leadership',
    'C(year)',
    'salary:leadership',
    'salary:country',
    'leadership:country',
    'salary:leadership:country',
]
formula = 'score ~ ' + ' + '.join(base_terms) + ' + ' + ' + '.join(country_dummies.columns[1:])

# Set up the OLS model from the formula, then fit it
model = sm.OLS.from_formula(formula, data=data)
results = model.fit()

# Print regression results
summary = results.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.229
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     274.3
Date:                Tue, 26 Sep 2023   Prob (F-statistic):               0.00
Time:                        01:12:12   Log-Likelihood:                -14901.
No. Observations:               17604   AIC:                         2.984e+04
Df Residuals:                   17584   BIC:                         3.000e+04
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

Panel Regression 3: 