In [11]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

In [12]:
# Load the per-country hospital-case workbooks (Denmark first, then Norway).
df_den, df_no = (
    pd.read_excel(workbook)
    for workbook in ('denmark_final.xlsx', 'norway_final.xlsx')
)


In [13]:
# Column-wise totals over the numeric columns of the Danish table
# (non-numeric columns are excluded before summing).
den_numeric = df_den.select_dtypes(include=[np.number])
df_den_summed = den_numeric.sum(axis=0)
df_den_summed.head(15)

1999    16490
2000    15229
2001    17049
2002    14980
2003    16437
2004    15316
2005    16398
2006    15918
2007    16285
2008    13789
dtype: int64

In [14]:
# Column-wise totals over the numeric columns of the Norwegian table
# (non-numeric columns are excluded before summing).
no_numeric = df_no.select_dtypes(include=[np.number])
df_no_summed = no_numeric.sum(axis=0)
df_no_summed.head(15)

1999    18968
2000    16350
2001    18120
2002    16007
2003    16632
2004    16463
2005    17775
2006    17287
2007    16106
2008    15458
dtype: int64

In [15]:
def _country_total_row(yearly_totals, country):
    """Return a one-row DataFrame: 'Country' first, then one column per total.

    All column labels after 'Country' are forced to ``str`` so that the labels
    of the two countries can be compared with a plain set intersection.
    """
    row = pd.DataFrame([yearly_totals]).reset_index(drop=True)
    row.insert(0, 'Country', country)
    row.columns = ['Country'] + [str(label) for label in row.columns[1:]]
    return row


# One wide row of totals per country.
df_den_total = _country_total_row(df_den_summed, 'Denmark')
df_no_total = _country_total_row(df_no_summed, 'Norway')

# Keep only the columns that both countries report.
den_cols = set(df_den_total.columns).difference(['Country'])
no_cols = set(df_no_total.columns).difference(['Country'])
common_cols = sorted(den_cols & no_cols)

# Give both frames the identical column layout, then stack them.
aligned_layout = ['Country', *common_cols]
df_den_aligned = df_den_total[aligned_layout]
df_no_aligned = df_no_total[aligned_layout]

df_combined = pd.concat((df_den_aligned, df_no_aligned), ignore_index=True)
print("Combined (no NaN):")
print(df_combined)

# Wide -> long (one row per country and year), discarding incomplete rows.
df_long = pd.melt(
    df_combined,
    id_vars=['Country'],
    var_name='Year',
    value_name='Cases',
).dropna()

print("\nLong format data (after dropping NaN):")
print(df_long.head(20))

# Numeric year plus the three DiD indicators:
#   Post_2004 = 1 from 2004 onwards, Norway = 1 for the Norwegian rows,
#   Treated   = Norway * Post_2004 (the interaction term).
df_long = df_long.assign(
    Year=lambda d: pd.to_numeric(d['Year']),
    Post_2004=lambda d: (d['Year'] >= 2004).astype(int),
    Norway=lambda d: (d['Country'] == 'Norway').astype(int),
    Treated=lambda d: d['Norway'] * d['Post_2004'],
)

print("\nFinal data for DiD regression:")
print(df_long.head(20))

Combined (no NaN):
   Country   1999   2000   2001   2002   2003   2004   2005   2006   2007  \
0  Denmark  16490  15229  17049  14980  16437  15316  16398  15918  16285   
1   Norway  18968  16350  18120  16007  16632  16463  17775  17287  16106   

    2008  
0  13789  
1  15458  

Long format data (after dropping NaN):
    Country  Year  Cases
0   Denmark  1999  16490
1    Norway  1999  18968
2   Denmark  2000  15229
3    Norway  2000  16350
4   Denmark  2001  17049
5    Norway  2001  18120
6   Denmark  2002  14980
7    Norway  2002  16007
8   Denmark  2003  16437
9    Norway  2003  16632
10  Denmark  2004  15316
11   Norway  2004  16463
12  Denmark  2005  16398
13   Norway  2005  17775
14  Denmark  2006  15918
15   Norway  2006  17287
16  Denmark  2007  16285
17   Norway  2007  16106
18  Denmark  2008  13789
19   Norway  2008  15458

Final data for DiD regression:
    Country  Year  Cases  Post_2004  Norway  Treated
0   Denmark  1999  16490          0       0        0
1    Norway  

In [16]:
# Run Difference-in-Differences regression
# Model: Cases = α + β1*Norway + β2*Post2004 + β3*Norway*Post2004 + ε
# β3 (the coefficient on 'Treated') is the DiD estimate.
X = df_long[['Norway', 'Post_2004', 'Treated']].copy()
X = sm.add_constant(X)
y = df_long['Cases']

model = sm.OLS(y, X).fit()
print("\n" + "="*60)
print("DIFFERENCE-IN-DIFFERENCES RESULTS")
print("="*60)
print(model.summary())

# Extract key results
did_coefficient = model.params['Treated']
did_pvalue = model.pvalues['Treated']
did_std_err = model.bse['Treated']
did_significant = did_pvalue < 0.05

print("\n" + "="*60)
print("KEY FINDINGS:")
print("="*60)
print(f"DiD Estimate: {did_coefficient:.2f}")
print(f"Standard Error: {did_std_err:.2f}")
print(f"P-value: {did_pvalue:.4f}")
print(f"Significant at 5% level: {'Yes' if did_significant else 'No'}")

# Only state the effect as a finding when it is statistically significant;
# otherwise report the point estimate together with an explicit caveat so the
# printed interpretation does not contradict the significance line above.
effect_direction = "increased" if did_coefficient > 0 else "decreased"
if did_significant:
    print(f"\nInterpretation: The Norwegian policy {effect_direction} hospital cases")
    print(f"by {abs(did_coefficient):.2f} cases on average compared to Denmark.")
else:
    print(f"\nInterpretation: The point estimate suggests the Norwegian policy {effect_direction} hospital cases")
    print(f"by {abs(did_coefficient):.2f} cases on average compared to Denmark,")
    print("but this estimate is not statistically distinguishable from zero at the 5% level.")


def _year_span(frame):
    """Return 'first-last' for the Year column of ``frame`` (used in headings)."""
    return f"{frame['Year'].min()}-{frame['Year'].max()}"


def _mean_cases(frame, country):
    """Return the mean of 'Cases' over the rows of ``frame`` for ``country``."""
    return frame.loc[frame['Country'] == country, 'Cases'].mean()


# Additional analysis: compare group means before and after the cutoff.
# NOTE: this compares period averages only; it is not a test of parallel
# pre-treatment trends.
pre_treatment = df_long[df_long['Post_2004'] == 0]
pre_span = _year_span(pre_treatment)  # derived from the data, not hard-coded
print("\n" + "="*60)
print(f"PRE-TREATMENT COMPARISON ({pre_span}):")
print("="*60)
pre_den = _mean_cases(pre_treatment, 'Denmark')
pre_nor = _mean_cases(pre_treatment, 'Norway')
print(f"Denmark average ({pre_span}): {pre_den:.2f}")
print(f"Norway average ({pre_span}): {pre_nor:.2f}")
print(f"Difference: {pre_nor - pre_den:.2f}")

post_treatment = df_long[df_long['Post_2004'] == 1]
post_span = _year_span(post_treatment)
print(f"\nPOST-TREATMENT COMPARISON ({post_span}):")
print("="*60)
post_den = _mean_cases(post_treatment, 'Denmark')
post_nor = _mean_cases(post_treatment, 'Norway')
print(f"Denmark average ({post_span}): {post_den:.2f}")
print(f"Norway average ({post_span}): {post_nor:.2f}")
print(f"Difference: {post_nor - post_den:.2f}")

# Cross-check: with only the group, period and interaction dummies in the
# model, the change in the Norway-Denmark gap equals the 'Treated' coefficient.
print(f"\nDifference-in-differences of means: {(post_nor - post_den) - (pre_nor - pre_den):.2f}")


DIFFERENCE-IN-DIFFERENCES RESULTS
                            OLS Regression Results                            
Dep. Variable:                  Cases   R-squared:                       0.309
Model:                            OLS   Adj. R-squared:                  0.180
Method:                 Least Squares   F-statistic:                     2.387
Date:                Tue, 14 Oct 2025   Prob (F-statistic):              0.107
Time:                        20:51:44   Log-Likelihood:                -165.24
No. Observations:                  20   AIC:                             338.5
Df Residuals:                      16   BIC:                             342.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.604

In [17]:

# Load the smoking workbooks for both countries.
df_royk_no = pd.read_excel('røykere_filtered_no.xlsx')
df_royk_den = pd.read_excel('røykere_filtered_den.xlsx')

# Sum smoking data for both countries (column-wise totals of numeric columns)
df_royk_no_summed = df_royk_no.select_dtypes(include=[np.number]).sum()
df_royk_den_summed = df_royk_den.select_dtypes(include=[np.number]).sum()

print("Norway smoking summed data:")
print(df_royk_no_summed)
print("\nDenmark smoking summed data:")
print(df_royk_den_summed)

# Create one-row DataFrames for smoking data, labelled by country
df_smokers_no_total = pd.DataFrame([df_royk_no_summed]).reset_index(drop=True)
df_smokers_no_total.insert(0, 'Country', 'Norway')

df_smokers_den_total = pd.DataFrame([df_royk_den_summed]).reset_index(drop=True)
df_smokers_den_total.insert(0, 'Country', 'Denmark')

# Force column names to be strings so the labels of both countries compare equal
df_smokers_no_total.columns = ['Country'] + [str(col) for col in df_smokers_no_total.columns[1:]]
df_smokers_den_total.columns = ['Country'] + [str(col) for col in df_smokers_den_total.columns[1:]]

# Find common years for smoking data
no_smoking_cols = set(df_smokers_no_total.columns) - {'Country'}
den_smoking_cols = set(df_smokers_den_total.columns) - {'Country'}
common_smoking_cols = sorted(no_smoking_cols.intersection(den_smoking_cols))

# Align smoking data to the shared column layout
df_smokers_no_aligned = df_smokers_no_total[['Country'] + common_smoking_cols]
df_smokers_den_aligned = df_smokers_den_total[['Country'] + common_smoking_cols]

# Combine smoking data (Denmark first, then Norway)
df_smoking_combined = pd.concat([df_smokers_den_aligned, df_smokers_no_aligned], ignore_index=True)
print("\nCombined smoking data:")
print(df_smoking_combined)

# Convert to long format: one row per country and year
df_smoking_long = df_smoking_combined.melt(id_vars=['Country'], var_name='Year', value_name='Smokers')
df_smoking_long = df_smoking_long.dropna()
df_smoking_long['Year'] = pd.to_numeric(df_smoking_long['Year'])

print("\nSmoking data in long format:")
print(df_smoking_long.head(10))

# Merge hospital cases with smoking data.
# validate='one_to_one' makes pandas raise a MergeError if either side has a
# duplicated (Country, Year) key, instead of silently multiplying rows.
df_merged = pd.merge(
    df_long,
    df_smoking_long,
    on=['Country', 'Year'],
    how='inner',
    validate='one_to_one',
)

# An inner join drops hospital-case rows without a smoking match; surface that
# so the regression sample cannot shrink unnoticed.
n_unmatched = len(df_long) - len(df_merged)
if n_unmatched:
    print(f"\nWARNING: {n_unmatched} of {len(df_long)} hospital-case rows had no matching smoking data and were dropped.")

print("\nMerged hospital cases and smoking data:")
print(df_merged.head(15))

# Run DiD regression with smoking controls
# Model: Cases = α + β1*Norway + β2*Post2004 + β3*Treated + β4*Smokers + ε
# NOTE(review): if the policy under study affects smoking itself, 'Smokers' is a
# post-treatment variable and conditioning on it can absorb part of the policy
# effect — confirm that this control is intended.
X_controlled = df_merged[['Norway', 'Post_2004', 'Treated', 'Smokers']].copy()
X_controlled = sm.add_constant(X_controlled)
y_controlled = df_merged['Cases']


Norway smoking summed data:
1999    5104
2000    5208
2001    5216
2002    5485
2003    5288
2004    5243
2005    5225
2006    4968
2007    4997
2008    4637
dtype: int64

Denmark smoking summed data:
1999    2149
2000    2231
2001    1690
2002    1732
2003    1608
2004    1325
2005    1122
2006    1032
2007     908
2008     849
dtype: int64

Combined smoking data:
   Country  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008
0  Denmark  2149  2231  1690  1732  1608  1325  1122  1032   908   849
1   Norway  5104  5208  5216  5485  5288  5243  5225  4968  4997  4637

Smoking data in long format:
   Country  Year  Smokers
0  Denmark  1999     2149
1   Norway  1999     5104
2  Denmark  2000     2231
3   Norway  2000     5208
4  Denmark  2001     1690
5   Norway  2001     5216
6  Denmark  2002     1732
7   Norway  2002     5485
8  Denmark  2003     1608
9   Norway  2003     5288

Merged hospital cases and smoking data:
    Country  Year  Cases  Post_2004  Norway  Treated  Smokers
