In [108]:
import pandas as pd
import statsmodels.api as sm

In [109]:
data_no= {
    2000: 44.9,
    2001: 42.84, # Interpolated
    2002: 40.78, # Interpolated
    2003: 38.72, # Interpolated
    2001: 36.66, # Interpolated
    2005: 34.6,
    2006: 32.9, # Interpolated
    2007: 31.2,
    2008: 29.67, # Interpolated
    2009: 28.13, # Interpolated
    2010: 26.6,
    # 2011: 25.38, # Interpolated
    # 2012: 24.16, # Interpolated
    # 2013: 22.94, # Interpolated
    # 2014: 21.72, # Interpolated
    # 2015: 20.5,
    # 2016: 19.37, # Interpolated
    # 2017: 18.23, # Interpolated
    # 2018: 17.1,
    
}

In [110]:
# data_aus = {
#     2000: 52.5,
#     2001: 50.88, # Interpolated
#     2002: 49.26, # Interpolated
#     2003: 47.64, # Interpolated
#     2004: 46.02, # Interpolated
#     2005: 44.4,
#     2006: 42.85, # Interpolated
#     2007: 41.3,
#     2008: 39.93, # Interpolated
#     2009: 38.57, # Interpolated
#     2010: 37.2,
#     2011: 36.06, # Interpolated
#     2012: 34.92, # Interpolated
#     2013: 33.78, # Interpolated
#     2014: 32.64, # Interpolated
#     2015: 31.5,
#     2016: 30.03, # Interpolated
#     2017: 28.57, # Interpolated
#     2018: 27.1,
    
# }

In [111]:
data_aus = {
    2000: 35.0,
    2001: 34.66, # Interpolated
    2002: 34.32, # Interpolated
    2003: 33.98, # Interpolated
    2001: 33.64, # Interpolated
    2005: 33.3,
    2006: 33.05, # Interpolated
    2007: 32.8,
    2008: 32.43, # Interpolated
    2009: 32.07, # Interpolated
    2010: 31.7,
    # 2011: 31.42, # Interpolated
    # 2012: 31.14, # Interpolated
    # 2013: 30.86, # Interpolated
    # 2014: 30.58, # Interpolated
    # 2015: 30.3,
    # 2016: 29.57, # Interpolated
    # 2017: 28.83, # Interpolated
    # 2018: 28.1,
    # 2019: 28.1,
    # 2020: 28.9,
    # 2021: 28.7,
    # 2022: 28.4
}

In [112]:
df_no = pd.Series(data_no).rename_axis('Year').reset_index(name='smokers_percentage')
df_aus = pd.Series(data_aus).rename_axis('Year').reset_index(name='smokers_percentage')

# ensure numeric types
df_no['Year'] = pd.to_numeric(df_no['Year'])
df_no['smokers_percentage'] = pd.to_numeric(df_no['smokers_percentage'])

df_aus['Year'] = pd.to_numeric(df_aus['Year'])
df_aus['smokers_percentage'] = pd.to_numeric(df_aus['smokers_percentage'])

In [113]:
# ...existing code...
# merge Norway and Australia series into one tidy dataframe
df = pd.merge(df_no, df_aus, on='Year', how='inner', suffixes=('_no', '_aus'))

# ensure numeric and drop rows with missing values
df['smokers_percentage_no'] = pd.to_numeric(df['smokers_percentage_no'], errors='coerce')
df['smokers_percentage_aus'] = pd.to_numeric(df['smokers_percentage_aus'], errors='coerce')
df = df.dropna(subset=['Year', 'smokers_percentage_no', 'smokers_percentage_aus']).reset_index(drop=True)

df
# ...existing code...

Unnamed: 0,Year,smokers_percentage_no,smokers_percentage_aus
0,2000,44.9,35.0
1,2001,36.66,33.64
2,2002,40.78,34.32
3,2003,38.72,33.98
4,2005,34.6,33.3
5,2006,32.9,33.05
6,2007,31.2,32.8
7,2008,29.67,32.43
8,2009,28.13,32.07
9,2010,26.6,31.7


In [114]:
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [117]:
# ...existing code...
# Build long panel and run DiD (Norway treated in 2004, Australia control)
df_no_long = df[['Year', 'smokers_percentage_no']].rename(
    columns={'smokers_percentage_no': 'smokers_percentage'}
).assign(Country='Norway')

df_aus_long = df[['Year', 'smokers_percentage_aus']].rename(
    columns={'smokers_percentage_aus': 'smokers_percentage'}
).assign(Country='Australia')

df_long = pd.concat([df_no_long, df_aus_long], ignore_index=True)

# clean types and drop missing
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')
df_long['smokers_percentage'] = pd.to_numeric(df_long['smokers_percentage'], errors='coerce')
df_long = df_long.dropna(subset=['Year', 'smokers_percentage']).reset_index(drop=True)

# DiD variables
df_long['Norway'] = (df_long['Country'] == 'Norway').astype(int)
df_long['Post2004'] = (df_long['Year'] >= 2004).astype(int)
df_long['Treated'] = df_long['Norway'] * df_long['Post2004']

# OLS DiD (robust SE)
X = df_long[['Norway', 'Post2004', 'Treated']]
X = sm.add_constant(X)
y = df_long['smokers_percentage']

model = sm.OLS(y, X).fit(cov_type='HC1')
print(model.summary())

did_coef = model.params['Treated']
did_p = model.pvalues['Treated']
print(f"\nDiD estimate (Treated): {did_coef:.4f}, p-value: {did_p:.4f}")
# ...existing code...

                            OLS Regression Results                            
Dep. Variable:     smokers_percentage   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.692
Method:                 Least Squares   F-statistic:                     13.79
Date:                Wed, 15 Oct 2025   Prob (F-statistic):           0.000106
Time:                        14:46:37   Log-Likelihood:                -42.796
No. Observations:                  20   AIC:                             93.59
Df Residuals:                      16   BIC:                             97.57
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         34.2350      0.281    121.785      0.0