In [23]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt




In [24]:
# Smoking prevalence (percent) per survey year; labelled as Spain by the
# `_sp` suffix used for this series further down the notebook.
# Years are kept as strings here and converted to integers by the cell that
# builds the DataFrame from this mapping.
data_sp = dict([
    ('1978', 41.0),
    ('1987', 39.5),
    ('1993', 37.0),
    ('1997', 36.0),
    ('2001', 34.5),
    ('2006', 29.5),
    ('2009', 27.0),
    ('2011', 24.5),
    ('2014', 23.0),
    ('2017', 21.5),
    ('2020', 18.0),
])

In [25]:
# Load the Norway sheet and normalise it to two usable columns:
#   Year (int) and smokers_percentage.
# NOTE(review): the "Unnamed: N" labels look like pandas' auto-generated
# names for header-less columns — confirm against the workbook layout.
df_no = (
    pd.read_excel("data_washed/norway_smokers.xlsx")
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"Unnamed: 2": "Year", "Unnamed: 3": "smokers_percentage"})
    # Non-numeric year cells become NaN so they can be dropped next.
    .assign(Year=lambda frame: pd.to_numeric(frame["Year"], errors="coerce"))
    .dropna(subset=["Year", "smokers_percentage"])
    .reset_index(drop=True)
    # Safe only after the NaN rows are gone.
    .astype({"Year": int})
)

In [26]:
# Spain: turn the {year: percentage} mapping into a two-column frame
# (Year, smokers_percentage) with integer years.
df_sp = (
    pd.Series(data_sp, name='smokers_percentage')
    .rename_axis('Year')
    .reset_index()
)
df_sp = df_sp.assign(
    Year=pd.to_numeric(df_sp['Year'], errors='coerce').astype(int)
)

# Wide frame: one row per year present in BOTH countries. The shared value
# column is disambiguated as smokers_percentage_no / smokers_percentage_sp.
# Only the first six matching rows are kept.
df = pd.merge(
    df_no,
    df_sp,
    on='Year',
    how='inner',
    suffixes=('_no', '_sp'),
).head(6)
df

Unnamed: 0,Year,smokers_percentage_no,smokers_percentage_sp
0,1978,38.0,41.0
1,1987,37.0,39.5
2,1993,36.0,37.0
3,1997,34.0,36.0
4,2001,29.0,34.5
5,2006,23.0,29.5


In [27]:
# Order the observed rows chronologically before expanding them.
observed = df.sort_values('Year').reset_index(drop=True)

# One row per calendar year from the first to the last observed year (inclusive).
first_year = observed['Year'].min()
last_year = observed['Year'].max()
year_grid = pd.DataFrame({'Year': range(first_year, last_year + 1)})

# Left-join so years without a measurement appear as NaN rows ...
df = year_grid.merge(observed, on='Year', how='left')

# ... and fill those gaps by straight-line interpolation, per country.
for column in ('smokers_percentage_no', 'smokers_percentage_sp'):
    df[column] = df[column].interpolate(method='linear')

# Drop the last two rows. This trims the data used downstream; it is not
# just a preview of the frame.
df = df.head(-2)
df

Unnamed: 0,Year,smokers_percentage_no,smokers_percentage_sp
0,1978,38.0,41.0
1,1979,37.888889,40.833333
2,1980,37.777778,40.666667
3,1981,37.666667,40.5
4,1982,37.555556,40.333333
5,1983,37.444444,40.166667
6,1984,37.333333,40.0
7,1985,37.222222,39.833333
8,1986,37.111111,39.666667
9,1987,37.0,39.5


In [28]:
import pandas as pd
import statsmodels.formula.api as smf

# Long format: one row per (Year, Country) with the prevalence in `Smokers`.
country_columns = ['smokers_percentage_no', 'smokers_percentage_sp']
df_long = pd.melt(
    df,
    id_vars='Year',
    value_vars=country_columns,
    var_name='Country',
    value_name='Smokers',
)

# Dummy: 1 for the Norway series, 0 for the other series.
df_long['Norway'] = df_long['Country'].eq('smokers_percentage_no').astype(int)
# Centre the year on its mean so the intercept refers to the mid-period level.
df_long['Year_ct'] = df_long['Year'].sub(df_long['Year'].mean())

# OLS with a Year x Norway interaction: the `Year_ct:Norway` coefficient is the
# difference in yearly slope between the two series.
# NOTE(review): if `df` contains linearly interpolated yearly values, the rows
# are not independent observations, so the reported standard errors and
# p-values are likely too optimistic — confirm before interpreting them.
model = smf.ols('Smokers ~ Year_ct + Norway + Year_ct:Norway', data=df_long).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                Smokers   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.865
Method:                 Least Squares   F-statistic:                     114.3
Date:                Thu, 30 Oct 2025   Prob (F-statistic):           2.23e-22
Time:                        12:56:28   Log-Likelihood:                -88.859
No. Observations:                  54   AIC:                             185.7
Df Residuals:                      50   BIC:                             193.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         37.5556      0.251    149.