In [1]:
import pandas as pd
import numpy as np
import os
from functools import reduce

In [None]:
# Overall steps:

# 1. Get all tables into the same format (Rows: Area, Columns: Feature by year 2010-2023)

# 2. Limit to ~10 most usable categories at first
# 3. Drop Åland, impute missing years/values
# 4. Apply simple ML algorithm (linear regression) to predict average age based on factors like:
# 5. Age, sex, daily smoking, binge drinking, obesity, physical activity, happiness, disposable income, access to healthcare, mental strain, etc.

### Data Loading

In [2]:
data_dir = "regional_data"
subdirs = ["thl", "tilastokeskus"]

#### Life Expectancy

In [3]:
# Load the data
life_expectancy = pd.read_csv(data_dir + '/thl/life_expectancy.csv', sep=';', index_col=False)

# columns value and value2 are always equal
life_expectancy = life_expectancy.drop(columns=['indicator', 'indicator_id', 'region_id', 'value2'])
life_expectancy_by_region_and_sex = life_expectancy.groupby(['region', 'sex'])['value'].mean().reset_index()

# Male, female and combined life expectancy per region
print(life_expectancy_by_region_and_sex.head(15))

                  region       sex      value
0        Central Finland  combined  80.900000
1        Central Finland    female  83.857143
2        Central Finland      male  77.978571
3   Central Ostrobothnia  combined  81.785714
4   Central Ostrobothnia    female  84.428571
5   Central Ostrobothnia      male  79.128571
6                 Kainuu  combined  79.764286
7                 Kainuu    female  83.321429
8                 Kainuu      male  76.478571
9             Kanta-Häme  combined  81.035714
10            Kanta-Häme    female  83.935714
11            Kanta-Häme      male  78.121429
12           Kymenlaakso  combined  79.950000
13           Kymenlaakso    female  83.257143
14           Kymenlaakso      male  76.750000


In [4]:
# List for storing the dataframes and the feature columns
frames = []
features = []

for subdir in os.listdir(data_dir):
    if subdir in subdirs:
        path = data_dir + "/" + subdir
        
        for file in os.listdir(path):
            filename = os.fsdecode(file)
            filepath = path + "/" + filename
            print(f"Processing file: {filename} from: {path}")
            df = pd.read_csv(filepath, sep=';', index_col=False)
            
            if subdir == "thl":
                # Get the name of the measured feature
                name = filename.split(".")[0]
                features.append(name)

                # Get a brief description
                description = df["indicator"][0]

                # Drop unnecessary columns
                df = df.drop(columns=['indicator', 'indicator_id', 'region_id', 'sex'])

                # Rename value column to the name of what was measured
                df = df.rename(columns={"value": name})

                # Group the data by region
                df_mean_by_region = df.groupby(['region'])[name].mean().reset_index()

                frames.append(df_mean_by_region)

                print(f"Data description: {description}\n")
            print(f"{df.head()}\n")
            print(f"================================\n")

# Combine all the dataframes
df = reduce(lambda x, y: pd.merge(x, y, on = 'region'), frames)

Processing file: alcohol_sales.csv from: regional_data/thl
Data description: Sale of alcoholic beverages per capita, as litres of pure alcohol

            region  year  alcohol_sales
0  Central Finland  2010            8.1
1  Central Finland  2011            8.1
2  Central Finland  2012            7.7
3  Central Finland  2013            7.5
4  Central Finland  2014            7.3


Processing file: binge_drinking.csv from: regional_data/thl
Data description: Persons who engage in binge drinking (AUDIT-1k) (%)

            region  year  binge_drinking
0  Central Finland  2013            11.5
1  Central Finland  2014            12.1
2  Central Finland  2015            11.8
3  Central Finland  2018             9.5
4  Central Finland  2020            10.1


Processing file: culture_promotion.csv from: regional_data/thl
Data description: Promotion of culture in municipalities - TEA, score

                 region  year  culture_promotion
0       Central Finland  2019               56.0
1  

  df = pd.read_csv(filepath, sep=';', index_col=False)


In [5]:
df

Unnamed: 0,region,alcohol_sales,binge_drinking,culture_promotion,daily_smokers,disability_ratio,elder_care_per_100k,fees_hampered_care,health_workers_per_10k,incidence_disability_pension,insufficient_medical_services,life_expectancy,mental_health,obesity_rate,overcrowded_living,percentage_happy,physical_activity,regular_sports_events,severe_mental_strain,work_until_retired
0,Central Finland,7.133333,10.242857,62.0,10.742857,7.933333,2675.5,28.05,77.091667,5.1,20.55,80.911905,128.67,21.028571,7.82,52.771429,22.366667,71.5,12.357143,26.985714
1,Central Ostrobothnia,6.373333,8.828571,68.333333,13.128571,7.96,3075.855556,27.625,108.225,5.286667,18.15,81.780952,119.53,22.442857,8.766667,51.042857,24.225,64.3,11.971429,26.542857
2,Kainuu,8.12,10.128571,63.0,13.242857,11.1,4083.455556,34.875,95.68125,6.96,24.975,79.854762,122.46,22.942857,6.66,51.857143,23.55,74.0,12.542857,31.971429
3,Kanta-Häme,7.406667,8.628571,61.333333,13.171429,7.4,2448.977778,27.325,69.025,5.28,17.025,81.030952,100.18,23.842857,7.833333,51.557143,27.05,79.4,13.485714,28.928571
4,Kymenlaakso,7.733333,10.957143,73.0,14.185714,9.153333,2843.455556,30.825,69.860417,6.013333,20.525,79.985714,108.85,22.8,6.26,49.585714,27.133333,96.7,12.971429,31.685714
5,Lapland,10.006667,9.957143,67.666667,15.028571,9.886667,3314.666667,31.275,89.920833,6.613333,20.025,80.411905,121.46,22.442857,7.606667,52.057143,22.7,64.0,12.871429,29.728571
6,North Karelia,7.04,8.728571,54.666667,12.557143,9.64,3244.611111,32.95,86.38125,5.693333,27.225,80.183333,137.69,22.471429,7.78,50.757143,24.9,78.6,13.285714,30.571429
7,North Ostrobothnia,6.62,10.8,67.0,11.928571,9.026667,2861.066667,29.325,98.4875,6.34,21.5,81.297619,138.98,19.657143,8.56,51.2,22.0,69.1,12.971429,25.714286
8,North Savo,7.6,9.928571,68.666667,12.528571,10.566667,3094.122222,27.875,110.533333,6.926667,18.45,80.328571,155.04,22.3,7.153333,51.071429,23.566667,79.9,12.585714,29.442857
9,Ostrobothnia,5.446667,8.142857,56.0,9.842857,6.066667,2317.266667,26.15,94.26875,4.113333,21.525,82.771429,86.86,20.8,8.38,45.871429,22.833333,66.3,12.971429,21.842857


### Linear correlations

In [6]:
target_variable = "life_expectancy"

corr = df.corr(numeric_only=True)[target_variable].sort_values(ascending=False)
print(corr)

life_expectancy                  1.000000
overcrowded_living               0.732492
health_workers_per_10k           0.260693
severe_mental_strain             0.174415
physical_activity               -0.050711
culture_promotion               -0.197724
obesity_rate                    -0.276744
regular_sports_events           -0.308571
insufficient_medical_services   -0.324062
binge_drinking                  -0.372236
mental_health                   -0.398135
elder_care_per_100k             -0.530975
percentage_happy                -0.577439
incidence_disability_pension    -0.644652
fees_hampered_care              -0.659537
daily_smokers                   -0.667430
disability_ratio                -0.725938
alcohol_sales                   -0.754990
work_until_retired              -0.801696
Name: life_expectancy, dtype: float64


### Input features and target, Train-test split

In [37]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

y = df[target_variable]
X = df[features].drop(columns=[target_variable])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
X

Unnamed: 0,alcohol_sales,binge_drinking,culture_promotion,daily_smokers,disability_ratio,elder_care_per_100k,fees_hampered_care,health_workers_per_10k,incidence_disability_pension,insufficient_medical_services,mental_health,obesity_rate,overcrowded_living,percentage_happy,physical_activity,regular_sports_events,severe_mental_strain,work_until_retired
0,7.133333,10.242857,62.0,10.742857,7.933333,2675.5,28.05,77.091667,5.1,20.55,128.67,21.028571,7.82,52.771429,22.366667,71.5,12.357143,26.985714
1,6.373333,8.828571,68.333333,13.128571,7.96,3075.855556,27.625,108.225,5.286667,18.15,119.53,22.442857,8.766667,51.042857,24.225,64.3,11.971429,26.542857
2,8.12,10.128571,63.0,13.242857,11.1,4083.455556,34.875,95.68125,6.96,24.975,122.46,22.942857,6.66,51.857143,23.55,74.0,12.542857,31.971429
3,7.406667,8.628571,61.333333,13.171429,7.4,2448.977778,27.325,69.025,5.28,17.025,100.18,23.842857,7.833333,51.557143,27.05,79.4,13.485714,28.928571
4,7.733333,10.957143,73.0,14.185714,9.153333,2843.455556,30.825,69.860417,6.013333,20.525,108.85,22.8,6.26,49.585714,27.133333,96.7,12.971429,31.685714
5,10.006667,9.957143,67.666667,15.028571,9.886667,3314.666667,31.275,89.920833,6.613333,20.025,121.46,22.442857,7.606667,52.057143,22.7,64.0,12.871429,29.728571
6,7.04,8.728571,54.666667,12.557143,9.64,3244.611111,32.95,86.38125,5.693333,27.225,137.69,22.471429,7.78,50.757143,24.9,78.6,13.285714,30.571429
7,6.62,10.8,67.0,11.928571,9.026667,2861.066667,29.325,98.4875,6.34,21.5,138.98,19.657143,8.56,51.2,22.0,69.1,12.971429,25.714286
8,7.6,9.928571,68.666667,12.528571,10.566667,3094.122222,27.875,110.533333,6.926667,18.45,155.04,22.3,7.153333,51.071429,23.566667,79.9,12.585714,29.442857
9,5.446667,8.142857,56.0,9.842857,6.066667,2317.266667,26.15,94.26875,4.113333,21.525,86.86,20.8,8.38,45.871429,22.833333,66.3,12.971429,21.842857


### Testing Random Forest Regression

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Get feature importances
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))

# Test diagnostics
y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


daily_smokers                    0.260431
overcrowded_living               0.194921
work_until_retired               0.115179
alcohol_sales                    0.088044
fees_hampered_care               0.059095
disability_ratio                 0.052974
elder_care_per_100k              0.043013
insufficient_medical_services    0.042656
percentage_happy                 0.031575
culture_promotion                0.027660
incidence_disability_pension     0.018444
binge_drinking                   0.015904
mental_health                    0.015331
regular_sports_events            0.011683
health_workers_per_10k           0.011425
physical_activity                0.004888
severe_mental_strain             0.004769
obesity_rate                     0.002008
dtype: float64
MAE: 0.2698035714285787
R²: 0.5311258876022363


### Testing a simple neural network

In [41]:
# Hyperparameters
input_dim = X_train.shape[1] # number of neurons in the input layer
n_neurons = 32      # number of neurons in the first hidden layer
epochs = 150 

# Input layer
model = keras.Sequential([
      layers.Dense(n_neurons, input_dim=input_dim, kernel_initializer='normal', activation='relu'),
      layers.Dense(64, activation='relu'),
      layers.Dense(64, activation='relu'),
      layers.Dense(1, kernel_initializer='normal')
  ])

# Compile and train the model
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001))
model.fit(X_train, y_train, epochs=epochs, verbose=0)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x295addc4a00>

### Testing predictions

In [48]:
test_data = pd.DataFrame([{
    "alcohol_sales": 5,
    "binge_drinking": 9,
    "culture_promotion": 62,
    "daily_smokers": 15,
    "disability_ratio": 10,
    "elder_care_per_100k": 2675,
    "fees_hampered_care": 28,
    "health_workers_per_10k": 110,
    "incidence_disability_pension": 5,
    "insufficient_medical_services": 18,
    "mental_health": 128,
    "obesity_rate": 22,
    "overcrowded_living": 7,
    "percentage_happy": 60,
    "physical_activity": 22,
    "regular_sports_events": 75,
    "severe_mental_strain": 12,
    "work_until_retired": 27,
}])

test_prediction = model.predict(test_data)
test_prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


array([[76.89434]], dtype=float32)