In [66]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [54]:
# Load the tobacco-intervention spreadsheet into a DataFrame.
#
# With read_excel's default header handling, the real column names land in the
# first data row, so that row is promoted to the header and dropped from the data.

raw_df = pd.read_excel('./Datasets/tobacco_data.xlsx')

# .copy() gives `df` its own data, so columns added to it later are not written
# through a slice of `raw_df`.
df = raw_df.iloc[1:].copy()
# .tolist() discards the Series name (row label 0), which would otherwise become
# the name of the column index.
df.columns = raw_df.iloc[0].tolist()
# Columns that shared a cell with header text are held as `object`; let pandas
# recover numeric dtypes wherever the remaining values allow it.
df = df.infer_objects()
df.head()

Unnamed: 0,Intervention_descriptor,tax_increase,outlet_reduction,dec_smoking_prevalence,dec_tobacco_supply,dec_smoking_uptake,age,gender,ethnicity,discount_rate,evidence_strength,qalys_pc,hs_costs_pc
1,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,0-14,Male,non-Māori,0,,40.865526,-1284765.096725
2,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,15-24,Male,non-Māori,0,,41.708939,-1270055.987675
3,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,25-44,Male,non-Māori,0,,13.282615,-318700.524314
4,Combined tobacco endgame strategy (tobacco-fre...,10,90,1.0,0,0,45-64,Male,non-Māori,0,,7.222291,-119003.652181
5,Combined tobacco endgame strategy (tobacco-fre...,10,90,0.5,0,0,65+,Male,non-Māori,0,,1.111505,-9656.694651


In [55]:
# Transform data: encode the categorical columns as integer codes

# Age band -> integer code (bands numbered from youngest, 0, to oldest, 4)
age_group_mapping = {
    '0-14': 0,
    '15-24': 1,
    '25-44': 2,
    '45-64': 3,
    '65+': 4
}

# Gender -> integer code
gender_mapping = {
    'Male': 0,
    'Female': 1
}

# Ethnicity -> integer code
ethnicity_mapping = {
    'Māori': 0,
    'non-Māori': 1
}


def encode_labels(labels, mapping, column_name):
    """Translate `labels` through `mapping` and warn about labels it lacks.

    `Series.map` returns NaN for every value that is not a key of `mapping`,
    which would otherwise pass unnoticed into the feature matrix.

    Parameters
    ----------
    labels : pandas.Series
        Raw categorical values.
    mapping : dict
        Label -> integer code.
    column_name : str
        Name used in the warning message.

    Returns
    -------
    pandas.Series
        The mapped codes, NaN where no code exists (same result as
        `labels.map(mapping)`).
    """
    codes = labels.map(mapping)
    # Only rows that had a label but received no code; rows that were already
    # missing in the source are not reported here.
    uncoded = labels[codes.isna() & labels.notna()]
    if not uncoded.empty:
        warnings.warn(
            f"{column_name}: {len(uncoded)} row(s) have labels with no code "
            f"in the mapping and were set to NaN: "
            f"{sorted(uncoded.astype(str).unique())}",
            stacklevel=2,
        )
    return codes


# Apply the mappings to the 'age', 'gender' and 'ethnicity' columns.
# assign() builds a new frame rather than writing columns into `df` in place,
# so re-running this cell always starts from the same inputs.
df = df.assign(
    age_group=encode_labels(df['age'], age_group_mapping, 'age'),
    gender_idx=encode_labels(df['gender'], gender_mapping, 'gender'),
    ethnicity_idx=encode_labels(df['ethnicity'], ethnicity_mapping, 'ethnicity'),
)

# Display the updated DataFrame
df.head()

Unnamed: 0,Intervention_descriptor,tax_increase,outlet_reduction,dec_smoking_prevalence,dec_tobacco_supply,dec_smoking_uptake,age,gender,ethnicity,discount_rate,evidence_strength,qalys_pc,hs_costs_pc,age_group,gender_idx,ethnicity_idx
1,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,0-14,Male,non-Māori,0,,40.865526,-1284765.096725,0.0,0.0,1.0
2,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,15-24,Male,non-Māori,0,,41.708939,-1270055.987675,1.0,0.0,1.0
3,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,25-44,Male,non-Māori,0,,13.282615,-318700.524314,2.0,0.0,1.0
4,Combined tobacco endgame strategy (tobacco-fre...,10,90,1.0,0,0,45-64,Male,non-Māori,0,,7.222291,-119003.652181,3.0,0.0,1.0
5,Combined tobacco endgame strategy (tobacco-fre...,10,90,0.5,0,0,65+,Male,non-Māori,0,,1.111505,-9656.694651,4.0,0.0,1.0


In [59]:
# Build the model inputs: X holds the predictor columns, y the two targets
feature_columns = [
    'tax_increase',
    'outlet_reduction',
    'dec_smoking_prevalence',
    'dec_tobacco_supply',
    'dec_smoking_uptake',
    'age_group',
    'gender_idx',
    'ethnicity_idx',
]
target_columns = ['qalys_pc', 'hs_costs_pc']

X = np.array(df.loc[:, feature_columns])
y = np.array(df.loc[:, target_columns])

In [77]:
# Hold-out evaluation of a multi-output random forest.
TEST_FRACTION = 0.1  # share of rows held out for evaluation
RANDOM_SEED = 42     # used for both the split and the forest so reruns match

# train_test_split lives in sklearn.model_selection and must be imported in the
# imports cell; without that import this cell fails on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=RANDOM_SEED
)

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)

# Train the model on the training data (both target columns are fitted jointly)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Compute MSE for each output separately: 'raw_values' returns one error per
# target column, in the column order of y (qalys_pc first, hs_costs_pc second).
mse_qalys, mse_costs = mean_squared_error(y_test, y_pred, multioutput='raw_values')

print(f"Mean Squared Error for qalys_pc: {mse_qalys}")
print(f"Mean Squared Error for hs_costs_pc: {mse_costs}")

Mean Squared Error for qalys_pc: 105.73938767541945
Mean Squared Error for hs_costs_pc: 85035742860.28975
