In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [85]:
# Read Excel file and save as DataFrame

df = pd.read_excel('./Datasets/tobacco_data.xlsx')
df.columns = df.iloc[0]
df = df[1:]
df.head()

Unnamed: 0,Intervention_descriptor,tax_increase,outlet_reduction,dec_smoking_prevalence,dec_tobacco_supply,dec_smoking_uptake,age,gender,ethnicity,discount_rate,evidence_strength,qalys_pc,hs_costs_pc
1,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,0-14,Male,non-Māori,0,,40.865526,-1284765.096725
2,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,15-24,Male,non-Māori,0,,41.708939,-1270055.987675
3,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,25-44,Male,non-Māori,0,,13.282615,-318700.524314
4,Combined tobacco endgame strategy (tobacco-fre...,10,90,1.0,0,0,45-64,Male,non-Māori,0,,7.222291,-119003.652181
5,Combined tobacco endgame strategy (tobacco-fre...,10,90,0.5,0,0,65+,Male,non-Māori,0,,1.111505,-9656.694651


In [86]:
# Transform data

# Map age group to integer
avg_age_mapping = {
    '0-14': 7,
    '15-24': 20,
    '25-44': 33,
    '45-64': 55,
    '65+': 75
}

# Map gender to integer
gender_mapping = {
    'Male': 0,
    'Female': 1
}

# Map ethnicity to integer
ethnicity_mapping = {
    'Māori': 0,
    'non-Māori': 1
}

# Apply the mapping to the 'Age_Group' column
df['average_age'] = df['age'].map(avg_age_mapping)
df['gender_idx'] = df['gender'].map(gender_mapping)
df['ethnicity_idx'] = df['ethnicity'].map(ethnicity_mapping)

# Impute missing values in 'average_age' with the mean
df['average_age'] = df['average_age'].fillna(df['average_age'].mean())

# Impute missing values in 'gender_idx' and 'ethnicity_idx' with the mode
df['gender_idx'] = df['gender_idx'].fillna(df['gender_idx'].mode()[0])
df['ethnicity_idx'] = df['ethnicity_idx'].fillna(df['ethnicity_idx'].mode()[0])

# Convert the specified columns to floats
df[['tax_increase', 'outlet_reduction', 'dec_smoking_prevalence', 
    'dec_tobacco_supply', 'dec_smoking_uptake', 'qalys_pc']] = df[['tax_increase', 'outlet_reduction', 
    'dec_smoking_prevalence', 'dec_tobacco_supply', 'dec_smoking_uptake', 'qalys_pc']].apply(pd.to_numeric, errors='coerce').astype('float')

# Columns to be used for model building
df_vape = df[['tax_increase', 'outlet_reduction', 'dec_smoking_prevalence', 
              'dec_tobacco_supply', 'dec_smoking_uptake', 'average_age', 
              'gender_idx', 'ethnicity_idx', 'qalys_pc']]

# Display updated DataFrame
df_vape.head()

Unnamed: 0,tax_increase,outlet_reduction,dec_smoking_prevalence,dec_tobacco_supply,dec_smoking_uptake,average_age,gender_idx,ethnicity_idx,qalys_pc
1,10.0,90.0,7.0,0.0,0.0,7.0,0.0,1.0,40.865526
2,10.0,90.0,7.0,0.0,0.0,20.0,0.0,1.0,41.708939
3,10.0,90.0,7.0,0.0,0.0,33.0,0.0,1.0,13.282615
4,10.0,90.0,1.0,0.0,0.0,55.0,0.0,1.0,7.222291
5,10.0,90.0,0.5,0.0,0.0,75.0,0.0,1.0,1.111505


In [87]:
def display_pca(df):
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_vape)
    
    # Apply PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    
    # Check the explained variance ratio for the first two components
    explained_variance = pca.explained_variance_ratio_
    print(f"Explained variance by component 1: {explained_variance[0]:.2f}")
    print(f"Explained variance by component 2: {explained_variance[1]:.2f}")
    
    # Create a scatter plot of the PCA results
    plt.figure(figsize=(8, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c='blue', edgecolor='k', s=50)
    
    # Add labels and title
    plt.xlabel(f"Principal Component 1 ({explained_variance[0]:.2f} variance)")
    plt.ylabel(f"Principal Component 2 ({explained_variance[1]:.2f} variance)")
    plt.title("PCA of Dataset")
    plt.grid(True)
    
    # Show plot
    plt.show()

In [88]:
X = df_vape[['tax_increase', 'outlet_reduction', 'dec_smoking_prevalence', 
              'dec_tobacco_supply', 'dec_smoking_uptake', 'average_age', 
              'gender_idx', 'ethnicity_idx']]
y = df_vape[['qalys_pc']]

In [89]:
# Function to generate synthetic samples
def generate_synthetic_samples(X, y, n_samples):
    nn = NearestNeighbors(n_neighbors=5)
    nn.fit(X)

    synthetic_X = []
    synthetic_y = []
    for _ in range(n_samples):
        idx = np.random.randint(0, len(X))
        neighbors = nn.kneighbors([X[idx]], return_distance=False)[0]
        
        neighbor_idx = np.random.choice(neighbors)
        lam = np.random.uniform(0, 1)
        
        # Generate synthetic sample using interpolation
        new_sample_X = X[idx] + lam * (X[neighbor_idx] - X[idx])
        new_sample_y = y[idx] + lam * (y[neighbor_idx] - y[idx])
        
        synthetic_X.append(new_sample_X)
        synthetic_y.append(new_sample_y)
    
    return np.array(synthetic_X), np.array(synthetic_y)

# Ensure that y is a 1D array for compatibility
y_train_flat = y_train.values.flatten()

# Generate synthetic samples
X_train_res, y_train_res = generate_synthetic_samples(X_train.values, y_train_flat, n_samples=200)

# Stack the original and synthetic data
X_train_full = np.vstack([X_train.values, X_train_res])
y_train_full = np.hstack([y_train_flat, y_train_res])

# Convert to DataFrame for easier handling
df_resampled = pd.DataFrame(X_train_full, columns=X_train.columns)
df_resampled['qalys_pc'] = y_train_full

# Display the resampled dataframe
df_resampled

Unnamed: 0,tax_increase,outlet_reduction,dec_smoking_prevalence,dec_tobacco_supply,dec_smoking_uptake,average_age,gender_idx,ethnicity_idx,qalys_pc
0,10.000000,0.000000,4.504993,0.0,9.629837,26.521638,0.501664,0.498336,26.896993
1,10.000000,90.000000,7.000000,0.0,0.000000,9.695313,1.000000,1.000000,35.885192
2,20.000000,95.000000,3.300000,0.0,0.000000,33.000000,0.000000,1.000000,21.178164
3,9.155406,90.810810,0.983108,0.0,0.147804,55.000000,0.000000,1.000000,6.842780
4,11.383916,90.691958,2.179909,0.0,0.000000,55.000000,0.000000,0.138392,15.311533
...,...,...,...,...,...,...,...,...,...
369,10.000000,90.000000,12.154066,0.0,0.000000,20.000000,0.468551,0.531449,94.877571
370,11.762335,90.881167,2.797582,0.0,0.000000,75.000000,0.000000,0.000000,1.881320
371,10.000000,0.000000,1.947280,0.0,3.283180,56.054408,0.052720,0.000000,12.229829
372,10.000000,0.000000,3.052111,0.0,14.032724,20.225814,0.000000,0.982630,14.447476


In [90]:
X = df_resampled[['tax_increase', 'outlet_reduction', 'dec_smoking_prevalence', 
              'dec_tobacco_supply', 'dec_smoking_uptake', 'average_age', 
              'gender_idx', 'ethnicity_idx']]
y = df_resampled[['qalys_pc']]

# Split the data into train and test sets (optional)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

In [94]:
# Define the RandomForestRegressor model with bootstrap disabled
rf_model_no_bootstrap = RandomForestRegressor(random_state=42, bootstrap=False)

# Define the parameter grid to search over
param_grid_no_bootstrap = {
    'n_estimators': [100, 200, 300],    # Number of trees in the forest
    'max_depth': [3, 5, 10],            # Maximum depth of the tree
    'min_samples_leaf': [1, 2, 4],      # Minimum number of samples required to be at a leaf node
}

# Define the MAPE scorer (using Mean Absolute Percentage Error)
mape_scorer = make_scorer(mape, greater_is_better=False)

# Setup GridSearchCV to perform cross-validation
grid_search_rf_no_bootstrap = GridSearchCV(estimator=rf_model_no_bootstrap, param_grid=param_grid_no_bootstrap, 
                                           scoring=mape_scorer, cv=5, verbose=1, n_jobs=-1)

# Fit the grid search to the duplicated training data
grid_search_rf_no_bootstrap.fit(X_train, y_train)

# Best hyperparameters from grid search
print("Best Parameters for Random Forest (No Bootstrap):", grid_search_rf_no_bootstrap.best_params_)

# Best MAPE score from cross-validation
print("Best MAPE for Random Forest (No Bootstrap):", -grid_search_rf_no_bootstrap.best_score_)

# Train a final model using the best parameters
best_rf_model_no_bootstrap = grid_search_rf_no_bootstrap.best_estimator_

# Evaluate on the test set
y_pred_rf_no_bootstrap = best_rf_model_no_bootstrap.predict(X_test)

# Calculate the test MAPE
test_mape_rf_no_bootstrap = mape(y_test, y_pred_rf_no_bootstrap)
print("Test MAPE for Random Forest (No Bootstrap):", test_mape_rf_no_bootstrap)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


  return fit_method(estimator, *args, **kwargs)


Best Parameters for Random Forest (No Bootstrap): {'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 200}
Best MAPE for Random Forest (No Bootstrap): 0.19283427063816608
Test MAPE for Random Forest (No Bootstrap): 0.22185960119448372


In [95]:
# Define the XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],   # Number of trees
    'max_depth': [5, 10, 20],            # Depth of the trees
    'min_child_weight': [1, 5, 10],     # Minimum sum of instance weight (hessian)
    'reg_lambda': [0.01, 0.1, 1, 10],  # L2 regularization term (lambda)
    'reg_alpha': [0.01, 0.1, 1, 10],      # L1 regularization term (alpha)
}

# Define the MAPE scorer (as we are optimizing based on Mean Absolute Percentage Error)
mape_scorer = make_scorer(mape, greater_is_better=False)

# Setup GridSearchCV to perform cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           scoring=mape_scorer, cv=5, verbose=1, n_jobs=-1)

# Fit the grid search to the duplicated training data
grid_search.fit(X_train, y_train)

# Best hyperparameters from grid search
print("Best Parameters:", grid_search.best_params_)

# Best MAPE score from cross-validation
print("Best MAPE:", -grid_search.best_score_)

# Train a final model using the best parameters
best_xgb_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_xgb_model.predict(X_test)

# Calculate the test MAPE
test_mape = mape(y_test, y_pred)
print("Test MAPE:", test_mape)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Parameters: {'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.01, 'reg_lambda': 1}
Best MAPE: 0.13512638298595708
Test MAPE: 0.31217081811742936


In [96]:
# Define the RandomForestRegressor model
rf_model = RandomForestRegressor(random_state=42, bootstrap=True)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],    # Number of trees in the forest
    'max_depth': [5, 10, 20],            # Maximum depth of the tree
    'min_samples_leaf': [1, 5, 10],      # Minimum number of samples required to be at a leaf node
    'max_samples': [0.5, 0.7, 1.0],     # Maximum number of samples to draw from the data with replacement
}

# Define the MAPE scorer (using Mean Absolute Percentage Error)
mape_scorer = make_scorer(mape, greater_is_better=False)

# Setup GridSearchCV to perform cross-validation
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                              scoring=mape_scorer, cv=5, verbose=1, n_jobs=-1)

# Fit the grid search to the duplicated training data
grid_search_rf.fit(X_train, y_train)

# Best hyperparameters from grid search
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)

# Best MAPE score from cross-validation
print("Best MAPE for Random Forest:", -grid_search_rf.best_score_)

# Train a final model using the best parameters
best_rf_model = grid_search_rf.best_estimator_

# Evaluate on the test set
y_pred_rf = best_rf_model.predict(X_test)

# Calculate the test MAPE
test_mape_rf = mape(y_test, y_pred_rf)
print("Test MAPE for Random Forest:", test_mape_rf)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


  return fit_method(estimator, *args, **kwargs)


Best Parameters for Random Forest: {'max_depth': 20, 'max_samples': 1.0, 'min_samples_leaf': 1, 'n_estimators': 300}
Best MAPE for Random Forest: 0.2684251263913668
Test MAPE for Random Forest: 0.29079338071442434
