In [1]:
import numpy as np
import pandas as pd

In [2]:
# The first row returned by read_excel holds the real column names, so it is
# promoted to the header and only the rows beneath it are kept.
tobacco_sheet = pd.read_excel('./Datasets/tobacco_data.xlsx')
df = tobacco_sheet.iloc[1:].set_axis(tobacco_sheet.iloc[0], axis=1)
df.head()

Unnamed: 0,Intervention_descriptor,tax_increase,outlet_reduction,dec_smoking_prevalence,dec_tobacco_supply,dec_smoking_uptake,age,gender,ethnicity,discount_rate,evidence_strength,qalys_pc,hs_costs_pc
1,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,0-14,Male,non-Māori,0,,40.865526,-1284765.096725
2,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,15-24,Male,non-Māori,0,,41.708939,-1270055.987675
3,Combined tobacco endgame strategy (tobacco-fre...,10,90,7.0,0,0,25-44,Male,non-Māori,0,,13.282615,-318700.524314
4,Combined tobacco endgame strategy (tobacco-fre...,10,90,1.0,0,0,45-64,Male,non-Māori,0,,7.222291,-119003.652181
5,Combined tobacco endgame strategy (tobacco-fre...,10,90,0.5,0,0,65+,Male,non-Māori,0,,1.111505,-9656.694651


In [25]:
# Mean decrease in smoking prevalence per ethnicity group. The column is cast
# to float after filtering, then averaged.
is_non_maori = df['ethnicity'] == 'non-Māori'
is_maori = df['ethnicity'] == 'Māori'

non_maori_sp = df.loc[is_non_maori, 'dec_smoking_prevalence'].astype('float').mean()
maori_sp = df.loc[is_maori, 'dec_smoking_prevalence'].astype('float').mean()

# Ratio of the Māori mean to the non-Māori mean
maori_sp / non_maori_sp

np.float64(2.7297645739910315)

In [27]:
# Mean decrease in smoking prevalence for each gender label, keyed by label.
prevalence_by_gender = {
    gender_label: (
        df.loc[df['gender'] == gender_label, 'dec_smoking_prevalence']
        .astype('float')
        .mean()
    )
    for gender_label in ('Female', 'Male')
}
female_sp = prevalence_by_gender['Female']
male_sp = prevalence_by_gender['Male']

# Ratio of the male mean to the female mean
male_sp / female_sp

np.float64(1.0127886881382562)

In [29]:
# Mean decrease in smoking prevalence for each age band, in band order.
age_bands = ('0-14', '15-24', '25-44', '45-64', '65+')
(
    age_group_1_sp,
    age_group_2_sp,
    age_group_3_sp,
    age_group_4_sp,
    age_group_5_sp,
) = (
    df.loc[df['age'] == band, 'dec_smoking_prevalence'].astype('float').mean()
    for band in age_bands
)

# Each band's mean relative to the 25-44 band (age_group_3_sp)
for band_mean in (age_group_1_sp, age_group_2_sp, age_group_4_sp, age_group_5_sp):
    print(band_mean / age_group_3_sp)

0.9786476868327403
1.0086647067925112
0.5610913404507712
0.3594306049822065


In [7]:
def build_lr(X_train, X_test, y_train, y_test):
    """Fit a linear regression and report its MAPE on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to ``fit``.
    X_test, y_test
        Held-out features and targets used to score the fitted model.

    Returns
    -------
    tuple
        ``(model, lr_mape)``: the fitted estimator and its test-set MAPE.
    """
    # fit() returns the estimator itself, so construction and fitting chain
    model = LinearRegression().fit(X_train, y_train)

    # Score the held-out predictions against the true targets
    lr_mape = mape(y_test, model.predict(X_test))
    print("Test MAPE for Linear Regression Model:", lr_mape)

    return model, lr_mape

In [9]:
def prepare_df(path='./Datasets/tobacco_data_v2.xlsx'):
    """Load the tobacco workbook and build the frame used for modelling.

    The first row returned by ``pd.read_excel`` is promoted to the column
    header. The categorical ``age``, ``gender`` and ``ethnicity`` columns are
    encoded as numbers, gaps left by unmapped labels are imputed, and the
    numeric feature columns are coerced to float.

    Parameters
    ----------
    path : str, optional
        Location of the Excel workbook. Defaults to the path this notebook
        has always read, so existing ``prepare_df()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``tax_increase``, ``outlet_reduction``,
        ``dec_smoking_prevalence``, ``dec_tobacco_supply``,
        ``dec_smoking_uptake``, ``average_age``, ``gender_idx``,
        ``ethnicity_idx``, ``qalys_pc`` and ``hs_costs_pc``.
    """
    raw = pd.read_excel(path)

    # Work on an explicit copy of the data rows: new columns are assigned
    # below, and assigning onto a bare row slice triggers pandas'
    # SettingWithCopyWarning.
    df = raw.iloc[1:].copy()
    df.columns = raw.iloc[0]

    # Representative age (in years) used for each age band
    avg_age_mapping = {
        '0-14': 7,
        '15-24': 20,
        '25-44': 33,
        '45-64': 55,
        '65+': 75
    }

    # Map gender to integer
    gender_mapping = {
        'Male': 0,
        'Female': 1
    }

    # Map ethnicity to integer
    ethnicity_mapping = {
        'Māori': 0,
        'non-Māori': 1
    }

    # Encode the 'age', 'gender' and 'ethnicity' columns; labels missing from
    # a mapping become NaN and are imputed just below.
    df['average_age'] = df['age'].map(avg_age_mapping)
    df['gender_idx'] = df['gender'].map(gender_mapping)
    df['ethnicity_idx'] = df['ethnicity'].map(ethnicity_mapping)

    # Impute missing values in 'average_age' with the mean
    df['average_age'] = df['average_age'].fillna(df['average_age'].mean())

    # Impute missing values in 'gender_idx' and 'ethnicity_idx' with the mode
    df['gender_idx'] = df['gender_idx'].fillna(df['gender_idx'].mode()[0])
    df['ethnicity_idx'] = df['ethnicity_idx'].fillna(df['ethnicity_idx'].mode()[0])

    # Coerce these columns to float; values that cannot be parsed become NaN.
    # NOTE(review): 'hs_costs_pc' is returned below but is not in this list,
    # so it keeps whatever dtype the workbook produced - confirm that is
    # intended.
    numeric_cols = ['tax_increase', 'outlet_reduction', 'dec_smoking_prevalence',
                    'dec_tobacco_supply', 'dec_smoking_uptake', 'qalys_pc']
    df[numeric_cols] = (
        df[numeric_cols].apply(pd.to_numeric, errors='coerce').astype('float')
    )

    # Columns to be used for model building
    df_vape = df[['tax_increase', 'outlet_reduction', 'dec_smoking_prevalence',
                  'dec_tobacco_supply', 'dec_smoking_uptake', 'average_age',
                  'gender_idx', 'ethnicity_idx', 'qalys_pc', 'hs_costs_pc']]
    return df_vape

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error as mape

df = prepare_df()

# Predictor columns fed to the regression
columns = [
    'tax_increase',
    'outlet_reduction',
    'dec_smoking_prevalence',
    'dec_tobacco_supply',
    'dec_smoking_uptake',
    'average_age',
    'gender_idx',
    'ethnicity_idx',
]

# Features, and the single-column target frame
X, y = df[columns], df[['qalys_pc']]

# Hold out a quarter of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# The training target is handed to the model as a 1D array
y_flat = y_train.to_numpy().ravel()

lr_model, lr_test_mape = build_lr(X_train, X_test, y_flat, y_test)

Test MAPE for Linear Regression Model: 2.3327569526580225


In [25]:
# NOTE(review): no visible cell defines `df_vape` at notebook level (the only
# assignment is a local variable inside prepare_df), so this cell presumably
# relies on leftover kernel state - confirm where `df_vape` comes from and
# that it holds the same feature columns the model was fitted on.
lr_predictions = lr_model.predict(df_vape)
lr_predictions

array([ 42.31504658,  41.12024592,  39.92544525,  18.51595231,
         7.86528718,  45.74392857,  44.5491279 ,  43.35432724,
        21.9448343 ,  11.29416916, 126.3156118 , 125.12081113,
       123.92601047,  68.96603201,  43.08231341, 129.74449378,
       128.54969312, 127.35489245,  72.39491399,  46.5111954 ])