In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import gzip
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the raw dataset from disk, failing fast with a clear error when the
# CSV file is not where the script expects it.
data_path = 'data_new.csv'
data_file_found = os.path.exists(data_path)
if not data_file_found:
    raise FileNotFoundError(f"The data file {data_path} does not exist.")
df = pd.read_csv(data_path)
df.head()  # preview call; its return value is not used by the script

# Each age-band column is paired with one numeric age used to represent the
# whole band; the list of band columns is derived from this mapping.
age_mapping = {
    'dem_age_band_18-24_tm1': 21,
    'dem_age_band_25-34_tm1': 30,
    'dem_age_band_35-44_tm1': 40,
    'dem_age_band_45-54_tm1': 50,
    'dem_age_band_55-64_tm1': 60,
    'dem_age_band_65-74_tm1': 70,
    'dem_age_band_75+_tm1': 80,
}
age_band_columns = list(age_mapping)

# Build a single numeric 'age' column: start at 0 and add, for every band
# column present in the data, the band's value times its representative age.
df['age'] = 0
present_age_bands = [band for band in age_band_columns if band in df.columns]
for band in present_age_bands:
    df['age'] = df['age'] + df[band] * age_mapping[band]

# Handle missing values by filling each numeric column with its own median.
# Non-numeric columns are left untouched by this step.
df.fillna(df.median(numeric_only=True), inplace=True)

# Encode the categorical variable 'race' as integer labels when it holds text.
# The check uses pandas' dtype predicates instead of comparing against the
# literal 'object', so columns stored with a dedicated string dtype are
# encoded as well; every column matched by the old comparison still matches.
if 'race' in df.columns and (
    pd.api.types.is_object_dtype(df['race'])
    or pd.api.types.is_string_dtype(df['race'])
):
    df['race'] = LabelEncoder().fit_transform(df['race'])

# Define biomarker columns and create a consolidated binary 'biomarkers'
# feature from (at most) ten of them.
biomarker_columns = [
    'cre_min-low_tm1', 'cre_min-high_tm1', 'cre_min-normal_tm1',
    'cre_mean-low_tm1', 'cre_mean-high_tm1', 'cre_mean-normal_tm1',
    'cre_max-low_tm1', 'cre_max-high_tm1', 'cre_max-normal_tm1',
    'crp_min-low_tm1', 'crp_min-high_tm1', 'crp_min-normal_tm1',
    'crp_mean-low_tm1', 'crp_mean-high_tm1', 'crp_mean-normal_tm1',
    'crp_max-low_tm1', 'crp_max-high_tm1', 'crp_max-normal_tm1'
    # ... (add additional biomarker columns as needed)
]

# Restrict the selection to columns that actually exist in the dataset.
# Indexing with the full list would raise KeyError on the first absent column,
# which made the "no valid columns" fallback below unreachable.
present_biomarker_columns = [col for col in biomarker_columns if col in df.columns]

# Drop columns that contain no data at all.
non_empty_biomarker_columns = df[present_biomarker_columns].dropna(axis=1, how='all')

# Use every remaining column when fewer than ten are left; otherwise take a
# reproducible sample of ten (fixed random_state).
if len(non_empty_biomarker_columns.columns) < 10:
    random_10_biomarker_columns = non_empty_biomarker_columns.columns.tolist()
    print(f"Only {len(random_10_biomarker_columns)} biomarker columns available. Using all.")
else:
    random_10_biomarker_columns = non_empty_biomarker_columns.sample(n=10, axis=1, random_state=42).columns.tolist()
valid_biomarker_columns = [col for col in random_10_biomarker_columns if col in df.columns]

if valid_biomarker_columns:
    # A row is flagged 1 when any selected column whose name contains
    # 'normal' holds a value greater than 0. Computed column-wise instead of
    # with a per-row Python lambda; NaN compares as False, as before.
    normal_biomarker_columns = [col for col in valid_biomarker_columns if 'normal' in col]
    df['biomarkers'] = (df[normal_biomarker_columns] > 0).any(axis=1).astype(int)
else:
    df['biomarkers'] = 0
    print("No valid biomarker columns found. Setting 'biomarkers' to 0.")

# Define comorbidity columns and create a consolidated binary 'comorbidity'
# feature from (at most) ten of them.
comorbidity_columns = [
    'alcohol_elixhauser_tm1', 'anemia_elixhauser_tm1', 'arrhythmia_elixhauser_tm1',
    'arthritis_elixhauser_tm1', 'bloodlossanemia_elixhauser_tm1', 'coagulopathy_elixhauser_tm1',
    'compdiabetes_elixhauser_tm1', 'depression_elixhauser_tm1', 'drugabuse_elixhauser_tm1',
    'electrolytes_elixhauser_tm1', 'hypertension_elixhauser_tm1'
    # ... (add additional comorbidity columns as needed)
]

# Restrict the selection to columns that actually exist in the dataset.
# Indexing with the full list would raise KeyError on the first absent column,
# which made the "no valid columns" fallback below unreachable.
present_comorbidity_columns = [col for col in comorbidity_columns if col in df.columns]

# Drop columns that contain no data at all.
non_empty_comorbidity_columns = df[present_comorbidity_columns].dropna(axis=1, how='all')

# Use every remaining column when fewer than ten are left; otherwise take a
# reproducible sample of ten (fixed random_state).
if len(non_empty_comorbidity_columns.columns) < 10:
    random_10_comorbidity_columns = non_empty_comorbidity_columns.columns.tolist()
    print(f"Only {len(random_10_comorbidity_columns)} comorbidity columns available. Using all.")
else:
    random_10_comorbidity_columns = non_empty_comorbidity_columns.sample(n=10, axis=1, random_state=42).columns.tolist()
valid_comorbidity_columns = [col for col in random_10_comorbidity_columns if col in df.columns]

if valid_comorbidity_columns:
    # A row is flagged 1 when any selected comorbidity column holds a value
    # greater than 0. Computed column-wise instead of with a per-row Python
    # lambda; NaN compares as False, as before.
    df['comorbidity'] = (df[valid_comorbidity_columns] > 0).any(axis=1).astype(int)
else:
    df['comorbidity'] = 0
    print("No valid comorbidity columns found. Setting 'comorbidity' to 0.")

# Model inputs (features) and the outputs to predict (targets)
features = [
    'age', 'dem_female', 'race', 'biomarkers', 'comorbidity',
    'lasix_dose_count_tm1', 'cre_tests_tm1', 'crp_tests_tm1', 'esr_tests_tm1',
    'ghba1c_tests_tm1', 'hct_tests_tm1', 'ldl_tests_tm1', 'nt_bnp_tests_tm1',
    'sodium_tests_tm1', 'trig_tests_tm1'
]
target_columns = ['risk_score_t', 'cost_t', 'bps_mean_t', 'gagne_sum_t', 'ldl_mean_t']

# Compare both lists against the columns actually present in the dataset.
available_columns = set(df.columns)

# Absent features are only reported here ...
missing_features = [name for name in features if name not in available_columns]
if len(missing_features) > 0:
    print(f"Missing features from dataset: {missing_features}")

# ... whereas absent targets abort the script immediately.
missing_targets = [name for name in target_columns if name not in available_columns]
if len(missing_targets) > 0:
    raise KeyError(f"Missing target columns: {missing_targets}")

# Split, scale, train, evaluate and persist the model, but only when every
# required feature and target column is available.
if missing_features or missing_targets:
    print("Cannot proceed with model training due to missing features or target columns.")
else:
    # Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
    X = df[features]
    y = df[target_columns]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Persist the scaler as a plain pickle, then store a gzip copy of that file
    with open('standard_scaler.pkl', 'wb') as scaler_out:
        pickle.dump(scaler, scaler_out)
    with open('standard_scaler.pkl', 'rb') as scaler_in:
        scaler_bytes = scaler_in.read()
    with gzip.open('standard_scaler.pkl.gz', 'wb') as scaler_gz:
        scaler_gz.write(scaler_bytes)

    # One independent linear regression is fitted per target column
    model = MultiOutputRegressor(LinearRegression())
    model.fit(X_train_scaled, y_train)

    # Report test-set error, R-squared and coefficients for each target
    y_pred = model.predict(X_test_scaled)
    for position, (target_name, estimator) in enumerate(zip(target_columns, model.estimators_)):
        actual = y_test.iloc[:, position]
        predicted = y_pred[:, position]
        mse = mean_squared_error(actual, predicted)
        r2 = r2_score(actual, predicted)
        print(f"--- {target_name} ---")
        print(f"Mean Squared Error: {mse}")
        print(f"R-squared: {r2}")
        # Coefficients of the fitted linear model, ordered by absolute size,
        # serve as a proxy for feature importance
        coef_table = pd.DataFrame({
            'Feature': features,
            'Coefficient': estimator.coef_
        })
        coef_table = coef_table.sort_values(by='Coefficient', key=abs, ascending=False)
        print(f"Coefficients for '{target_name}':")
        print(coef_table)
        print("\n")

    # Persist the model as a plain pickle, then store a gzip copy of that file
    with open('multi_output_linear_model.pkl', 'wb') as model_out:
        pickle.dump(model, model_out)
    with open('multi_output_linear_model.pkl', 'rb') as model_in:
        model_bytes = model_in.read()
    with gzip.open('multi_output_linear_model_compressed.pkl.gz', 'wb') as model_gz:
        model_gz.write(model_bytes)
    print("Model training and evaluation completed successfully.")


--- risk_score_t ---
Mean Squared Error: 19.052080359758705
R-squared: 0.32714601843142577
Coefficients for 'risk_score_t':
                 Feature  Coefficient
4            comorbidity     1.096917
10         hct_tests_tm1     1.027510
0                    age     0.830940
12      nt_bnp_tests_tm1     0.569897
6          cre_tests_tm1     0.480123
13      sodium_tests_tm1     0.467044
8          esr_tests_tm1     0.453609
9       ghba1c_tests_tm1     0.419449
14        trig_tests_tm1    -0.352903
5   lasix_dose_count_tm1     0.325556
3             biomarkers    -0.176155
11         ldl_tests_tm1     0.036626
2                   race    -0.015352
7          crp_tests_tm1     0.009187
1             dem_female     0.000409


--- cost_t ---
Mean Squared Error: 291989351.3257087
R-squared: 0.09701699405394748
Coefficients for 'cost_t':
                 Feature  Coefficient
10         hct_tests_tm1  2908.027564
6          cre_tests_tm1  2580.747284
13      sodium_tests_tm1 -1793.881620
8  

 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
