In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
!pip install scikit-learn==1.5.0
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import tree
from sklearn import linear_model
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.cross_decomposition import PLSRegression, PLSCanonical, CCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
!pip show scikit-learn
import pickle

Collecting scikit-learn==1.5.0
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikit-learn-1.5.0


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Name: scikit-learn
Version: 1.5.0
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bigframes, fastai, imbalanced-learn, librosa, mlxtend, sklearn-pandas, yellowbrick


In [3]:
# Data loading and preprocessing

filepath = '/content/drive/My Drive/CSV/final_results.csv'
data_df = pd.read_csv(filepath)
#remove useless data
data_df.drop(columns=["NCT Numbers", 'Top 5 Side Effects Female', 'Top 5 Side Effects Male', 'Most relevant studies'], inplace=True)

pd.set_option('display.max_columns',None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Do not wrap columns
print(data_df)

             Drug             Indication  Total Female Reports  Total Male Reports  Percentage Female Reports  Percentage Male Reports  Percentage Serious Female Reports  Percentage Serious Male Reports  Num Studies  Total females in studies  Total males in studies  Female proportion in studies  Male proportion in studies  Number of participants in most relevant studies  Number of female participants in most relevant studies  Number of male participants in most relevant studies  Proportion of females in most relevant studies  Proportion of males in most relevant studies  Prevalence Men  Prevalence Women  Prevalence Both Genders
0     CLOPIDOGREL  MYOCARDIAL INFARCTION                   178                 318                      35.89                    64.11                              96.63                            88.05         64.0                    6132.0                 17162.0                         26.32                       73.68                                         

In [9]:
# data normalization

def divide_by_100(x):
    return x/100

divide_transformer = FunctionTransformer(divide_by_100)
# data normalization
preprocessor = ColumnTransformer(
    transformers=[
        # OneHotEncode 'Indication'
        ('categorical', OneHotEncoder(), ['Indication']),

        # Scale percentages (assuming they are already in the range [0, 1])
        ('percentages', divide_transformer, [
            'Percentage Female Reports',
            'Percentage Male Reports',
            'Female proportion in studies',
            'Male proportion in studies',
            'Proportion of females in most relevant studies',
            'Proportion of males in most relevant studies'
        ]),

        # Scale 'Prevalence Men' and 'Prevalence Women' using RobustScaler
        # ('prevalence', RobustScaler(), ['Prevalence Men', 'Prevalence Women', 'Prevalence Both Genders']),

        # Scale 'Num Studies' using MinMaxScaler
        ('num', MinMaxScaler(), ['Num Studies', 'Total females in studies','Total males in studies', 'Number of participants in most relevant studies','Number of female participants in most relevant studies', 'Number of male participants in most relevant studies'])
    ]
)

transformed_data = preprocessor.fit_transform(data_df)

# Get the transformed column names
transformed_columns = (
    preprocessor.transformers_[0][1].get_feature_names_out(['Indication']).tolist() +
    [
        'Percentage Female Reports',
        'Percentage Male Reports',
        'Total females in studies',
        'Total males in studies',
        'Female proportion in studies',
        'Male proportion in studies',
        'Proportion of females in most relevant studies',
        'Proportion of males in most relevant studies',
        # 'Prevalence Men',
        # 'Prevalence Women',
        # 'Prevalence Both Genders',
        'Num Studies',
        'Number of participants in most relevant studies',
        'Number of female participants in most relevant studies',
        'Number of male participants in most relevant studies'
    ]
)


# Convert the transformed data to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=transformed_columns)
print(transformed_df)

     Indication_ASTHMA  Indication_CHRONIC OBSTRUCTIVE PULMONARY DISEASE  Indication_DEMENTIA  Indication_DIABETES  Indication_EPILEPSY  Indication_HEART FAILURE  Indication_HYPERTENSION  Indication_ISCHEMIC HEART DISEASE  Indication_MULTIPLE SCLEROSIS  Indication_MYOCARDIAL INFARCTION  Indication_PARKINSON'S  Indication_SCHIZOPHRENIA  Indication_STROKE  Percentage Female Reports  Percentage Male Reports  Total females in studies  Total males in studies  Female proportion in studies  Male proportion in studies  Proportion of females in most relevant studies  Proportion of males in most relevant studies  Num Studies  Number of participants in most relevant studies  Number of female participants in most relevant studies  Number of male participants in most relevant studies
0                  0.0                                               0.0                  0.0                  0.0                  0.0                       0.0                      0.0                                

In [10]:
# define inputs and outputs
# exclude_columns = [
#     'Percentage Women Reports',
#     'Percentage Men Reports',
#     'Proportion of females in most relevant studies',
#     'Proportion of males in most relevant studies',
#     'Female proportion in studies',
#     'Male proportion in studies',
#     'Prevalence Men',
#     'Prevalence Women',
#     'Prevalence Both Genders',
# ]
# X = transformed_df.drop(columns=exclude_columns)
X = transformed_df[[
            'Indication_ASTHMA',
             'Indication_CHRONIC OBSTRUCTIVE PULMONARY DISEASE',
             'Indication_DEMENTIA',
             'Indication_DIABETES',
             'Indication_EPILEPSY',
             'Indication_HEART FAILURE',
             'Indication_HYPERTENSION',
             'Indication_ISCHEMIC HEART DISEASE',
             'Indication_MULTIPLE SCLEROSIS',
             'Indication_MYOCARDIAL INFARCTION',
             "Indication_PARKINSON'S",
             'Indication_SCHIZOPHRENIA',
             'Indication_STROKE',
            'Total females in studies',
            'Total males in studies',
            'Num Studies',
            'Number of participants in most relevant studies',
            'Number of female participants in most relevant studies',
            'Number of male participants in most relevant studies',
            ]]

y = transformed_df["Percentage Female Reports"]

# divide into test, train, val
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# define model
model = RandomForestRegressor(random_state=42)
# model = tree.DecisionTreeRegressor(random_state=42)
# model = linear_model.LinearRegression()
# model = Ridge(random_state=42)
# model = Lasso(random_state=42)
# model = lgb.LGBMRegressor(random_state=42)
# model = SGDRegressor(random_state=42)
# kernel = C(1.0, (1e-4, 1e1)) * RBF(1.0, (1e-4, 1e1))
# model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=15, random_state=42)
# model = HistGradientBoostingRegressor(random_state=42)

In [12]:
# parameter tuning
# random forest params
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(

In [13]:
# Baseline model prediction
# Calculate the mean of the target variable in the training set
n_samples = len(y_test)
mean_target = np.mean(y_train)

# Mean predictions (same value for all instances)
mean_predictions = np.full(n_samples, mean_target)

# Calculate MAE and MSE for the mean predictor
mean_mae = mean_absolute_error(y_test, mean_predictions)
mean_mse = mean_squared_error(y_test, mean_predictions)

print(f'Mean Predictor - MAE: {mean_mae}')
print(f'Mean Predictor - MSE: {mean_mse}')

Mean Predictor - MAE: 0.21552510033331063
Mean Predictor - MSE: 0.07107030916294399


In [14]:
# test set
y_test_pred = best_model.predict(X_test)

# evaluate performance
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)


print(f'Test MAE: {test_mae}')
print(f'Test MSE: {test_mse}')
print(f'Test R2: {test_r2}')

Test MAE: 0.19926457259495706
Test MSE: 0.07280708885923162
Test R2: -0.027902609723094995


In [15]:
# save model with pickle
save_path = '/content/drive/My Drive/Models/'

# Save the best model and preprocessor
with open(save_path + 'regression_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open(save_path + 'preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)