In [4]:
import os

os.chdir(r"C:\Users\Vanshika Garg\healthcare-risk-provider-analytics")
print("Working directory set to:", os.getcwd())


Working directory set to: C:\Users\Vanshika Garg\healthcare-risk-provider-analytics


In [5]:
os.listdir()


['.git', '.idea', 'data', 'README.md', 'sql']

In [6]:
import pandas as pd

patients = pd.read_csv("data/raw/patients.csv")
encounters = pd.read_csv("data/raw/encounters.csv")
providers = pd.read_csv("data/raw/providers.csv")
readmissions = pd.read_csv("data/raw/readmissions.csv")


In [7]:
patients.shape

(1000, 5)

In [8]:
encounters.shape

(3000, 6)

In [9]:
providers.shape

(50, 3)

In [10]:
readmissions.shape

(3000, 2)

In [11]:
patients.columns

Index(['patient_id', 'age', 'gender', 'chronic_condition', 'region'], dtype='object')

In [12]:
encounters.columns

Index(['encounter_id', 'patient_id', 'provider_id', 'admit_date',
       'length_of_stay', 'discharge_date'],
      dtype='object')

In [13]:
providers.columns

Index(['provider_id', 'department', 'experience_years'], dtype='object')

In [14]:
readmissions.columns

Index(['encounter_id', 'readmitted_30_days'], dtype='object')

In [15]:
patients["patient_id"].is_unique        # should be True

True

In [16]:

encounters["encounter_id"].is_unique    # should be True

True

In [17]:
df = (
    encounters
    .merge(patients, on="patient_id", how="left")
    .merge(readmissions, on="encounter_id", how="left")
    .merge(providers, on="provider_id", how="left")
)


In [18]:
df.shape

(3000, 13)

In [19]:
df.columns

Index(['encounter_id', 'patient_id', 'provider_id', 'admit_date',
       'length_of_stay', 'discharge_date', 'age', 'gender',
       'chronic_condition', 'region', 'readmitted_30_days', 'department',
       'experience_years'],
      dtype='object')

In [20]:
df.head(10)

Unnamed: 0,encounter_id,patient_id,provider_id,admit_date,length_of_stay,discharge_date,age,gender,chronic_condition,region,readmitted_30_days,department,experience_years
0,1,213,16,2023-08-17,7,2023-08-24,86,Male,Hypertension,South,1,General Medicine,29
1,2,310,28,2023-08-28,8,2023-09-05,74,Male,Hypertension,North,0,Cardiology,15
2,3,661,48,2023-12-29,1,2023-12-30,84,Male,Diabetes,North,1,Orthopedics,16
3,4,968,35,2023-11-06,3,2023-11-09,18,Female,,South,0,General Medicine,5
4,5,753,45,2023-05-21,7,2023-05-28,41,Male,,North,0,Cardiology,22
5,6,272,46,2023-06-05,13,2023-06-18,63,Male,Heart Disease,East,0,Cardiology,4
6,7,571,8,2023-10-26,9,2023-11-04,23,Female,Hypertension,South,1,Neurology,5
7,8,799,44,2023-12-11,8,2023-12-19,41,Male,,East,0,Neurology,15
8,9,938,22,2023-04-09,8,2023-04-17,42,Female,Diabetes,South,0,Neurology,4
9,10,202,21,2023-02-16,9,2023-02-25,67,Male,Diabetes,East,1,Cardiology,15


In [21]:
df["encounter_id"].is_unique

True

In [22]:
df.isnull().sum()

encounter_id            0
patient_id              0
provider_id             0
admit_date              0
length_of_stay          0
discharge_date          0
age                     0
gender                  0
chronic_condition     878
region                  0
readmitted_30_days      0
department              0
experience_years        0
dtype: int64

In [23]:
# FEATURE ENGINEERING
#marking age group

df["is_elderly"]= df["age"]>=60

In [24]:
df["has_chronic_cond"]= df["chronic_condition"] !="None"

In [25]:
df.head()

Unnamed: 0,encounter_id,patient_id,provider_id,admit_date,length_of_stay,discharge_date,age,gender,chronic_condition,region,readmitted_30_days,department,experience_years,is_elderly,has_chronic_cond
0,1,213,16,2023-08-17,7,2023-08-24,86,Male,Hypertension,South,1,General Medicine,29,True,True
1,2,310,28,2023-08-28,8,2023-09-05,74,Male,Hypertension,North,0,Cardiology,15,True,True
2,3,661,48,2023-12-29,1,2023-12-30,84,Male,Diabetes,North,1,Orthopedics,16,True,True
3,4,968,35,2023-11-06,3,2023-11-09,18,Female,,South,0,General Medicine,5,False,True
4,5,753,45,2023-05-21,7,2023-05-28,41,Male,,North,0,Cardiology,22,False,True


In [26]:
visit_counts = df.groupby("patient_id")["encounter_id"].transform("count")
df["frequent_visitor"]= visit_counts>=3

In [27]:
df[["is_elderly", "has_chronic_cond", "frequent_visitor"]].mean()*100


is_elderly           42.7
has_chronic_cond    100.0
frequent_visitor     78.7
dtype: float64

In [28]:
df["chronic_condition"].value_counts()


chronic_condition
Diabetes         800
Hypertension     703
Heart Disease    619
Name: count, dtype: int64

In [29]:
df["has_chronic_condition"] = df["chronic_condition"].str.strip().str.lower() != "none"


In [30]:
df[["is_elderly", "has_chronic_condition", "frequent_visitor"]].mean()


is_elderly               0.427
has_chronic_condition    1.000
frequent_visitor         0.787
dtype: float64

In [31]:
df["chronic_condition"].value_counts(dropna=False)


chronic_condition
NaN              878
Diabetes         800
Hypertension     703
Heart Disease    619
Name: count, dtype: int64

In [32]:
df["has_chronic_condition"] = df["chronic_condition"].notna()


In [33]:
df[["is_elderly", "has_chronic_condition", "frequent_visitor"]].mean()*100


is_elderly               42.700000
has_chronic_condition    70.733333
frequent_visitor         78.700000
dtype: float64

In [34]:
#Compare readmission rates
rates = df.groupby("is_elderly")["readmitted_30_days"].mean()

elderly_rate = rates.loc[True]
non_elderly_rate = rates.loc[False]

risk_ratio = elderly_rate / non_elderly_rate

print(f"Elderly readmission rate: {elderly_rate:.2%}")
print(f"Non-elderly readmission rate: {non_elderly_rate:.2%}")
print(f"Elderly patients show ~{risk_ratio:.2f}× higher readmission probability.")


Elderly readmission rate: 52.69%
Non-elderly readmission rate: 31.82%
Elderly patients show ~1.66× higher readmission probability.


In [35]:
rates = df.groupby("frequent_visitor")["readmitted_30_days"].mean()

freq_rate = rates.loc[True]
non_freq_rate = rates.loc[False]

risk_ratio = freq_rate / non_freq_rate

print(f"Frequent visitor readmission rate: {freq_rate:.2%}")
print(f"Non-frequent visitor readmission rate: {non_freq_rate:.2%}")
print(f"Frequent visitors show ~{risk_ratio:.2f}× higher readmission risk.")


Frequent visitor readmission rate: 41.13%
Non-frequent visitor readmission rate: 39.28%
Frequent visitors show ~1.05× higher readmission risk.


In [36]:
#RISK SCORING
#ASSUMPTION
df["risk_score"] = (
    df["is_elderly"].astype(int) * 0.3 +
    df["has_chronic_condition"].astype(int) * 0.4 +
    df["frequent_visitor"].astype(int) * 0.3
)


In [37]:
df["risk_category"] = pd.cut(
    df["risk_score"],
    bins=[-1, 0.3, 0.6, 1],
    labels=["Low", "Medium", "High"]
)


In [38]:
df.groupby("risk_category", observed=True)["readmitted_30_days"].mean()*100



risk_category
Low       11.591356
Medium    36.322188
High      50.409165
Name: readmitted_30_days, dtype: float64

In [39]:
treatments=pd.read_csv("data/raw/treatments.csv")

In [40]:
#COSTING
cost_per_encounter=(
    treatments.groupby("encounter_id")["treatment_cost"]
    .sum()
    .reset_index()
)

In [41]:
df = df.merge(cost_per_encounter, on="encounter_id", how="left")


In [42]:
df.groupby("risk_category", observed=True)["treatment_cost"].mean()


risk_category
Low        97746.601179
Medium    102459.370821
High      103501.082379
Name: treatment_cost, dtype: float64

In [43]:
#PATIENT-LEVEL RISK SUMMARY

patient_risk = (
    df.groupby("patient_id")
      .agg(
          max_risk_score=("risk_score", "max"),
          risk_category=("risk_category", "max"),
          total_encounters=("encounter_id", "count"),
          avg_treatment_cost=("treatment_cost", "mean")
      )
      .reset_index()
)


In [44]:
df.head()

Unnamed: 0,encounter_id,patient_id,provider_id,admit_date,length_of_stay,discharge_date,age,gender,chronic_condition,region,readmitted_30_days,department,experience_years,is_elderly,has_chronic_cond,frequent_visitor,has_chronic_condition,risk_score,risk_category,treatment_cost
0,1,213,16,2023-08-17,7,2023-08-24,86,Male,Hypertension,South,1,General Medicine,29,True,True,False,True,0.7,High,59649
1,2,310,28,2023-08-28,8,2023-09-05,74,Male,Hypertension,North,0,Cardiology,15,True,True,True,True,1.0,High,107986
2,3,661,48,2023-12-29,1,2023-12-30,84,Male,Diabetes,North,1,Orthopedics,16,True,True,True,True,1.0,High,88749
3,4,968,35,2023-11-06,3,2023-11-09,18,Female,,South,0,General Medicine,5,False,True,True,False,0.3,Low,146295
4,5,753,45,2023-05-21,7,2023-05-28,41,Male,,North,0,Cardiology,22,False,True,True,False,0.3,Low,184238


In [45]:
patient_risk.to_csv(
    "data/processed/patient_risk_summary.csv",
    index=False
)

In [46]:
#Will the patient be readmitted within 30 days?

y = df["readmitted_30_days"]

In [47]:
X = df[["age", "length_of_stay", "risk_score"]]

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [49]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)

In [50]:
model.fit(X_train, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [51]:
y_pred = model.predict(X_test)

In [52]:
from sklearn.metrics import recall_score, precision_score

recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

recall, precision

(0.30364372469635625, 0.5769230769230769)

In [53]:
import pandas as pd

coef_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", ascending=False)

coef_df


Unnamed: 0,feature,coefficient
2,risk_score,2.555323
1,length_of_stay,0.00836
0,age,0.005611


In [56]:
y_prob = model.predict_proba(X_test)

In [57]:
readmit_prob = y_prob[:, 1]


In [58]:
df_test = X_test.copy()
df_test["readmission_probability"] = readmit_prob


In [59]:
df_test["ml_risk_category"] = pd.cut(
    df_test["readmission_probability"],
    bins=[0, 0.3, 0.6, 1.0],
    labels=["Low", "Medium", "High"]
)


In [60]:
df_test["actual_readmission"] = y_test.values

df_test.groupby("ml_risk_category", observed=True)["actual_readmission"].mean()


ml_risk_category
Low       0.253247
Medium    0.420886
High      0.576923
Name: actual_readmission, dtype: float64

In [61]:
df_test.to_csv(
    "data/processed/ml_readmission_predictions.csv",
    index=False
)
