In [None]:
!pip install pandas

In [None]:
import pandas as pd

# Patient Readmission Analysis

1. Problem statement:
   - Hospital wants to know why patients are readmitted after discharge OR
   - Why are patients readmitted OR
   - Understand why the rate of readmission is increasing
2. Clean the data
3. Transform the data where necessary
4. Analyse the data (what drives readmission)
   - Get insights
5. Make Recommendations

In [None]:
df = pd.read_csv('patient_readmission.csv')

In [None]:
df.shape

In [None]:
df.head()

## Exploring the Data

In [None]:
df.info()

In [116]:
# check for duplicates
df.duplicated().any()

False

In [117]:
df.duplicated().sum()

0

In [None]:
df[df.duplicated()]

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
# checking for null values
df.isnull().sum()

In [None]:
# drop null values for these two columns
df.dropna(subset=['admission_date', 'discharge_date'], inplace=True)

In [None]:
# Reset the index after dropping the null values.
# reset_index returns a new DataFrame rather than modifying df, so the result
# has to be assigned back for the new index to take effect.
df = df.reset_index(drop=True)

In [None]:
# Parse the admission and discharge columns from object (text) into datetime values
for date_col in ('admission_date', 'discharge_date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Add admission_days: the number of whole days between a patient's admission date and discharge date
stay_length = df['discharge_date'] - df['admission_date']
df["admission_days"] = stay_length.dt.days

In [None]:
# Per the original note, most bill_amount cells hold plain digits while some contain commas or spaces.
# Remove every character that is not an ASCII letter or digit so all cells share one format.
# NOTE(review): the pattern also strips '.' and '-' (e.g. "1,234.50" would become "123450") and it
# keeps letters, which the integer conversion in the next cell cannot parse - confirm the raw
# column only ever contains whole, non-negative amounts.
df['bill_amount'] = df['bill_amount'].str.replace(r'[^a-zA-Z0-9]', '', regex=True)

In [None]:
# convert the bill amount to int64 since it was in object data type
df['bill_amount'] = df['bill_amount'].astype('Int64')

In [None]:
# dealing with the NaN values in the insurance column
# replace them with 'Unknown'
df['insurance'] = df['insurance'].fillna('Unknown')

In [None]:
# Strip surrounding whitespace and title-case every value in all text (object) columns
def clean_string_columns(df):
    """Normalise the text columns of ``df`` in place and return it.

    Every column with ``object`` dtype is cast to string, stripped of leading
    and trailing whitespace and title-cased (each word starts with an
    upper-case letter, the remaining letters are lower-cased).

    Missing values stay missing. Casting with ``astype(str)`` alone turns
    ``NaN``/``None`` into the literal text "Nan"/"None", which would then
    appear as a real category in any later group-by.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; its object columns are overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col in df.select_dtypes(include='object').columns:
        original = df[col]
        cleaned = original.astype(str).str.strip().str.title()
        # Keep the cleaned text only where a real value existed; put the
        # original missing marker back everywhere else.
        df[col] = cleaned.where(original.notna(), original)
    return df

# Usage:
df = clean_string_columns(df) 

In [None]:
# resets the index of the dataframe inplace
df.reset_index(drop=True, inplace= True)

Null values in the bill amount column

In [None]:
df['bill_amount'].describe()

In [None]:
# Mean bill amount per diagnosis category, rounded to three decimal places
bill_by_diagnosis = df.groupby('diagnosis')['bill_amount']
avg_bill_amount = bill_by_diagnosis.mean().round(3)
avg_bill_amount

In [None]:
# Replace null values in the bill amount column with the average bill amount of the same diagnosis category.
# transform('mean') yields one group average per row, aligned with the original index, so it can be passed straight to fillna.
def replace_null_values(df):
    """Fill missing ``bill_amount`` values with the rounded mean bill of the row's diagnosis.

    The ``bill_amount`` column of ``df`` is overwritten and ``df`` itself is
    returned. A row whose diagnosis group has no known bill at all has no
    mean to fall back on, so its ``bill_amount`` stays missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``diagnosis`` column and a numeric ``bill_amount`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``bill_amount`` filled where possible.
    """
    group_avg = df.groupby('diagnosis')['bill_amount'].transform('mean').round()
    if group_avg.isna().any():
        # A group with no known bill has an undefined mean; a plain ``int``
        # cast raises on it, so use the nullable integer dtype instead.
        group_avg = group_avg.astype('Int64')
    else:
        group_avg = group_avg.astype(int)

    df['bill_amount'] = df['bill_amount'].fillna(group_avg)
    return df

# Apply the function to replace null values in 'bill_amount'
df = replace_null_values(df)

Final cleaned Dataset 

In [None]:
df.info()

In [None]:
df

Creating Bins for Age column

In [None]:
# working with age
df['age'].describe()

In [None]:
# Function that assigns an age to one of four labelled bins for age aggregation
def age_group(age):
    """Return the age-group label for ``age``.

    Bins (upper bounds inclusive):
      * up to 30  -> "Youth (≤30)"
      * up to 45  -> "Young Adult (31-45)"
      * up to 60  -> "Middle Age (46-60)"
      * otherwise -> "Senior (61+)"

    A missing age (``None``, ``NaN`` or ``pd.NA``) returns ``None`` instead of
    a label. Every comparison with ``NaN`` is false, so without this guard a
    missing age would fall through to the last branch and be counted as
    "Senior (61+)".
    """
    if pd.isna(age):
        return None
    if age <= 30:
        return "Youth (≤30)"
    if age <= 45:
        return "Young Adult (31-45)"
    if age <= 60:
        return "Middle Age (46-60)"
    return "Senior (61+)"

In [None]:
# Label every patient with an age group by mapping age_group over the age column
df['Age_group'] = df['age'].map(age_group)

In [None]:
# The mean bill amount for the Asthma diagnosis category is 295595.857, which rounds to 295596.
# List the Asthma rows whose bill amount equals that rounded mean, to check that the
# group mean was written into rows whose bill amount was originally NaN.
# NOTE(review): 295596 is hard-coded from an earlier run and must be updated if the data changes.

df[(df['bill_amount'] == 295596) & (df['diagnosis'] == 'Asthma')]

## Simple analysis

In [75]:
# period of data collection 
display(df['admission_date'].max(), df['admission_date'].min())

Timestamp('2024-07-31 00:00:00')

Timestamp('2024-06-02 00:00:00')

##### *Readmission analysis*

In [124]:
(df["readmitted"].value_counts(normalize=True) * 100).round(1) # 47% out of all patients admitted are readmitted

readmitted
No     53.4
Yes    46.6
Name: proportion, dtype: float64

Average stay in the hospital

In [125]:
df.groupby("readmitted")["admission_days"].mean().round(1)

readmitted
No     8.2
Yes    9.8
Name: admission_days, dtype: float64

By Diagnosis

In [85]:
# Readmission rates (%) by diagnosis type, highest "Yes" share first
(
    df.groupby('diagnosis')['readmitted']
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
    .round()
    .sort_values(by='Yes', ascending=False)
)

readmitted,No,Yes
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
Hypertension,11.0,89.0
Diabetes,56.0,44.0
Cancer,58.0,42.0
Asthma,71.0,29.0
Fracture,71.0,29.0


By Department

In [84]:
# Readmission rates (%) by department, highest "Yes" share first
(
    df.groupby('department')['readmitted']
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
    .round()
    .sort_values(by='Yes', ascending=False)
)

readmitted,No,Yes
department,Unnamed: 1_level_1,Unnamed: 2_level_1
Neurology,43.0,57.0
Cardiology,46.0,54.0
Oncology,54.0,46.0
Orthopedics,54.0,46.0
Pediatrics,67.0,33.0


By Gender

In [96]:
# Readmission rates (%) by gender, highest "Yes" share first
(
    df.groupby('gender')['readmitted']
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
    .round()
    .sort_values(by='Yes', ascending=False)
)

readmitted,No,Yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,48.0,52.0
Female,59.0,41.0


By Age

In [104]:
# Readmission rates (%) by age group, highest "Yes" share first
(
    df.groupby('Age_group')['readmitted']
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
    .round()
    .sort_values(by='Yes', ascending=False)
)

readmitted,No,Yes
Age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Young Adult (31-45),27.0,73.0
Youth (≤30),36.0,64.0
Middle Age (46-60),50.0,50.0
Senior (61+),76.0,24.0


By Doctors

In [101]:
# Readmission rates (%) by doctor, highest "Yes" share first
(
    df.groupby('doctor')['readmitted']
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
    .round()
    .sort_values(by='Yes', ascending=False)
)

readmitted,No,Yes
doctor,Unnamed: 1_level_1,Unnamed: 2_level_1
Dr. Bello,27.0,73.0
Dr. Smith,46.0,54.0
Dr. Musa,54.0,46.0
Dr. Adams,55.0,45.0
Dr. Okeke,90.0,10.0


By City (cities with the most readmissions)

In [119]:
# Readmission rates (%) by city, highest "Yes" share first
(
    df.groupby('city')['readmitted']
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
    .round()
    .sort_values(by='Yes', ascending=False)
)

readmitted,No,Yes
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Enugu,43.0,57.0
Kano,50.0,50.0
Lagos,50.0,50.0
Abuja,60.0,40.0
Port Harcourt,60.0,40.0


## 📊 Insights from the Analysis

### Data collection timeframe: 
 - **2nd June 2024** to **31st July 2024** according to the admission date column

### 1. Overall Readmission Rate
- **46.6% of patients were readmitted**.
- That’s nearly half — a significant challenge for the hospital.

### 2. Length of Stay
- Readmitted patients stayed **longer on average (≈9.8 days)** compared to those not readmitted (≈8.2 days).
- Longer initial stays may indicate more severe conditions, complications, or gaps in the quality of care that extend recovery time.

### 3. By Diagnosis
- **Hypertension**: extremely high readmission at **89%** — an urgent red flag.
- **Diabetes (44%)** and **Cancer (42%)** patients also face moderate readmission risk.
- **Asthma (29%)** and **Fracture (29%)** have the lowest readmission rates.

### 4. By Department
- **Neurology (57%)** and **Cardiology (54%)** have the highest readmission rates.
- **Oncology (46%)** and **Orthopedics (46%)** are more balanced.
- **Pediatrics (33%)** has the lowest readmission rate.

### 5. By Gender 
- **Males (52%)** have the higher readmission rate.
- **Females** have a readmission rate of **41%**.

### 6. By Age Group
- **Young Adults (31–45)**: highest at **73% readmission**.
- **Youth (≤30)**: **64% readmitted**.
- **Middle Age (46–60)**: balanced at **50%**.
- **Seniors (61+)**: lowest at **24%**, which is unusual and may reflect sample size or mortality bias.

### 7. By Doctors
- **Dr. Bello** has the highest readmission rate at **73%**, which may reflect a higher proportion of complex cases under **Dr. Bello’s** care.
- **Dr. Smith** has a readmission rate of **54%**.
- **Dr. Musa** has a readmission rate of **46%** followed by **Dr. Adams** with a rate of **45%**.
- **Dr. Okeke** has the lowest readmission rate at **10%**.

### 8. By Cities
- **Enugu**  has a high readmission rate at **57%**.
- Followed by **Kano** and **Lagos** at **50%**.
- **Abuja** and **Port Harcourt** have the lowest readmission rate at **40%**.
---

## 💡 Recommendations

1. **Focus on chronic conditions**: Provide better follow-up and treatment plans for Hypertension and Diabetes patients.
2. **Departmental reviews**: Cardiology and Neurology should strengthen discharge planning and patient education.
3. **Age-specific interventions**: Target young and middle-aged adults with lifestyle and adherence programs.
4. **Doctor Support**: Provide targeted support and professional development focusing on discharge planning, follow-up care, and management of complex cases.
5. **City reviews**: Strengthen hospital resources and ensure patient follow-up in Enugu, Kano, and Lagos to reduce preventable readmissions.
6. **Support for long-stay patients**: Provide additional discharge checks and care plans to reduce bounce-back readmissions.
