In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset (adjust the filename if needed).
data_path = "../data/raw/readmission.csv"

df = pd.read_csv(data_path)

print("Shape of dataset:", df.shape)
# NOTE: IPython only auto-renders the *last* expression of a cell, so every
# intermediate result below is passed to display() explicitly; otherwise it is
# computed and silently thrown away. display() is provided by the IPython
# kernel, no import is required inside a notebook.
display(df.head())

# Data inspection: df.info() prints its report itself, describe() returns a
# frame that has to be displayed.
df.info()
display(df.describe())

# Missing values per column, as absolute counts and as a percentage of rows.
# The null mask is computed once and reused for both summaries.
null_mask = df.isnull()
display(null_mask.sum().sort_values(ascending=False))
display((null_mask.mean() * 100).sort_values(ascending=False))

# Fix missing data by adding a special category: a missing value in these two
# columns is replaced with the explicit label 'Not Tested'.
df = df.fillna({
    'max_glu_serum': 'Not Tested',
    'A1Cresult': 'Not Tested'
})

# Remaining missing values in the two filled columns (both counts should be 0).
display(df[['max_glu_serum', 'A1Cresult']].isnull().sum())


#cleaner code
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#optional: make plots look nicer
sns.set(style='whitegrid', palette='muted')
%matplotlib inline

#load dataset
data_path = "../data/raw/readmission.csv"
df = pd.read_csv(data_path)

#fill missing values safely
df = df.fillna({
    'max_glu_serum': 'Not Tested',
    'A1Cresult': 'Not Tested'
})

#confirm no missing values remain
print("Missing values after filling:")
print(df[['max_glu_serum', 'A1Cresult']].isnull().sum())

#quick overview of readmission target
readmission_counts = df['readmitted'].value_counts(normalize=True) * 100
print("\nReadmission distribution (%):")
print(readmission_counts)

#simple plot
plt.figure(figsize=(6,4))
sns.countplot(x='readmitted', data=df)
plt.title("Distribution of Hospital Readmissions")
plt.show()

#quick check: Age vs Readmission
plt.figure(figsize=(10,5))
sns.countplot(x='age', hue='readmitted', data=df)
plt.title("Readmissions by Age Group")
plt.xticks(rotation=45)
plt.show()

#correlation heatmap (numeric columns only)
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(numeric_only=True), cmap='coolwarm')
plt.title("Correlation Matrix (Numeric Features)")
plt.show()


SyntaxError: invalid syntax (3611689464.py, line 1)