# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler


# Location of the cleaned medical dataset, relative to the working directory.
medical_file_path = "medical_clean.csv"
# Load the CSV with its first column as the index. keep_default_na=False
# leaves NA-like strings as plain text instead of converting them to NaN.
df = pd.read_csv(
    medical_file_path,
    index_col=0,
    keep_default_na=False,
)



# Report whether any fully duplicated rows exist.
has_duplicates = df.duplicated().any()
print("Duplicates present:", has_duplicates)

# Check for missing data.
# The file is read with keep_default_na=False, so blank fields arrive as
# empty strings rather than NaN and isnull() alone would never flag them.
# Count empty strings as missing as well.
missing_data = (df.isnull() | df.eq("")).sum()

# Display the missing data counts
print("Missing data counts:")
print(missing_data)

# Display data types
df.info()
# Visually inspect df: show every column, and print the preview explicitly
# because a bare expression is discarded when this runs as a script.
pd.set_option("display.max_columns", None)
print(df.head(5))



# Descriptive statistics for the dependent variable, TotalCharge.
TotalCharge_summary = df["TotalCharge"].describe()
print("Summary Statistics for TotalCharge:")
print(TotalCharge_summary)

# Columns whose value frequencies are tabulated below.
columns_to_count = [
    "Children",
    "Age",
    "Income",
    "VitD_levels",
    "Doc_visits",
    "Full_meals_eaten",
    "HighBlood",
    "Stroke",
    "Diabetes",
    "Additional_charges",
    "Initial_admin",
]

# Tabulate every column first (frequencies ordered by value, not by count)...
counts_dict = {}
for col in columns_to_count:
    counts_dict[col] = df[col].value_counts().sort_index()

# ...then print each table under its own heading.
for col in columns_to_count:
    counts = counts_dict[col]
    print(f"\n{col} counts:")
    print(counts)

  
    


# TotalCharge: histogram stacked above a horizontal box plot (4:1 height ratio).
fig, axes = plt.subplots(2, 1, figsize=(10, 8), gridspec_kw={'height_ratios': [4, 1]})
ax1, ax2 = axes
total_charge = df["TotalCharge"]

# Top panel: distribution of TotalCharge in 30 bins.
ax1.hist(total_charge, bins=30, color='red', alpha=0.7)
ax1.set(title='Histogram of TotalCharge', xlabel='TotalCharge', ylabel='Frequency')
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# Bottom panel: the same data summarised as a filled, horizontal box plot.
ax2.boxplot(total_charge, vert=False, patch_artist=True)
ax2.set(title='Box Plot of TotalCharge', xlabel='TotalCharge')
ax2.grid(axis='x', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


# Children Univariate and Bivariate
# Sort the counts by number of children. seaborn draws numeric categories in
# ascending order, while value_counts() alone orders by frequency; without
# the sort the text labels below would be written above the wrong bars.
Children_counts = df["Children"].value_counts().sort_index()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart of the number of patients per number of children
sns.barplot(x=Children_counts.index, y=Children_counts.values, palette="Set3", ax=ax1)
ax1.set_title('Univariate Analysis of Children')
ax1.set_xlabel('Number of Children')
ax1.set_ylabel('Count of Patients')
# Annotate each bar with its count; bar positions are 0..n-1 in sorted order,
# which now matches the order of Children_counts.
for index, value in enumerate(Children_counts.values):
    ax1.text(index, value, str(value), ha='center', va='bottom')

# Box plot of Total Charges for each number of children
sns.boxplot(x=df["Children"], y=df["TotalCharge"], ax=ax2, palette="Set3")
ax2.set_title('Bivariate Analysis of Children and Total Charges')
ax2.set_xlabel('Number of Children')
ax2.set_ylabel('Total Charges')

plt.tight_layout()
plt.show()



# Age: distribution on the left, TotalCharge by age on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: 30-bin histogram of Age with a KDE overlay.
age = df["Age"]
sns.histplot(age, kde=True, bins=30, color="orange", ax=ax1)
ax1.set(title='Univariate Analysis of Age', xlabel='Age', ylabel='Count of Patients')

# Right panel: one TotalCharge box per distinct Age value.
sns.boxplot(data=df, x="Age", y="TotalCharge", ax=ax2)
ax2.set(title='Bivariate Analysis of Age and Total Charges', xlabel='Age', ylabel='Total Charges')

plt.tight_layout()
plt.show()


# Doc_visits Univariate and Bivariate
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: histogram of Doc_visits with a KDE overlay
sns.histplot(df["Doc_visits"], kde=True, bins=30, color="yellow", ax=ax1)
ax1.set_title('Univariate Analysis of Doctor Visits')
ax1.set_xlabel('Doc_visits')
ax1.set_ylabel('Count of Patients')

# Right panel: scatter plus fitted regression line of TotalCharge on
# Doc_visits. The axes is passed explicitly instead of relying on pyplot's
# implicit "current axes", matching the other sections of this file.
sns.regplot(data=df, x="Doc_visits", y="TotalCharge", scatter_kws={'alpha': 1/10}, ax=ax2)
ax2.set_title("Bivariate Analysis of Doc_visits and Total Charges")
ax2.set_xlabel('Doc_visits')
ax2.set_ylabel('Total Charges')
ax2.grid(axis='both', linestyle='--', alpha=0.7)

# Keep titles and labels of the two panels from overlapping.
plt.tight_layout()
plt.show()


# Full_meals_eaten Univariate and Bivariate
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: histogram of Full_meals_eaten with a KDE overlay
sns.histplot(df["Full_meals_eaten"], kde=True, bins=30, color="green", ax=ax1)
ax1.set_title('Univariate Analysis of Full_meals_eaten')
ax1.set_xlabel('Full_meals_eaten')
ax1.set_ylabel('Count of Patients')

# Right panel: scatter plus fitted regression line of TotalCharge on
# Full_meals_eaten. The axes is passed explicitly instead of relying on
# pyplot's implicit "current axes", matching the other sections of this file.
sns.regplot(data=df, x="Full_meals_eaten", y="TotalCharge", scatter_kws={'alpha': 1/10}, ax=ax2)
ax2.set_title("Bivariate Analysis of Full_meals_eaten and Total Charges")
ax2.set_xlabel('Full_meals_eaten')
ax2.set_ylabel('Total Charges')
ax2.grid(axis='both', linestyle='--', alpha=0.7)

# Keep titles and labels of the two panels from overlapping.
plt.tight_layout()
plt.show()


# Income: distribution on the left, relationship with TotalCharge on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: 30-bin histogram of Income with a KDE overlay.
income = df["Income"]
sns.histplot(income, kde=True, bins=30, color='blue', ax=ax1)
ax1.set(title='Univariate Analysis of Income', xlabel='Income', ylabel='Count of Patients')

# Right panel: semi-transparent scatter of TotalCharge against Income.
sns.scatterplot(data=df, x="Income", y="TotalCharge", ax=ax2, alpha=0.5)
ax2.set(title='Bivariate Analysis of Income and Total Charges', xlabel='Income', ylabel='Total Charges')

plt.tight_layout()
plt.show()

# VitD_levels: distribution on the left, relationship with TotalCharge on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: 30-bin histogram of VitD_levels with a KDE overlay.
vitd_levels = df["VitD_levels"]
sns.histplot(vitd_levels, kde=True, bins=30, color="purple", ax=ax1)
ax1.set(title='Univariate Analysis of VitD_levels', xlabel='VitD_levels', ylabel='Count of Patients')

# Right panel: semi-transparent scatter of TotalCharge against VitD_levels.
sns.scatterplot(data=df, x="VitD_levels", y="TotalCharge", ax=ax2, alpha=0.5)
ax2.set(title='Bivariate Analysis of VitD_levels and Total Charges', xlabel='VitD_levels', ylabel='Total Charges')

plt.tight_layout()
plt.show()

# Additional_charges: distribution on the left, relationship with TotalCharge on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: 30-bin histogram of Additional_charges with a KDE overlay.
additional_charges = df["Additional_charges"]
sns.histplot(additional_charges, kde=True, bins=30, color='teal', ax=ax1)
ax1.set(title='Univariate Analysis of Additional charges', xlabel='Additional_charges', ylabel='Count of Patients')

# Right panel: semi-transparent scatter of TotalCharge against Additional_charges.
sns.scatterplot(data=df, x="Additional_charges", y="TotalCharge", ax=ax2, alpha=0.5)
ax2.set(title='Bivariate Analysis of Additional charges and Total Charges', xlabel='Additional charges', ylabel='Total Charges')

plt.tight_layout()
plt.show()

# Initial_admin: share of each admission type (pie) and TotalCharge by type (box plot).
Initial_admin_counts = df["Initial_admin"].value_counts()
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: pie chart labelled with each category and its percentage.
ax1.pie(
    Initial_admin_counts,
    labels=Initial_admin_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax1.set_title('Univariate Analysis of Initial_admin')
ax1.axis('equal')  # equal aspect ratio keeps the pie circular

# Right panel: one TotalCharge box per admission type.
sns.boxplot(data=df, x="Initial_admin", y="TotalCharge", ax=ax2)
ax2.set(title='Bivariate Analysis of Initial_admin and TotalCharge', xlabel='Initial_admin', ylabel='Total Charges')

plt.tight_layout()
plt.show()


# Pie chart and TotalCharge box plot for each of the remaining categorical variables.
variables = ["HighBlood", "Stroke", "Diabetes"]

for var in variables:
    counts = df[var].value_counts()
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

    # Left panel: share of patients in each category.
    ax1.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        colors=sns.color_palette("rainbow"),
        startangle=140,
    )
    ax1.set_title(f'Univariate Analysis of {var}')
    ax1.axis('equal')  # equal aspect ratio keeps the pie circular

    # Right panel: TotalCharge distribution within each category.
    sns.boxplot(data=df, x=var, y="TotalCharge", ax=ax2, palette="rainbow")
    ax2.set(title=f'Bivariate Analysis of {var} and TotalCharge', xlabel=var, ylabel='Total Charges')

    plt.tight_layout()
    plt.show()



# Data transformation
# Cast the continuous columns to integers. For float data astype(int) drops
# the fractional part; it does not round to a fixed number of decimals.
# NOTE(review): confirm that whole-number precision is what is wanted here.
for numeric_col in ("Income", "VitD_levels", "Initial_days", "Additional_charges"):
    df[numeric_col] = df[numeric_col].astype(int)

# Encode the Yes/No columns as 1/0. Values outside the mapping become NaN.
bool_mapping = {"Yes": 1, "No": 0}
columns_to_convert = ["HighBlood", "Stroke", "Diabetes"]
for flag_col in columns_to_convert:
    df[flag_col] = df[flag_col].map(bool_mapping)

    
# Convert Initial_admin to a categorical column
df["Initial_admin"] = df["Initial_admin"].astype("category")

# One-hot encode Initial_admin. drop_first=True omits the first category,
# which then serves as the reference level.
initial_admit_df = pd.get_dummies(data=df["Initial_admin"], drop_first=True)

# Create new df with model variables
model_df = df[[
    "Children", "Age", "Income", "VitD_levels", "Doc_visits", "Full_meals_eaten",
    "HighBlood", "Stroke", "Diabetes", "Initial_days", "Additional_charges",
]].copy()


# Dummies for Initial Admit (cast to int so they are numeric 0/1)
# NOTE(review): "initial_admit_elect" is filled from the "Emergency Admission"
# dummy, so despite its name it flags emergency, not elective, admissions.
# The column name is kept because the regression code below selects it.
model_df["initial_admit_elect"] = initial_admit_df["Emergency Admission"].astype(int)
model_df["initial_admit_obs"] = initial_admit_df["Observation Admission"].astype(int)


# Attach the dependent variable so model_df holds everything the models need
model_df["TotalCharge"] = df["TotalCharge"]


# Visually inspect model_df: print the preview explicitly because a bare
# expression is discarded when this runs as a script.
pd.set_option("display.max_columns", None)
print(model_df.head(5))

# Save model_df to a CSV file (the index is not written)
model_df.to_csv("model_df.csv", index=False)
print("model_df has been saved to 'model_df.csv'.")




# Initial Multiple Linear Regression Model
y = model_df["TotalCharge"]

X = model_df[[
    "Children", "Age", "Income", "VitD_levels", "Doc_visits", "Full_meals_eaten",
    "HighBlood", "Stroke", "Diabetes", "Additional_charges",
    "initial_admit_elect", "initial_admit_obs",
]]

# Add an explicit intercept column to the predictors.
X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

print(results.summary())


# Residual Standard Error: ddof is the number of columns in X (predictors
# plus the constant). Printed explicitly because a bare expression is
# discarded when this runs as a script.
print("Residual Standard Error:", results.resid.std(ddof=X.shape[1]))



# Calculate VIF
def calculate_vif(X):
    """Return a table of variance inflation factors, one row per column of X.

    X is a DataFrame of predictors. The result has two columns:
    "variables" (the column names of X) and "VIF" (the value returned by
    variance_inflation_factor for that column's position).
    """
    design = X.values
    scores = []
    for position in range(X.shape[1]):
        scores.append(variance_inflation_factor(design, position))
    return pd.DataFrame({"variables": X.columns, "VIF": scores})

# Predictors of the initial model, checked here for multicollinearity.
vif_predictors = [
    "Children",
    "Age",
    "Income",
    "VitD_levels",
    "Doc_visits",
    "Full_meals_eaten",
    "HighBlood",
    "Stroke",
    "Diabetes",
    "Additional_charges",
    "initial_admit_elect",
    "initial_admit_obs",
]

# Include the constant column so the VIF table also has a row for it.
X = sm.add_constant(model_df[vif_predictors])

vif_df = calculate_vif(X)
print(vif_df)


# Rescale every column of model_df with MinMaxScaler and rebuild a DataFrame
# carrying the original column names (the result gets a fresh default index).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(model_df)
regress_df = pd.DataFrame(data=scaled_values, columns=model_df.columns)
# Show the scaled data.
print(regress_df)



# Model Reduction: backward elimination on the scaled data.
# Each pass fits OLS, prints the summary, and the next pass drops one
# predictor whose p-value was above 0.05. The nine passes are driven by one
# loop rather than nine copy-pasted fits, so the elimination order lives in
# a single list.
y = regress_df.TotalCharge

# Predictors of reduction #1.
# NOTE(review): unlike the initial model, this starting set does not include
# "Additional_charges"; the reason is not recorded in this file.
predictors = [
    "Children", "Age", "Income", "VitD_levels", "Doc_visits", "Full_meals_eaten",
    "HighBlood", "Stroke", "Diabetes", "initial_admit_elect", "initial_admit_obs",
]

# Predictor removed before each later reduction, with the p-value the author
# recorded for it in the preceding fit.
eliminated = [
    "Stroke",             # reduction #2, p-value .754
    "VitD_levels",        # reduction #3, p-value .681
    "Doc_visits",         # reduction #4, p-value .574
    "initial_admit_obs",  # reduction #5, p-value .537
    "Diabetes",           # reduction #6, p-value .228
    "Income",             # reduction #7, p-value .222
    "Full_meals_eaten",   # reduction #8, p-value .119
    "Age",                # reduction #9, p-value .087
]

# The leading None fits the starting set unchanged (reduction #1).
for dropped in [None, *eliminated]:
    if dropped is not None:
        predictors.remove(dropped)

    # Intercept is supplied as an explicit constant column named "const".
    X = regress_df[predictors].assign(const=1)

    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())


# Residual Standard Error of the final reduced model: ddof is the number of
# columns in X (remaining predictors plus the constant). Printed explicitly
# because a bare expression is discarded when this runs as a script.
print("Residual Standard Error:", results.resid.std(ddof=X.shape[1]))



# Residual Plot
# NOTE(review): this refits the full initial model on the unscaled model_df,
# so the plot shows that model's residuals rather than the reduced model's.
# Confirm that this is intended.
y = model_df["TotalCharge"]
X = model_df[[
    "Children", "Age", "Income", "VitD_levels", "Doc_visits", "Full_meals_eaten",
    "HighBlood", "Stroke", "Diabetes", "Additional_charges",
    "initial_admit_elect", "initial_admit_obs",
]]

# Coerce to numeric BEFORE dropping incomplete rows: values that cannot be
# parsed become NaN and must be removed as well, otherwise they reach OLS.
data = pd.concat([X, y], axis=1).apply(pd.to_numeric, errors='coerce').dropna()

X = data.drop(columns=["TotalCharge"])
y = data["TotalCharge"]

# Add an explicit intercept column to the predictors.
X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()

predicted_values = results.fittedvalues
residuals = results.resid

# Residuals against fitted values, with a dashed reference line at zero.
plt.scatter(predicted_values, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()