In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [None]:

# Load the dataset: read 'medical_clean.csv' (a path relative to the kernel's
# working directory) into the DataFrame `data`.
data = pd.read_csv('medical_clean.csv')

In [None]:
# Column groups used by the preprocessing below: names treated as numerical
# features, then names treated as categorical features.
numerical_columns = [
    'Lat', 'Lng', 'Population', 'Children', 'Age', 'Income',
    'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp',
    'Initial_days', 'Additional_charges',
    'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
]
categorical_columns = [
    'City', 'State', 'County', 'Area', 'TimeZone', 'Job', 'Marital', 'Gender',
    'ReAdmis', 'Initial_admin', 'HighBlood', 'Stroke', 'Complication_risk',
    'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain',
    'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma', 'Services',
]

In [None]:
# Pull out the regression target, then keep every other column as a predictor.
y = data['TotalCharge']
X = data.drop(columns='TotalCharge')

In [None]:
# Define a column transformer: standardize the numerical columns and one-hot
# encode the categorical ones. With handle_unknown='ignore' the encoder emits
# all-zero indicator columns for a category it did not see during fit instead
# of raising.
# make_column_transformer names each step after its lowercased class name; the
# 'onehotencoder' lookup later in this notebook relies on that naming.
preprocessor = make_column_transformer(
    (StandardScaler(), numerical_columns),
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns)
)

In [None]:
# Prepare the dataset using the preprocessor.
# NOTE(review): the scaler and encoder are fitted on every row of X here (no
# train/test split precedes this call). That is fine for exploration, but this
# matrix should not be reused for held-out evaluation.
X_prepared = preprocessor.fit_transform(X)

In [None]:
# Convert the transformed data into a DataFrame backed by sparse columns.
# NOTE(review): from_spmatrix expects a SciPy sparse matrix; this assumes the
# column transformer returned sparse output — confirm if the column lists change.
X_prepared_df = pd.DataFrame.sparse.from_spmatrix(X_prepared)

In [None]:
# Create column names for the new DataFrame.
# The scaled columns keep their original names. This binds a second name to the
# same list object as `numerical_columns`; it is not a copy.
numerical_column_names = numerical_columns
# One name per one-hot output column, taken from the fitted encoder step.
categorical_column_names = preprocessor.named_transformers_['onehotencoder'].get_feature_names_out(categorical_columns)

In [None]:
# Combine the column names: numerical names first, then the one-hot names,
# matching the order in which the transformers were passed to
# make_column_transformer. `+` builds a new list, so `numerical_columns` itself
# is not modified.
column_names = numerical_column_names + categorical_column_names.tolist()

In [None]:
# Assign the column names to the DataFrame. The list must contain exactly one
# name per column of X_prepared_df, otherwise pandas raises on assignment.
X_prepared_df.columns = column_names

In [None]:
# Add the 'TotalCharge' column. `.values` passes the raw array, so the target is
# attached by row position rather than aligned on y's index labels.
X_prepared_df['TotalCharge'] = y.values


In [None]:
# Save the prepared data (transformed features plus 'TotalCharge') to a new CSV
# file; index=False leaves the row index out of the file.
X_prepared_df.to_csv('medical_prepared.csv', index=False)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the prepared (scaled and one-hot encoded) data set from
# 'medical_prepared.csv'. This rebinds `data`, which earlier in the notebook
# held the frame read from 'medical_clean.csv'.
data = pd.read_csv('medical_prepared.csv')

In [None]:
# Univariate histograms.
# The prepared file holds one 0/1 indicator column per category level (City
# alone has several thousand levels), so looping over every column would render
# thousands of degenerate two-bar figures. Only columns with more than two
# distinct values are plotted.
histogram_columns = [col for col in data.columns if data[col].nunique() > 2]
for col in histogram_columns:
    fig, ax = plt.subplots()
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f'{col} Distribution')
    plt.show()
    plt.close(fig)  # release the figure so open figures do not pile up in the loop

In [None]:
# Bivariate scatter plots of each feature against the target.
# Two kinds of column are skipped:
#   * 'TotalCharge' itself — plotting the target against itself is a diagonal
#     line that carries no information;
#   * 0/1 indicator columns (two or fewer distinct values) — the prepared file
#     holds thousands of them, which would produce thousands of figures.
scatter_columns = [
    col for col in data.columns
    if col != 'TotalCharge' and data[col].nunique() > 2
]
for col in scatter_columns:
    fig, ax = plt.subplots()
    sns.scatterplot(data=data, x=col, y='TotalCharge', ax=ax)
    ax.set_title(f'{col} vs. TotalCharge')
    plt.show()
    plt.close(fig)  # release the figure so open figures do not pile up in the loop

In [None]:
import statsmodels.api as sm

# Create dummy variables for categorical predictors.
# `categorical_predictors` was referenced without ever being defined, which
# raised NameError; bind it to the categorical column list declared earlier in
# the notebook.
categorical_predictors = categorical_columns
# drop_first=True omits one level per variable to avoid perfect collinearity
# with the intercept. dtype=float keeps the dummies numeric: recent pandas
# versions emit boolean dummies by default, which statsmodels rejects once they
# are mixed with float columns.
# NOTE(review): City alone has thousands of levels, so this frame is very wide;
# consider trimming `categorical_predictors` before fitting.
X_cat = pd.get_dummies(X[categorical_predictors], drop_first=True, dtype=float)

In [None]:
# Combine continuous predictors with dummy variables.
# `continuous_predictors` was referenced without ever being defined, which
# raised NameError; bind it to the numerical column list declared earlier in
# the notebook.
continuous_predictors = numerical_columns
# axis=1 places the two frames side by side, aligned on the shared row index.
X_all = pd.concat([X[continuous_predictors], X_cat], axis=1)

In [None]:
# Add an intercept column to the predictor variables; statsmodels' OLS does not
# add one on its own. The result replaces X_all.
X_all = sm.add_constant(X_all)

In [None]:
# Fit multiple linear regression model by ordinary least squares, with y as the
# response and X_all (intercept included) as the design matrix.
# NOTE(review): the name `initial_model` is bound again later in this notebook
# to a scikit-learn pipeline, so this results object is overwritten there.
initial_model = sm.OLS(y, X_all).fit()

In [None]:
# Print the statsmodels summary report for the fitted model.
print(initial_model.summary())

In [None]:
import statsmodels.api as sm

# This cell repeats the step-by-step OLS fit from the preceding cells in one
# place and stores the result in `model`.

# `categorical_predictors` and `continuous_predictors` were referenced without
# ever being defined, which raised NameError; bind them to the column lists
# declared earlier in the notebook.
categorical_predictors = categorical_columns
continuous_predictors = numerical_columns

# Create dummy variables for categorical predictors.
# drop_first=True omits one level per variable to avoid perfect collinearity
# with the intercept. dtype=float keeps the design matrix numeric: recent pandas
# versions emit boolean dummies by default, which statsmodels rejects once they
# are mixed with float columns.
# NOTE(review): City alone has thousands of levels, so this frame is very wide;
# consider trimming `categorical_predictors` before fitting.
X_cat = pd.get_dummies(X[categorical_predictors], drop_first=True, dtype=float)

# Combine continuous predictors with dummy variables (side by side, aligned on
# the shared row index)
X_all = pd.concat([X[continuous_predictors], X_cat], axis=1)

# Add constant to the predictor variables; OLS does not add an intercept itself
X_all = sm.add_constant(X_all)

# Fit multiple linear regression model
model = sm.OLS(y, X_all).fit()

# Print summary of the model
print(model.summary())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error

# Read the cleaned dataset and separate the target from its predictors
df = pd.read_csv('medical_clean.csv')
y = df['TotalCharge']
X = df.drop(columns='TotalCharge')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns routed to the one-hot encoder and to the scaler, respectively
categorical_columns = [
    'City', 'State', 'County', 'Area', 'TimeZone', 'Job', 'Marital', 'Gender',
    'ReAdmis', 'Initial_admin', 'HighBlood', 'Stroke', 'Complication_risk',
    'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain',
    'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma', 'Services',
]
numerical_columns = [
    'Lat', 'Lng', 'Population', 'Children', 'Age', 'Income',
    'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp',
    'Initial_days', 'Additional_charges',
    'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
]

# Each transformer is wrapped in its own single-step pipeline
numerical_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Categorical block is listed first, numerical block second
preprocessor = make_column_transformer(
    (categorical_transformer, categorical_columns),
    (numerical_transformer, numerical_columns),
)

# Initial multiple regression model: preprocessing followed by linear regression,
# fitted on the training rows only
initial_model = make_pipeline(preprocessor, LinearRegression())
initial_model.fit(X_train, y_train)

# Score the initial model on the held-out rows
y_pred = initial_model.predict(X_test)
initial_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
initial_r2 = r2_score(y_test, y_pred)

# Predictors retained for the reduced multiple regression model
reduced_columns = [
    'Population', 'Age', 'Income', 'Doc_visits', 'Initial_days',
    'Item1', 'Item5', 'Item6', 'Item7', 'Item8',
    'Area', 'Job', 'Marital', 'Gender', 'ReAdmis', 'HighBlood', 'Stroke',
    'Complication_risk', 'Overweight', 'Arthritis', 'Diabetes',
    'Hyperlipidemia', 'BackPain', 'Anxiety', 'Allergic_rhinitis',
    'Reflux_esophagitis', 'Asthma', 'Services',
]

# Filter each column group down to the retained names, keeping the order of the
# original group lists
reduced_categorical_columns = list(
    filter(lambda name: name in reduced_columns, categorical_columns)
)
reduced_numerical_columns = list(
    filter(lambda name: name in reduced_columns, numerical_columns)
)

# Fresh single-step pipelines for the reduced feature set
reduced_numerical_transformer = make_pipeline(StandardScaler())
reduced_categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Categorical block is listed first, numerical block second
reduced_preprocessor = make_column_transformer(
    (reduced_categorical_transformer, reduced_categorical_columns),
    (reduced_numerical_transformer, reduced_numerical_columns),
)

# Reduced model: preprocessing followed by linear regression (not yet fitted here)
reduced_model = make_pipeline(reduced_preprocessor, LinearRegression())
reduced_model.fit(X_train[


In [2]:
import pandas as pd

In [3]:
# Load the cleaned dataset from 'medical_clean.csv' (a path relative to the
# kernel's working directory) into the DataFrame `data`.
data = pd.read_csv('medical_clean.csv')

In [4]:
# Names of the columns summarised as numerical data; the eight survey columns
# Item1..Item8 are generated rather than spelled out.
numerical_columns = [
    'Lat', 'Lng', 'Population', 'Children', 'Age', 'Income',
    'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp',
    'Initial_days', 'Additional_charges',
] + [f'Item{number}' for number in range(1, 9)]

# Names of the columns summarised as categorical data.
categorical_columns = [
    'City', 'State', 'County', 'Area', 'TimeZone', 'Job', 'Marital', 'Gender',
    'ReAdmis', 'Initial_admin', 'HighBlood', 'Stroke', 'Complication_risk',
    'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain',
    'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma', 'Services',
]

In [5]:
# describe() table (count, mean, std, min, quartiles, max) for the numerical
# columns, printed under a heading
numerical_summary = data[numerical_columns].describe()
print("Summary statistics for numerical columns:", numerical_summary, sep="\n")

Summary statistics for numerical columns:
                Lat           Lng     Population      Children           Age  \
count  10000.000000  10000.000000   10000.000000  10000.000000  10000.000000   
mean      38.751099    -91.243080    9965.253800      2.097200     53.511700   
std        5.403085     15.205998   14824.758614      2.163659     20.638538   
min       17.967190   -174.209700       0.000000      0.000000     18.000000   
25%       35.255120    -97.352982     694.750000      0.000000     36.000000   
50%       39.419355    -88.397230    2769.000000      1.000000     53.000000   
75%       42.044175    -80.438050   13945.000000      3.000000     71.000000   
max       70.560990    -65.290170  122814.000000     10.000000     89.000000   

              Income   VitD_levels    Doc_visits  Full_meals_eaten  \
count   10000.000000  10000.000000  10000.000000      10000.000000   
mean    40490.495160     17.964262      5.012200          1.001400   
std     28521.153293      2

In [6]:
# Frequency table for every categorical column, one headed section per column
print("\nSummary statistics for categorical columns:")
for col in categorical_columns:
    print(f"\n{col} value counts:", data[col].value_counts(), sep="\n")


Summary statistics for categorical columns:

City value counts:
Houston             36
San Antonio         26
Springfield         22
New York            21
Miami               21
                    ..
Coyote               1
Tiline               1
Monon                1
Sullivans Island     1
Coraopolis           1
Name: City, Length: 6072, dtype: int64

State value counts:
TX    553
CA    550
PA    547
NY    514
IL    442
OH    383
MO    328
FL    304
VA    287
IA    276
MI    273
MN    267
NC    254
GA    247
KS    220
WI    214
KY    210
OK    207
WV    207
IN    195
AL    194
TN    194
WA    191
AR    190
NE    185
CO    179
NJ    176
LA    173
MA    149
MS    134
MD    131
SC    128
SD    123
ME    122
OR    122
MT    112
NM    110
ID    109
ND    108
AZ    108
CT     80
NH     79
UT     72
AK     70
VT     60
NV     51
WY     51
PR     43
HI     34
DE     17
RI     14
DC     13
Name: State, dtype: int64

County value counts:
Jefferson      118
Washington     100
Franklin        

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset: read 'medical_clean.csv' (a path relative to the kernel's
# working directory) into the DataFrame `data`.
data = pd.read_csv('medical_clean.csv')

In [3]:
# Name of the response column; every other column of the frame is a feature
target = 'TotalCharge'
y = data[target]
X = data.drop(target, axis=1)

In [4]:
# Split the data into training and testing sets: 20% of the rows are held out
# for testing, and random_state=42 makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Identify numerical and categorical columns.
# select_dtypes(include='number') matches every numeric dtype (e.g. int32 or
# float32 as well), not only float64/int64. With an exact-dtype membership test,
# such columns would silently fall into the categorical group and be one-hot
# encoded. Boolean columns are not counted as numeric and stay categorical.
numerical_columns = X.select_dtypes(include='number').columns.tolist()
# Everything that is not numeric is treated as categorical, in frame order.
# NOTE(review): any identifier-like text column present in the CSV would land
# here and be one-hot encoded — confirm the column set is what is intended.
categorical_columns = X.select_dtypes(exclude='number').columns.tolist()


In [6]:
# One transformer per feature group: one-hot encoding for categorical columns
# (categories unseen at fit time are ignored rather than raising) and
# standardisation for numerical columns
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()


In [7]:
# Route each column group to its transformer; the 'num' block is listed before
# the 'cat' block
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [8]:
# Ridge regression estimator with alpha=1.0, created unfitted
ridge_model = Ridge(alpha=1.0, random_state=42)

# Fit the preprocessor on the training rows only, then apply that same fitted
# preprocessing to the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [9]:
# Fit the Ridge regression model to the training data. The call is the cell's
# last expression, so the notebook displays the fitted estimator's repr.
ridge_model.fit(X_train_transformed, y_train)

Ridge(random_state=42)

In [10]:
# Predict on both splits with the fitted Ridge model (training first, test second)
y_pred_train, y_pred_test = (
    ridge_model.predict(features)
    for features in (X_train_transformed, X_test_transformed)
)


In [11]:
# MSE and R-squared of the Ridge predictions, computed per split
mse_train, r2_train = (
    metric(y_train, y_pred_train) for metric in (mean_squared_error, r2_score)
)
mse_test, r2_test = (
    metric(y_test, y_pred_test) for metric in (mean_squared_error, r2_score)
)

In [12]:
# Report the training-split metrics, rounded to two decimals, one item per line
print("Training set:", f"MSE: {mse_train:.2f}", f"R2: {r2_train:.2f}", sep="\n")

Training set:
MSE: 5.07
R2: 1.00


In [13]:
# Report the test-split metrics, rounded to two decimals, one item per line
print("\nTest set:", f"MSE: {mse_test:.2f}", f"R2: {r2_test:.2f}", sep="\n")


Test set:
MSE: 22.96
R2: 1.00


In [14]:
# Residual = actual minus predicted, for the training and test splits
residuals_train, residuals_test = y_train - y_pred_train, y_test - y_pred_test

In [15]:
# Side-by-side tables of actual value, prediction and residual for each split
train_results = pd.DataFrame(
    dict(Actual=y_train, Predicted=y_pred_train, Residual=residuals_train)
)
test_results = pd.DataFrame(
    dict(Actual=y_test, Predicted=y_pred_test, Residual=residuals_test)
)


In [16]:
# Show the first ten rows of each results table under its heading
print(
    "Training set results:",
    train_results.head(10),
    "\nTest set results:",
    test_results.head(10),
    sep="\n",
)

Training set results:
           Actual    Predicted  Residual
9254  7749.506000  7750.959775 -1.453775
1561  3303.446900  3300.923577  2.523323
1670  2330.569902  2333.591841 -3.021939
6087  7980.623000  7979.775252  0.847748
6669  7284.532000  7285.970774 -1.438774
5933  8454.486000  8452.661964  1.824036
8829  8261.154000  8264.964022 -3.810022
7945  7821.753000  7823.926085 -2.173085
3508  2751.985864  2752.208351 -0.222487
2002  2561.944811  2561.334579  0.610232

Test set results:
           Actual    Predicted   Residual
6252  6060.700000  6054.647343   6.052657
4684  3255.814455  3262.606992  -6.792537
1731  2844.703206  2846.164870  -1.461664
4742  3133.944906  3137.131507  -3.186601
4521  4125.009552  4124.506708   0.502844
6340  6329.840000  6319.615464  10.224536
576   3618.130693  3613.718978   4.411715
5202  7442.355000  7438.111543   4.243457
6363  7112.117000  7110.424327   1.692673
439   3691.595666  3693.343793  -1.748127
