In [44]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [45]:
# Load dataset
data = pd.read_csv('medical_clean.csv')

In [46]:
# Make sure the 'Initial_days' column exists in the dataset
if 'Initial_days' not in data.columns:
    print("The 'Initial_days' column is missing in the dataset. Please check the dataset.")
    exit()

In [47]:
# Handle categorical variables using LabelEncoder
encoder = LabelEncoder()
categorical_columns = ['City', 'State', 'County', 'Area', 'TimeZone', 'Job', 'Marital', 'Gender', 'ReAdmis']
for column in categorical_columns:
    data[column] = encoder.fit_transform(data[column].astype(str))

In [48]:
# Drop non-numeric columns
numeric_columns = data.select_dtypes(include=np.number).columns
data = data[numeric_columns]

In [49]:
# Define the target variable and features
y = data['Initial_days']
X = data.drop(columns=['Initial_days'])

In [50]:
# Make sure the 'Initial_days' column exists in the dataset
if 'Initial_days' not in data.columns:
    print("The 'Initial_days' column is missing in the dataset. Please check the dataset.")
    exit()

# Handle categorical variables using LabelEncoder
encoder = LabelEncoder()
categorical_columns = ['City', 'State', 'County', 'Area', 'TimeZone', 'Job', 'Marital', 'Gender', 'ReAdmis']
for column in categorical_columns:
    data[column] = encoder.fit_transform(data[column].astype(str))

print("Categorical variables encoded:\n", data[categorical_columns].head())

# Drop non-numeric columns
numeric_columns = data.select_dtypes(include=np.number).columns
data = data[numeric_columns]

print("Data with only numeric columns:\n", data.head())

# Define the target variable and features
y = data['Initial_days']
X = data.drop(columns=['Initial_days'])

print("Target variable 'Initial_days':\n", y.head())
print("Feature matrix X:\n", X.head())

Categorical variables encoded:
    City  State  County  Area  TimeZone  Job  Marital  Gender  ReAdmis
0   752      1    1580     1        19  421        0       1        0
1  2483     51    1289     2        19   29        1       0        0
2  4471     37    1554     1        19  633        4       0        0
3  3093     16     574     1        19   87        1       1        0
4  5351     41    1348     0         8  207        4       0        0
Data with only numeric columns:
    CaseOrder  City  State  County    Zip       Lat       Lng  Population  \
0          1   752      1    1580  35621  34.34960 -86.72508        2951   
1          2  2483     51    1289  32446  30.84513 -85.22907       11303   
2          3  4471     37    1554  57110  43.54321 -96.63772       17125   
3          4  3093     16     574  56072  43.89744 -93.51479        2162   
4          5  5351     41    1348  23181  37.59894 -76.88958        5287   

   Area  TimeZone  ...  TotalCharge  Additional_charges  I

In [51]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
# Print the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (8000, 31)
X_test shape: (2000, 31)
y_train shape: (8000,)
y_test shape: (2000,)


In [53]:
# Print the first few rows of the training and testing sets
print("\nX_train first few rows:")
print(X_train.head())


X_train first few rows:
      CaseOrder  City  State  County    Zip       Lat       Lng  Population  \
9254       9255  5853     30    1091  73727  36.50569 -98.03713         203   
1561       1562  4046     29     893  43805  40.39331 -81.97257          81   
1670       1671  4910     30    1307  73460  34.28710 -96.66144        4609   
6087       6088  1983     38    1364  37902  35.96375 -83.92024        2349   
6669       6670  4208     36     247  29479  33.37990 -79.88696        6758   

      Area  TimeZone  ...  TotalCharge  Additional_charges  Item1  Item2  \
9254     1        19  ...  7749.506000          7889.19000      4      3   
1561     0         8  ...  3303.446900         18282.42091      5      6   
1670     1        19  ...  2330.569902          9206.08193      4      4   
6087     0         8  ...  7980.623000          9271.35300      5      6   
6669     1         8  ...  7284.532000          4874.09400      2      2   

      Item3  Item4  Item5  Item6  Item7  It

In [54]:
print("\nX_test first few rows:")
print(X_test.head())


X_test first few rows:
      CaseOrder  City  State  County    Zip       Lat        Lng  Population  \
6252       6253    45      2    1464  30628  34.02972  -83.21264        6874   
4684       4685  5726     45    1356  80807  39.31882 -102.22605        5112   
1731       1732  3304      8     242  66517  39.11263  -96.70688        1745   
4742       4743  3759     28     430  11776  40.91361  -73.04636       24666   
4521       4522  4579     11     643   1550  42.06039  -72.03377       16894   

      Area  TimeZone  ...  TotalCharge  Additional_charges  Item1  Item2  \
6252     0         8  ...  6060.700000         11244.22000      3      2   
4684     1        20  ...  3255.814455         18972.97875      4      3   
1731     0        19  ...  2844.703206         24921.47443      1      2   
4742     1         8  ...  3133.944906         24726.85707      4      3   
4521     2         8  ...  4125.009552          9005.51169      4      4   

      Item3  Item4  Item5  Item6  Item

In [55]:
print("\ny_train first few rows:")
print(y_train.head())


y_train first few rows:
9254    63.159110
1561    15.569822
1670     4.016331
6087    63.776090
6669    57.938180
Name: Initial_days, dtype: float64


In [56]:
print("\ny_test first few rows:")
print(y_test.head())


y_test first few rows:
6252    48.634250
4684    12.062901
1731     3.766619
4742    12.612046
4521    16.738161
Name: Initial_days, dtype: float64


In [57]:
# Build and fit a random forests model
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [58]:
# Make predictions on the test data
y_pred = rf_regressor.predict(X_test)

In [59]:
# Calculate the mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)

In [60]:
# Print the mean squared error (MSE)
print(f'Mean Squared Error (MSE): {mse:.2f}')

Mean Squared Error (MSE): 10.72


In [61]:
# Save the cleaned and analyzed dataset to an Excel file
data.to_excel("cleaned_random_forests_model.xlsx", index=False)

In [62]:
import openpyxl

# Load the Excel file
wb = openpyxl.load_workbook('cleaned_random_forests_model.xlsx')

# Select the worksheet to read
ws = wb.active

# Print the first few rows of the worksheet
for row in ws.iter_rows(min_row=1, max_row=5, values_only=True):
    print(row)

('CaseOrder', 'City', 'State', 'County', 'Zip', 'Lat', 'Lng', 'Population', 'Area', 'TimeZone', 'Job', 'Children', 'Age', 'Income', 'Marital', 'Gender', 'ReAdmis', 'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp', 'Initial_days', 'TotalCharge', 'Additional_charges', 'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8')
(1, 752, 1, 1580, 35621, 34.3496, -86.72508, 2951, 1, 19, 421, 1, 53, 86575.93, 0, 1, 0, 19.1414657, 6, 0, 0, 10.58576971, 3726.70286, 17939.40342, 3, 3, 2, 2, 4, 3, 3, 4)
(2, 2483, 51, 1289, 32446, 30.84513, -85.22907, 11303, 2, 19, 29, 3, 51, 46805.99, 1, 0, 0, 18.9403523, 4, 2, 1, 15.12956221, 4193.190458, 17612.99812, 3, 4, 3, 4, 4, 4, 3, 3)
(3, 4471, 37, 1554, 57110, 43.54321, -96.63772, 17125, 1, 19, 633, 3, 53, 14370.14, 4, 0, 0, 18.05750734, 4, 1, 0, 4.772177208, 2434.234222, 17505.19246, 2, 4, 4, 4, 3, 4, 3, 3)
(4, 3093, 16, 574, 56072, 43.89744, -93.51479, 2162, 1, 19, 87, 0, 78, 39741.49, 1, 1, 0, 16.57685793, 4, 1, 0, 1.714879175

In [10]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
import openpyxl

# Load the Excel file
wb = openpyxl.load_workbook('cleaned_random_forests_model.xlsx')

# Create random forest classifier
rfc = RandomForestClassifier()

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(rfc, param_grid, cv=5)
grid_search.fit(X, y)

# Print results
print('Best hyperparameters:', grid_search.best_params_)
print('Best accuracy:', grid_search.best_score_)


Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Best accuracy: 0.9666356155876418
