In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Load the dataset
file_path = 'LengthOfStay.csv'  # Replace with your file path
df = pd.read_csv(file_path)



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Step 1: Data Preprocessing

# Convert date columns to datetime format
df['vdate'] = pd.to_datetime(df['vdate'], errors='coerce')
df['discharged'] = pd.to_datetime(df['discharged'], errors='coerce')

# Create new features from the date columns (e.g., year, month)
df['vdate_year'] = df['vdate'].dt.year
df['vdate_month'] = df['vdate'].dt.month
df['discharged_year'] = df['discharged'].dt.year
df['discharged_month'] = df['discharged'].dt.month

# Drop the original date columns
df = df.drop(columns=['vdate', 'discharged'])

# Encode categorical variables
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['rcount'] = label_encoder.fit_transform(df['rcount'])
df['facid'] = label_encoder.fit_transform(df['facid'])



KeyError: 'vdate'

In [4]:
# Step 2: Handling Missing Values in Target Variable

# Drop rows where the target variable 'lengthofstay' is NaN
df = df.dropna(subset=['lengthofstay'])

# Step 3: Splitting Data into Features and Target

# Separate features and target variable
X = df.drop(columns=['lengthofstay', 'eid'])  # Features (excluding 'eid' and target)
y = df['lengthofstay']  # Target



In [5]:
# Step 4: Train-Test Split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [6]:
# Step 5: Model Training and Testing

# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.40142654499999997


In [7]:
from sklearn.metrics import r2_score
import numpy as np

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Calculate RMSE (Root Mean Squared Error)
rmse = np.sqrt(mse)

print(f'R-squared: {r2}')
print(f'Root Mean Squared Error: {rmse}')


R-squared: 0.926843051265384
Root Mean Squared Error: 0.6335823111482832


In [3]:
# Step 6: Function to make predictions based on specific columns
def predict_length_of_stay(model, input_data, feature_columns):
    """Predict the length of stay based on the specified input features."""
    if any(col not in input_data.columns for col in feature_columns):
        raise ValueError("One or more specified columns are not in the input data.")

    # Select the relevant features from the input data
    input_features = input_data[feature_columns]

    # Make predictions using the model
    predictions = model.predict(input_features)

    return predictions

# Specify the complete list of feature columns
feature_columns = [
    'rcount', 'gender', 'dialysisrenalendstage', 'asthma', 'irondef',
    'pneum', 'substancedependence', 'psychologicaldisordermajor',
    'depress', 'psychother', 'fibrosisandother', 'malnutrition',
    'hemo', 'hematocrit', 'neutrophils', 'sodium', 'glucose',
    'bloodureanitro', 'creatinine', 'bmi', 'pulse', 'respiration',
    'secondarydiagnosisnonicd9', 'facid', 'vdate_year',
    'vdate_month', 'discharged_year', 'discharged_month'
]

# Create a new DataFrame for a single patient prediction with all necessary columns
input_data = pd.DataFrame({
    'rcount': [1],  # Replace with actual value for one patient
    'gender': [1],  # Replace with actual gender encoded value for one patient
    'dialysisrenalendstage': [0],  # Replace with actual value for one patient
    'asthma': [1],  # Replace with actual value for one patient
    'irondef': [0],  # Replace with actual value for one patient
    'pneum': [0],  # Replace with actual value for one patient
    'substancedependence': [0],  # Replace with actual value for one patient
    'psychologicaldisordermajor': [1],  # Replace with actual value for one patient
    'depress': [0],  # Replace with actual value for one patient
    'psychother': [0],  # Replace with actual value for one patient
    'fibrosisandother': [1],  # Replace with actual value for one patient
    'malnutrition': [0],  # Replace with actual value for one patient
    'hemo': [12],  # Replace with actual value for one patient
    'hematocrit': [40],  # Replace with actual value for one patient
    'neutrophils': [6.5],  # Replace with actual value for one patient
    'sodium': [140],  # Replace with actual value for one patient
    'glucose': [90],  # Replace with actual value for one patient
    'bloodureanitro': [0.5],  # Replace with actual value for one patient
    'creatinine': [1.0],  # Replace with actual value for one patient
    'bmi': [22.5],  # Replace with actual value for one patient
    'pulse': [72],  # Replace with actual value for one patient
    'respiration': [16],  # Replace with actual value for one patient
    'secondarydiagnosisnonicd9': [0],  # Replace with actual value for one patient
    'facid': [5],  # Replace with actual facid encoded value for one patient
    'vdate_year': [2023],
    'vdate_month': [10],
    'discharged_year': [2023],
    'discharged_month': [10],
})

# Make predictions for a single patient
predicted_length_of_stay = predict_length_of_stay(model, input_data, feature_columns)
print("Predicted Length of Stay:", predicted_length_of_stay[0])  # Output the prediction for the single patient


NameError: name 'model' is not defined

In [4]:
import joblib

# Assuming your trained model is stored in a variable called `model`
joblib.dump(model, 'length_of_stay_model.pkl')
joblib.dump(model, 'length_of_stay_model_compressed.pkl', compress=3)



NameError: name 'model' is not defined