In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from xgboost.testing.data import joblib
from sklearn.pipeline import Pipeline
import os
from datetime import datetime

In [18]:
# Import CSV files
hospital_info = pd.read_csv('hospital-info.csv')
not_yet_rated = pd.read_csv('not_yet_rated.csv')

In [19]:
# Remove demographic and unnecessary columns
columns_to_drop = [
    'Provider ID',
    'Hospital Name',
    'Address',
    'City',
    'State',
    'ZIP Code',
    'County Name',
    'Phone Number',
    'Hospital Ownership',
    'Emergency Services',
    'rating_group'
]

In [20]:
# Save ID data for later
id_data = hospital_info[['Hospital Name', 'Address']].copy()

In [21]:
# Keep only necessary columns for machine learning
ml_data = hospital_info.drop(columns=columns_to_drop)

In [22]:
# Convert hospital ratings to binary classification
binary_classification = False

if binary_classification:
    ml_data['Hospital overall rating'] = ml_data['Hospital overall rating'].apply(lambda x: 1 if x > 3 else 0)

In [23]:
# Separate predictors and target variable
X = ml_data.drop(columns=['Hospital overall rating'])
y = ml_data['Hospital overall rating']

In [24]:
# Split the data into train and test sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

Selected Features

In [25]:
svm_features = [
    'Mortality national comparison', 'Safety of care national comparison',
    'Readmission national comparison', 'Patient experience national comparison',
    'Effectiveness of care national comparison', 'Timeliness of care national comparison',
    'Efficient use of medical imaging national comparison', 'MORT_30_AMI_Score',
    'MORT_30_CABG_Score', 'MORT_30_COPD_Score', 'MORT_30_HF_Score',
    'MORT_30_PN_Score', 'MORT_30_STK_Score', 'READM_30_AMI_Score',
    'READM_30_CABG_Score', 'READM_30_COPD_Score', 'READM_30_HF_Score',
    'READM_30_HIP_KNEE_Score', 'READM_30_HOSP_WIDE_Score',
    'READM_30_PN_Score', 'READM_30_STK_Score', 'TIME_OP_21_Score',
    'TIME_OP_5_Score', 'EFF_EDV_Score', 'EFF_IMM_2_Score',
    'EFF_OP_20_Score', 'EFF_OP_22_Score', 'EFF_OP_4_Score',
    'EFF_PC_01_Score', 'EFF_STK_1_Score', 'EFF_STK_10_Score',
    'EFF_STK_2_Score', 'EFF_STK_4_Score', 'EFF_STK_5_Score',
    'EFF_STK_6_Score', 'EFF_VTE_1_Score', 'EFF_VTE_2_Score',
    'EFF_VTE_3_Score', 'EFF_VTE_5_Score', 'EFF_VTE_6_Score',
    'EXP_H_CLEAN_STAR_RATING_Score', 'EXP_H_COMP_1_STAR_RATING_Score',
    'EXP_H_COMP_2_STAR_RATING_Score', 'EXP_H_COMP_3_STAR_RATING_Score',
    'EXP_H_COMP_4_STAR_RATING_Score', 'EXP_H_COMP_5_STAR_RATING_Score',
    'EXP_H_COMP_6_STAR_RATING_Score', 'EXP_H_COMP_7_STAR_RATING_Score',
    'EXP_H_HSP_RATING_STAR_RATING_Score', 'EXP_H_QUIET_STAR_RATING_Score',
    'EXP_H_RECMND_STAR_RATING_Score', 'EXP_H_STAR_RATING_Score',
    'SAFETY_COMP_HIP_KNEE_Score', 'SAFETY_PSI_12_POSTOP_PULMEMB_DVT_Score',
    'SAFETY_PSI_13_POST_SEPSIS_Score', 'SAFETY_PSI_14_POSTOP_DEHIS_Score',
    'SAFETY_PSI_15_ACC_LAC_Score', 'SAFETY_PSI_3_ULCER_Score',
    'SAFETY_PSI_6_IAT_PTX_Score', 'SAFETY_PSI_7_CVCBI_Score',
    'SAFETY_PSI_90_SAFETY_Score', 'SAFETY_HAI_1_SIR_Score',
    'SAFETY_HAI_1a_SIR_Score', 'SAFETY_HAI_2_SIR_Score',
    'SAFETY_HAI_2a_SIR_Score', 'SAFETY_HAI_3_SIR_Score',
    'SAFETY_HAI_4_SIR_Score', 'SAFETY_HAI_5_SIR_Score',
    'SAFETY_HAI_6_SIR_Score', 'MED_OP_10_Score', 'MED_OP_11_Score',
    'MED_OP_13_Score', 'MED_OP_14_Score', 'MED_OP_8_Score',
    'MED_OP_9_Score'
]

In [26]:
X_train_selected = X_train[svm_features]
X_test_selected = X_test[svm_features]

In [52]:
save_new_model = True

if save_new_model:

    params = {
        "C": 0.8895813093302738,
        "l1_ratio": 0.9803599632901754,
        "max_iter": 2000, # 828
        "solver": 'saga',
        "penalty": 'elasticnet',
        "random_state": 42,
        "n_jobs": -1
    }
    # Create pipeline
    hospital_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', LogisticRegression(**params))
    ])
    
    # Train the best model on the full training data
    hospital_pipeline.fit(X_train_selected, y_train)
    
    # Save the trained model
    joblib.dump(hospital_pipeline, 'ml_model.pkl')
    
    accuracy = hospital_pipeline.score(X_test_selected, y_test)
    
    print(f"Model accuracy: {round(100*accuracy, 2)}%")

Model accuracy: 0.91%


In [28]:
# Ensure that 'not_yet_rated' contains all the selected features
not_yet_rated_selected = not_yet_rated[svm_features]

# Load the ML model
hospital_pipeline = joblib.load('ml_model.pkl')

# Make predictions
predictions = hospital_pipeline.predict(not_yet_rated_selected)

# Add predictions to the DataFrame
not_yet_rated['Predicted Hospital overall rating'] = predictions

# Display the predictions
not_yet_rated[['Provider ID', 'Predicted Hospital overall rating']]

Unnamed: 0,Provider ID,Predicted Hospital overall rating
0,520139,4
1,520189,4
2,370029,3
3,370032,3
4,370036,3
5,370037,3


In [39]:
# Function to generate recommendations for a hospital
def generate_recommendations(hospital_id, not_yet_rated, feature_importances, top_n=5):
    # Get the top N important features
    top_features = feature_importances.head(top_n)['Feature'].values
    
    # Get the data for the specific hospital
    hospital_data = not_yet_rated[not_yet_rated['Provider ID'] == hospital_id]
    
    # Compare with hospitals having higher ratings
    higher_rated_hospitals = not_yet_rated[not_yet_rated['Predicted Hospital overall rating'] >= 4]
    
    recommendations = {}
    
    for feature in top_features:
        # Get the mean value of the feature for higher-rated hospitals
        mean_value = higher_rated_hospitals[feature].mean()
        
        # Get the hospital's current value for the feature
        current_value = hospital_data[feature].values[0]
        
        # Generate recommendation if the current value is lower than the mean value
        if current_value < mean_value:
            recommendations[feature] = {
                'current_value': current_value,
                'recommended_value': mean_value,
                'difference': mean_value - current_value
            }
    
    return recommendations

In [40]:
# Apply recommendations to the dataset
def apply_recommendations(hospital_id, not_yet_rated, recommendations):
    modified_data = not_yet_rated.copy()
    for feature, details in recommendations.items():
        modified_data.loc[modified_data['Provider ID'] == hospital_id, feature] = details['recommended_value']
    return modified_data

In [48]:
# Load or create the version history DataFrame
def load_version_history(file_path='data_versions.csv'):
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    else:
        return pd.DataFrame(columns=['Version', 'Reference_ID', 'Date', 'Configurations', 'Data'])

In [49]:
# Save version history to CSV
def save_version_history(version_history, file_path='data_versions.csv'):
    version_history.to_csv(file_path, index=False)

In [50]:
# Main function to handle recommendations and saving the new dataset
def process_hospital_data(hospital_id, not_yet_rated, feature_importances, pipeline, version_history):
    recommendations = generate_recommendations(hospital_id, not_yet_rated, feature_importances)
    
    # Print recommendations
    print(f"Recommendations for Hospital ID {hospital_id}:")
    for feature, details in recommendations.items():
        print(f"- {feature}: Increase from {details['current_value']} to {details['recommended_value']} (Difference: {details['difference']:.2f})")
    
    # Apply recommendations and create a new version of the dataset
    modified_data = apply_recommendations(hospital_id, not_yet_rated, recommendations)
    
    # Generate a new version number
    new_version = version_history['Version'].max() + 1 if not version_history.empty else 1
    
    # Get current date and time
    current_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Save the new version to the version history DataFrame
    new_entry = {
        'Version': new_version,
        'Reference_ID': hospital_id,
        'Date': current_date,
        'Configurations': params,
        'Data': modified_data.to_dict()
    }
    version_history.loc[len(version_history)] = new_entry
    
    # Save the updated version history to CSV
    save_version_history(version_history)
    
    return modified_data, new_version

In [44]:
hospital_id = 370037

In [51]:
# Export feature_importances_ / coef_ attribute
importances = np.abs(hospital_pipeline.named_steps['regressor'].coef_)[0]
feature_names = svm_features

# Create a DataFrame for feature importances
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Load existing version history or create a new one
version_history = load_version_history()

# Example: Process hospital data and save new version
modified_data, new_version = process_hospital_data(hospital_id, not_yet_rated, feature_importances, hospital_pipeline, version_history)

# Calculate the new prediction for the modified hospital
new_prediction = hospital_pipeline.predict(modified_data[modified_data['Provider ID'] == hospital_id][svm_features])
print(f"New predicted rating for Hospital ID {hospital_id}: {new_prediction[0]}")

Recommendations for Hospital ID 370037:
- MORT_30_PN_Score: Increase from -15.4 to -15.100000000000001 (Difference: 0.30)
- MORT_30_HF_Score: Increase from -12.8 to -12.7 (Difference: 0.10)
New predicted rating for Hospital ID 370037: 3


1. Mittelwert und Minimum jedes Features berechnen
2. Vergleich des Hospitals mit diesen Daten: Relative Abweichungen vom Mittelwert anzeigen und ranken, Unterschreitungen des Minimalwertes ausweisen.
3. Berechnung des Hospitals mit Empfehlungswert durchführen und neue Bewertung prüfen.
4. Ähnlichkeit von Datensätzen definieren, Ähnlichkeiten bestimmen, ähnlichsten Datensatz herausfinden und vergleichen, Stellschrauben definieren

Davor: Analyse der Wechselwirkungen (Interaktionen) zwischen verschiedenen Merkmalen