<a href="https://colab.research.google.com/github/vidakpop/Hospital-visit-recommender/blob/main/Health_visit_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the dataset from the provided URL
url = 'https://raw.githubusercontent.com/vidakpop/Health_Care-Kenya/main/Healthcare%20Dataset.csv'
data = pd.read_csv(url)



In [6]:
# Define the columns to drop (such as picture-related columns)
columns_to_drop = ['Your Picture', 'Your Picture_URL', '_id', '_uuid', '_submission_time', '_status',
                   '_submitted_by', '__version__', '_tags', '_index']

data = data.drop(columns=columns_to_drop)

# Define the target variable (predict whether there was a hospital visit)
data['Hospital_Visit'] = data['When was the last time you visited a hospital for medical treatment? (In Months)'].notna()
data = data.drop(columns=['When was the last time you visited a hospital for medical treatment? (In Months)'])

# Convert 'Age' to numeric (you may need to customize this based on your data)
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

# Define categorical and numerical columns
categorical_columns = ['Gender', 'Marital Status', 'Employment Status', 'Monthly Household Income',
                       'Have you ever had health insurance?', 'If yes, which insurance cover?',
                       'Did you have health insurance during your last hospital visit?',
                       'Have you ever had a routine check-up with a doctor or healthcare provider?',
                       'If you answered yes to the previous question, what time period (in years) do you stay before having your routine check-up?',
                       'Have you ever had a cancer screening (e.g. mammogram, colonoscopy, etc.)?',
                       'If you answered yes to the previous question, what time period (in years) do you stay before having your Cancer screening?']

numerical_columns = ['_Location_latitude', '_Location_longitude', '_Location_altitude',
                     '_Location_precision', 'Age']

# Split data into features and target variable
X = data.drop(columns=['Hospital_Visit'])
y = data['Hospital_Visit']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [7]:
# Create preprocessing pipelines for categorical and numerical data
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_pipeline, categorical_columns),
        ('num', numerical_pipeline, numerical_columns)
    ])




In [8]:
# Create a Random Forest Classifier model
model = RandomForestClassifier(random_state=42)

# Create a full pipeline with data preprocessing and model
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit the pipeline (including data preprocessing) to the training data
full_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = full_pipeline.predict(X_test)




In [9]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.976461038961039
Classification Report:
              precision    recall  f1-score   support

       False       0.64      0.27      0.38        33
        True       0.98      1.00      0.99      1199

    accuracy                           0.98      1232
   macro avg       0.81      0.63      0.69      1232
weighted avg       0.97      0.98      0.97      1232



In [12]:
# Example: To predict a single instance
new_data = pd.DataFrame({
    'Gender': ['Male'],
    'Marital Status': ['Married'],
    'Employment Status': ['Employed'],
    'Monthly Household Income': ['5000-10000 Ksh'],
    'Have you ever had health insurance?': ['Yes'],
    'If yes, which insurance cover?': ['NHIF'],
    'Did you have health insurance during your last hospital visit?': ['Yes'],
    'Have you ever had a routine check-up with a doctor or healthcare provider?': ['Yes'],
    'If you answered yes to the previous question, what time period (in years) do you stay before having your routine check-up?': ['1'],
    'Have you ever had a cancer screening (e.g. mammogram, colonoscopy, etc.)?': ['No'],
    'If you answered yes to the previous question, what time period (in years) do you stay before having your Cancer screening?': [''],
    '_Location_latitude': [1.23],
    '_Location_longitude': [36.78],
    '_Location_altitude': [1500.0],
    '_Location_precision': [10.0],
    'Age': [30]
})

# Preprocess the new data and make a prediction
new_data_preprocessed = full_pipeline.named_steps['preprocessor'].transform(new_data)
prediction = full_pipeline.named_steps['model'].predict(new_data_preprocessed)

print(f'Predicted Hospital Visit: {prediction[0]}')


Predicted Hospital Visit: True
