#Importing necessary library packages

In [23]:
#!pip install spacy

In [25]:
import nltk
import spacy
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
import plotly.express as px
from tabulate import tabulate
from datetime import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from nltk.tokenize import word_tokenize
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score

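# Recorded output of the import cell above:
#   TypeError: ForwardRef._evaluate() missing 1 required keyword-only argument: 'recursive_guard'
# NOTE(review): this looks like a library/Python version incompatibility raised
# while importing one of the packages above - confirm which one and pin versions.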

#Loading the dataset

In [None]:
# Read the healthcare CSV into a DataFrame and preview its first rows
# (the trailing expression is what the notebook displays).
DATASET_PATH = '/content/Healthcare.csv'
Healthcare_data = pd.read_csv(DATASET_PATH)
Healthcare_data.head()

#EDA

#Display the data information

In [None]:
# Print the frame's summary: column names, dtypes and non-null counts.
Healthcare_data.info()

#Checking for missing values

In [None]:
# Tally the null entries of every column and print the per-column totals.
missing_values = Healthcare_data.isna().sum()
print(missing_values)

#Summary Statistics

In [None]:
# Descriptive statistics of the frame; as the cell's trailing expression the
# resulting table is displayed by the notebook.
Healthcare_data.describe()

#Visualization

# Histogram for age distribution

In [None]:
# Histogram of the 'age' column: 20 bins with a kernel-density overlay.
age_fig, age_ax = plt.subplots(figsize=(8, 5))
sns.histplot(Healthcare_data['age'], bins=20, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution of Patients')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')
plt.show()


#Top 5 Diseases by Distribution

In [None]:
# Pie chart of the five most frequent values of the 'disease_name' column.
top_5_health_issues = Healthcare_data['disease_name'].value_counts().nlargest(5)

pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
pie_ax.pie(
    top_5_health_issues,
    labels=top_5_health_issues.index,
    autopct='%1.1f%%',
    startangle=140,
)
pie_ax.set_title('Top 5 Diseases by Distribution')
plt.show()


#Top 10 Specializations of Doctors

In [None]:
# Horizontal bar chart of the ten most frequent 'specialization' values, with
# every bar annotated with its count.
top_10_specializations = Healthcare_data['specialization'].value_counts().nlargest(10)

plt.figure(figsize=(10, 8))
sns.countplot(y='specialization', data=Healthcare_data, order=top_10_specializations.index, palette='rainbow')

# The bars follow the order of `top_10_specializations`, so position i on the
# categorical axis belongs to the i-th count.  Anchor each label by its left
# edge and vertical centre so it sits just past the end of its bar instead of
# straddling the bar end and floating off the bar's centre line.
for i, count in enumerate(top_10_specializations):
    plt.text(count + 0.1, i, str(count), ha='left', va='center', fontsize=10, color='black')

plt.title('Top 10 Specializations of Doctors')
plt.xlabel('Count')
plt.ylabel('Specialization')
plt.show()

#Average Years of Experience by Specialization (Top 20)

In [None]:
# Mean 'years_of_experience' per specialization, keeping the 20 largest means.
top_20_avg_years_experience = (
    Healthcare_data
    .groupby('specialization')['years_of_experience']
    .mean()
    .nlargest(20)
)
mean_years = top_20_avg_years_experience.values
specialization_names = top_20_avg_years_experience.index

exp_fig, exp_ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=mean_years, y=specialization_names, marker='o', color='b', ax=exp_ax)
exp_ax.set_title('Average Years of Experience by Specialization (Top 20)')
exp_ax.set_xlabel('Average Years of Experience')
exp_ax.set_ylabel('Specialization')
exp_ax.grid(True)

# Write each mean, rounded to one decimal, at its marker.
for mean_value, name in zip(mean_years, specialization_names):
    exp_ax.text(mean_value, name, f'{mean_value:.1f}', ha='center', va='bottom', fontsize=10, color='black')

plt.show()

#Word Cloud from Healthcare

In [None]:
# Join every non-null note into one space-separated string and render it as a
# word cloud image.
notes_text = ' '.join(Healthcare_data['notes'].dropna())

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(notes_text)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.set_title('Word Cloud from Healthcare')
cloud_ax.axis('off')
plt.show()


#Correlation Matrix

In [None]:
# Pairwise correlation of the 'age' and 'years_of_experience' columns, drawn
# as an annotated heatmap.
numerical_data = Healthcare_data[['age', 'years_of_experience']]
correlation_matrix = numerical_data.corr()

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    annot_kws={"size": 12},
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
plt.show()


#Age vs. Doctor Experience

In [None]:
# Interactive scatter of 'age' against 'years_of_experience', coloured by
# 'gender'; hovering a point also shows the columns listed below.
hover_columns = ['patient_name', 'specialization']
fig = px.scatter(
    Healthcare_data,
    x='age',
    y='years_of_experience',
    color='gender',
    hover_data=hover_columns,
    title='Age vs. Doctor Experience',
)
fig.show()


#Appointments Over Time

In [None]:
# Count the rows per distinct appointment date and plot the counts as a line.
# The column is parsed to datetimes first: counting the raw strings would sort
# them lexicographically, which is chronological only for ISO-formatted text.
# The parsed values live in a separate Series, so Healthcare_data is unchanged.
appointment_dates = pd.to_datetime(Healthcare_data['appointment_date'])
appointments_per_date = appointment_dates.value_counts().sort_index().reset_index()
# Assign the column names positionally (date first, then its count) rather than
# relying on whatever names reset_index() produced.
appointments_per_date.columns = ['Appointment_Date', 'Number_of_Appointments']

fig = px.line(appointments_per_date, x='Appointment_Date', y='Number_of_Appointments',
              title='Appointments Over Time')
fig.update_xaxes(title='Appointment Date')
fig.update_yaxes(title='Number of Appointments')
fig.show()


In [None]:
# Split every free-text note into word tokens with NLTK.

# Make sure the tokenizer data is installed.  'punkt' is what this notebook
# originally fetched; 'punkt_tab' is requested by newer NLTK releases
# (NOTE(review): confirm against the installed NLTK version).
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)

# A missing note is NaN (a float) and would make word_tokenize raise; map it
# to '' so it tokenizes to an empty list and the result stays aligned with the
# rows of Healthcare_data.
notes = Healthcare_data['notes'].fillna('').astype(str).tolist()
tokenized_notes = [word_tokenize(note) for note in notes]


In [None]:
# Run spaCy's small English pipeline over every note and print each named
# entity together with its label.
nlp = spacy.load('en_core_web_sm')

# Missing notes are NaN (a float), which spaCy cannot process; replace them
# with '' so every row still yields a (possibly empty) Doc.
notes = Healthcare_data['notes'].fillna('').astype(str).tolist()

# nlp.pipe processes the texts as a stream in batches, which is the intended
# way to handle many documents instead of calling nlp() once per text.
processed_notes = list(nlp.pipe(notes))

for doc in processed_notes:
    for ent in doc.ents:
        print(ent.text, ent.label_)


In [None]:
# Push the first note through a BERT sequence-classification model and print
# the resulting logits.
# NOTE(review): the checkpoint is the plain 'bert-base-uncased' base model, so
# the classification head is presumably untrained and the logits are only a
# smoke test - confirm before interpreting them.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout is disabled

# Positional access: take the first row regardless of the index labels.
text = Healthcare_data['notes'].iloc[0]
labels = [1]

# Truncate to the tokenizer's maximum length so a long note cannot exceed the
# number of positions the model supports.
inputs = tokenizer(text, return_tensors='pt', truncation=True)

# No training happens here, so skip gradient tracking.
with torch.no_grad():
    outputs = model(**inputs, labels=torch.tensor(labels))
print(outputs.logits)


#Machine Learning Model of Gradient Boosting Classifier

In [None]:
# Train and evaluate a Gradient Boosting classifier that predicts the
# integer-encoded 'gender' column.

# Encode the target as integer codes.  Skip the step when the column is
# already numeric: factorizing existing codes again can renumber them (for
# example when the first row is the -1 code used for missing values).
if not pd.api.types.is_numeric_dtype(Healthcare_data['gender']):
    Healthcare_data['gender'] = pd.factorize(Healthcare_data['gender'])[0]

numeric_features = ['age', 'years_of_experience']
categorical_features = ['specialization', 'test_result']
# 'gender' is the target, so it is deliberately not part of the predictors.
features = numeric_features + categorical_features

X = Healthcare_data[features]
y = Healthcare_data['gender']

# ColumnTransformer drops every column that is not assigned to a transformer,
# so the numeric columns need their own branch to reach the classifier.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fixed seeds make the split and the model - and therefore the reported
# metrics - reproducible between runs.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Probability of the class in column 1, used for the ROC curve
# (assumes a binary target - TODO confirm).
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy of Gradient Boosting Classifier: {accuracy:.2f}')

print('Classification Report of Gradient Boosting Classifier:')
print(classification_report(y_test, y_pred))


#Confusion Matrix of Gradient Boosting Classifier

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true against predicted labels of the held-out split and draw
# the table as an annotated heatmap, ticks labelled with the pipeline's classes.
cm = confusion_matrix(y_test, y_pred)
class_labels = pipeline.classes_

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted labels')
cm_ax.set_ylabel('True labels')
cm_ax.set_title('Confusion Matrix of Gradient Boosting Classifier')
plt.show()

#ROC Curve of Gradient Boosting Classifier

In [None]:
# ROC curve from the fpr/tpr arrays computed in the model cell, with the
# diagonal drawn as the chance-level reference.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='purple', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve of Gradient Boosting Classifier')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.show()


#Machine Learning Model of Logistic Regression

In [None]:
# Train and evaluate a Logistic Regression classifier that predicts the
# integer-encoded 'gender' column.

# Encode the target as integer codes.  Skip the step when the column is
# already numeric: factorizing existing codes again can renumber them (for
# example when the first row is the -1 code used for missing values).
if not pd.api.types.is_numeric_dtype(Healthcare_data['gender']):
    Healthcare_data['gender'] = pd.factorize(Healthcare_data['gender'])[0]

numeric_features = ['age', 'years_of_experience']
categorical_features = ['specialization', 'test_result']
# 'gender' is the target, so it is deliberately not part of the predictors.
features = numeric_features + categorical_features

X = Healthcare_data[features]
y = Healthcare_data['gender']

# ColumnTransformer drops every column that is not assigned to a transformer,
# so the numeric columns need their own branch to reach the classifier.  They
# are standardized because the linear model is sensitive to feature scale.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

# A fixed seed makes the split - and therefore the reported metrics -
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Probability of the class in column 1, used for the ROC curve
# (assumes a binary target - TODO confirm).
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of Logistic Regression: {accuracy:.2f}')

print('Classification Report of Logistic Regression:')
print(classification_report(y_test, y_pred))


#Confusion Matrix of Logistic Regression

In [None]:
# Cross-tabulate true against predicted labels of the held-out split and draw
# the table as an annotated heatmap, ticks labelled with the pipeline's classes.
cm = confusion_matrix(y_test, y_pred)
class_labels = pipeline.classes_

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted labels')
cm_ax.set_ylabel('True labels')
cm_ax.set_title('Confusion Matrix of Logistic Regression')
plt.show()

#ROC Curve of Logistic Regression

In [None]:
# ROC curve from the fpr/tpr arrays computed in the model cell, with the
# diagonal drawn as the chance-level reference.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='orange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve of  Logistic Regression')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.show()

#Machine Learning Model of Random Forest Classifier

In [None]:
# Train and evaluate a Random Forest classifier that predicts the
# integer-encoded 'gender' column.

# Encode the target as integer codes.  Skip the step when the column is
# already numeric: factorizing existing codes again can renumber them (for
# example when the first row is the -1 code used for missing values).
if not pd.api.types.is_numeric_dtype(Healthcare_data['gender']):
    Healthcare_data['gender'] = pd.factorize(Healthcare_data['gender'])[0]

numeric_features = ['age', 'years_of_experience']
categorical_features = ['specialization', 'test_result']
# 'gender' is the target, so it is deliberately not part of the predictors.
features = numeric_features + categorical_features

X = Healthcare_data[features]
y = Healthcare_data['gender']

# ColumnTransformer drops every column that is not assigned to a transformer,
# so the numeric columns need their own branch to reach the classifier.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fixed seeds make the split and the model - and therefore the reported
# metrics - reproducible between runs.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Probability of the class in column 1, used for the ROC curve
# (assumes a binary target - TODO confirm).
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of Random Forest Classifier: {accuracy:.2f}')

print('Classification Report of Random Forest Classifier:')
print(classification_report(y_test, y_pred))


#Confusion Matrix of Random Forest Classifier

In [None]:
# Cross-tabulate true against predicted labels of the held-out split and draw
# the table as an annotated heatmap, ticks labelled with the pipeline's classes.
cm = confusion_matrix(y_test, y_pred)
class_labels = pipeline.classes_

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted labels')
cm_ax.set_ylabel('True labels')
cm_ax.set_title('Confusion Matrix of Random Forest Classifier')
plt.show()

#ROC Curve of Random Forest Classifier

In [None]:
# ROC curve from the fpr/tpr arrays computed in the model cell, with the
# diagonal drawn as the chance-level reference.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve of Random Forest Classifier')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.show()

#Machine Learning Model of Neural Network

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Train and evaluate a small feed-forward network that predicts the
# label-encoded 'gender' column.

# Load the data afresh so the categorical columns hold their original values
# again (earlier cells overwrite 'gender' with integer codes).
Healthcare_data = pd.read_csv('/content/Healthcare.csv')

# Encode the categorical columns as integers; the fitted encoders are kept so
# the codes can be mapped back to the original values later.
label_encoders = {}
for column in ['gender', 'specialization', 'test_result']:
    le = LabelEncoder()
    Healthcare_data[column] = le.fit_transform(Healthcare_data[column])
    label_encoders[column] = le

# Features and target.  'gender' is the target, so it must not also be an
# input: with it among the features the network simply reads off the answer.
features = ['age', 'specialization', 'years_of_experience', 'test_result']
X = Healthcare_data[features]
y = Healthcare_data['gender']

# Hold out 20% for the final evaluation; the fixed seed makes the split and
# the reported metrics reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize with statistics estimated on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden layers with L2 weight penalties and dropout, and a single sigmoid
# output unit (assumes the encoded target is binary 0/1 - TODO confirm).
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model.
# - Validation data is carved out of the training rows (validation_split), so
#   the test set plays no part in choosing when to stop.
# - The epoch budget is larger than the patience; with fewer epochs than the
#   patience the early-stopping callback could never fire.
history = model.fit(
    X_train_scaled,
    y_train.to_numpy(),
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Predicted probabilities, thresholded at 0.5 to obtain class labels.
y_pred_proba = model.predict(X_test_scaled).flatten()
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluation on the untouched test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of Neural Network: {accuracy:.2f}')
print('Classification Report of Neural Network:')
print(classification_report(y_test, y_pred))

#Confusion Matrix of Neural Network

In [None]:
# Confusion matrix of the neural network's test-set predictions.
# Imported here so the cell does not depend on an earlier cell's import.
from sklearn.metrics import confusion_matrix

# Take the class order and names from the encoder fitted on 'gender' instead
# of hard-coding them: code i corresponds to gender_encoder.classes_[i], so the
# tick labels always match the rows and columns of the matrix.
gender_encoder = label_encoders['gender']
gender_codes = list(range(len(gender_encoder.classes_)))
gender_names = [str(name) for name in gender_encoder.classes_]

conf_matrix = confusion_matrix(y_test, y_pred, labels=gender_codes)
print('Confusion Matrix of Neural Network:')
print(conf_matrix)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=gender_names, yticklabels=gender_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix of Neural Network')
plt.show()

#ROC Curve of Neural Network

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the network's predicted probabilities on the test split; the
# area under it is integrated from the curve's own points.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

nn_roc_fig, nn_roc_ax = plt.subplots(figsize=(8, 6))
nn_roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
nn_roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
nn_roc_ax.set_xlabel('False Positive Rate')
nn_roc_ax.set_ylabel('True Positive Rate')
nn_roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
nn_roc_ax.legend(loc='lower right')
plt.show()

#ARIMA Model

In [None]:
# Forecast monthly appointment counts with ARIMA(1, 1, 1), holding out the
# last six months to score the forecast.

# Parse the dates in place so the frame can be resampled on that column.
Healthcare_data['appointment_date'] = pd.to_datetime(Healthcare_data['appointment_date'])

# Number of rows falling into each monthly bin.
monthly_appointments = Healthcare_data.resample('M', on='appointment_date').size()

ts_fig, ts_ax = plt.subplots(figsize=(12, 6))
ts_ax.plot(monthly_appointments, marker='o', linestyle='-', color='b', label='Monthly Appointments')
ts_ax.set_title('Monthly Appointments')
ts_ax.set_xlabel('Month')
ts_ax.set_ylabel('Number of Appointments')
ts_ax.grid(True)
ts_ax.legend()
plt.show()

# Everything but the final six months trains the model; those six are the
# hold-out the forecast is compared against.
train_data, test_data = monthly_appointments.iloc[:-6], monthly_appointments.iloc[-6:]

model = ARIMA(train_data, order=(1, 1, 1))
model_fit = model.fit()

# Forecast exactly as many steps as there are held-out months.
forecast = model_fit.forecast(steps=len(test_data))

print("Forecasted Appointments:")
print(forecast)

# Root-mean-squared error between the held-out counts and the forecast.
mse = mean_squared_error(test_data, forecast)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")
