## Stroke Prediction

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier  

# Read dataset
df_train = pd.read_csv(r'..\Datasets\healthcare-dataset-stroke-data.csv')

# Removing irrelevant columns
df_train.drop(columns = 'id', axis = 1, inplace = True)

# Handling null values
mean_bmi = df_train.groupby(['gender', 'age']).mean()['bmi']
mean_bmi = np.around(mean_bmi, decimals = 3)

def fill_bmi(df_train, mean_bmi):
    if math.isnan(df_train['bmi']):
        return mean_bmi[df_train['gender']][df_train['age']]
    else:
        return df_train['bmi']
    
df_train['bmi'] = df_train.apply(fill_bmi, axis = 1, args = (mean_bmi, ))
df_train['bmi'].iloc[2030] = mean_bmi['Female'][0.48]
df_train.drop(index = 3116, inplace = True)

# One hot encoding
df_objects = df_train.select_dtypes(include = 'object')
df_train = pd.get_dummies(df_train, columns= df_objects.columns, dtype = 'int')

# Model 
X = df_train.drop('stroke', axis = 1)
y = df_train['stroke']

# Oversampling 
smk = SMOTETomek(random_state = 2, sampling_strategy = 'minority')
x_res, y_res = smk.fit_resample(X, y)

# train test split
X_train, X_test, y_train, y_test = train_test_split(x_res, y_res, test_size = 0.3, random_state = 17)

#Logistic Regression

logistic = LogisticRegression(max_iter=10000)
logistic.fit(X_train, y_train)
pred = logistic.predict(X_test)

#Calculating accuracy
score = np.around(accuracy_score(y_test, pred), decimals = 3) *100


# Tuple of model and score
model_score = (logistic, score)

# Dumping model and score
pickle.dump(model_score, open('Stroke_Prediction.sav', 'wb'))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


{'Accuracy': 0.969, 'Precision': 0.971, 'Recall': 0.969, 'F1- score': 0.969, 'ROC AUC Score': 0.969, 'RMSE': 0.175}


## Heart Disease

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier  

# Reading dataset
df_train = pd.read_csv(r'..\Datasets\heart.csv')

# Rename columns for better readability
df_train.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_pressure',
                    'rest_ecg', 'max_heart_rate_achieved', 'exercise_induced_angina', 'st_depression', 'slope',
                    'num_major_vessels', 'thal', 'target']

# One hot encoding
df_train = pd.get_dummies(data = df_train, columns = ['sex', 'chest_pain_type', 'fasting_blood_pressure', 'rest_ecg', 
                                           'exercise_induced_angina', 'slope', 'num_major_vessels', 'thal'], dtype= 'int')

# Training model
X = df_train.drop('target', axis = 1)
y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 5)


# Logistic Regression
logistic = LogisticRegression(max_iter=10000)
logistic.fit(X_train, y_train)
pred = logistic.predict(X_test)

#Calculating accuracy
score = np.around(accuracy_score(y_test, pred), decimals = 3) *100


# Tuple of model and score
model_score = (logistic, score)

# Dumping model and score
pickle.dump(model_score, open('Heart_Disease.sav', 'wb'))

## Liver Disease

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
 

# Reading Dataset
df_train = pd.read_csv(r'..\Datasets\indian_liver_patient.csv')

# There are 4 null values in one column of the dataset. 
# Adding mean value to the null values
mean_ratio = df_train['Albumin_and_Globulin_Ratio'].mean()

def fill_ratio(df_train, mean_ratio):
    if math.isnan(df_train['Albumin_and_Globulin_Ratio']):
        return mean_ratio
    else:
        return df_train['Albumin_and_Globulin_Ratio']
    
df_train['Albumin_and_Globulin_Ratio'] = df_train.apply(fill_ratio, axis = 1, args = (mean_ratio, ))

# Renaming  column for better readability
df_train.rename(columns = {'Dataset': 'Response'}, inplace = True)

# converting categorical values to numerical values 
df_train = pd.get_dummies(df_train, columns = ['Gender'], dtype= 'int')

# Model
X = df_train.drop('Response', axis = 1)
y = df_train['Response']

# Oversampling the dataset
smk = SMOTETomek(random_state = 1, sampling_strategy = 'minority')
x_res, y_res = smk.fit_resample(X, y)

# Scaling 
standardScaler = StandardScaler()
scale_columns = ['Age', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase', 
                 'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio']
df_train[scale_columns] = standardScaler.fit_transform(df_train[scale_columns])


X_train, X_test, y_train, y_test = train_test_split(x_res, y_res, test_size = 0.3, random_state = 50)

# Decision Tree
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

#Calculating accuracy
score = np.around(accuracy_score(y_test, pred), decimals = 3) *100


# Tuple of model and score
model_score = (dt_clf, score)

# Dumping model and score
pickle.dump(model_score, open('Liver_Disease.sav', 'wb'))


In [None]:
# fig, axes = plt.subplots(2,3, figsize=  (15,10))
# count = 0
# for i in range(0, 2):
#     for j in range(0, 3):
#         dc_scores.iloc[i].plot(ax = axes[i][j], kind = 'bar')
#         axes[i][j].set_title(dc_scores.index[count])
#         axes[i][j].tick_params(axis='x', labelrotation = 45) 
#         count += 1
# plt.tight_layout()
# plt.show()