In [1]:
# Import all required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Import the dataset

data = pd.read_excel('Health_Care_Data_train.xlsx')

In [None]:
# Get the number of rows and number of columns of the dataset

data.shape

In [None]:
# obtain some statistics about the given data

data.describe()

In [None]:
# check the number of NA values present in each column of the dataset

data.isna().sum()

In [None]:
# Display maximum of 50 columns

pd.set_option('display.max_columns', 50)

In [None]:
# Replace the NA values in the dataset with the mean and then round the values

# data = data.fillna(data.mean())
# data = np.round(data)

In [3]:
# Replace the columns Specialty with a one-hot encoded version
# Group column is dropped

dummy = pd.get_dummies(data['Specialty'])
df = pd.concat([dummy, data], axis = 1)
df.drop(['Specialty', 'Group'], axis = 1, inplace=True)
# df.head()

In [4]:
# Replace nan values with the median value

from sklearn.impute import SimpleImputer
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
df.iloc[:,2:] = imp_median.fit_transform(df.iloc[:,2:])
# df

In [None]:
np.unique(df['q3'])

In [None]:
df.describe()

In [None]:
df.isna().sum()

In [None]:
# Define a function to plot the correlation matrix

def plotCorrelationMatrix(df, graphWidth):
    filename = 'People Analytics'
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [None]:
# Calling the above function with the required parameters

plotCorrelationMatrix(df, 15)

In [5]:
# Include all columns excepts 'q58' as the input

x=df.iloc[:, :-1]
x

Unnamed: 0,Non_MD,Other,Primary_Care,Specialist,Surgical,physician,verysat,q1,q2,q3,...,q46,q47,q48,q50,q51,q53,q54,q55,q56,q57
0,0,1,0.0,0.0,0.0,1.0,0.0,5.0,2.0,2.0,...,5.0,2.0,2.0,3.0,3.0,5.0,5.0,5.0,3.0,3.0
1,0,1,0.0,0.0,0.0,1.0,0.0,5.0,4.0,4.0,...,4.0,4.0,4.0,3.0,3.0,5.0,4.0,1.0,3.0,3.0
2,0,1,0.0,0.0,0.0,1.0,0.0,5.0,5.0,4.0,...,2.0,4.0,4.0,2.0,3.0,2.0,3.0,3.0,5.0,3.0
3,0,1,0.0,0.0,0.0,1.0,1.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,1.0,1.0,5.0,5.0,5.0
4,0,0,1.0,0.0,0.0,1.0,0.0,3.0,1.0,1.0,...,4.0,1.0,5.0,3.0,3.0,5.0,5.0,1.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0,0,0.0,1.0,0.0,1.0,1.0,4.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,2.0,1.0,5.0,5.0,5.0
1096,0,0,0.0,0.0,1.0,1.0,0.0,5.0,4.0,2.0,...,3.0,3.0,4.0,4.0,4.0,3.0,2.0,3.0,3.0,4.0
1097,0,0,0.0,0.0,1.0,1.0,1.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,1.0,1.0,5.0,5.0,5.0
1098,0,0,0.0,0.0,1.0,1.0,0.0,5.0,4.0,4.0,...,4.0,5.0,5.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0


In [6]:
# Set column 'q58' as the output

y = df.q58
y

0       3.0
1       2.0
2       2.0
3       1.0
4       3.0
       ... 
1095    2.0
1096    2.0
1097    1.0
1098    2.0
1099    3.0
Name: q58, Length: 1100, dtype: float64

In [7]:
# split into training and test dataset in an 80:20 ratio

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [None]:
x_train.shape

In [None]:
y_train.shape

## K Nearest Neighbors

In [None]:
# Define the model and print the accuracies of training and testing data in case of K Nearest Neighbors

# KNeighborsClassifier(algorithm = 'auto', leaf_size = 30, metric = 'euclidean', metric_params = None, n_jobs = 1, n_neighbors = 13, p = 2, weights = 'uniform')

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(13).fit(x_train, y_train)
print(accuracy_score(y_train, classifier.predict(x_train)))
print(accuracy_score(y_test, classifier.predict(x_test)))

In [None]:
# Predicted value of test set results

y_pred = classifier.predict(x_test)

In [None]:
# Making the Confusion Matrix

cm1 = confusion_matrix(y_test, y_pred)
cm1

## Multiclass Classification

In [None]:
# One vs Rest Classifier
# Accuracy without feature scaling

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
clf = OneVsRestClassifier(SVC(max_iter = 10000)).fit(x_train, y_train)
print(accuracy_score(y_train, clf.predict(x_train)))
print(accuracy_score(y_test, clf.predict(x_test)))

In [None]:
# Predicted value of test set results

y_pred2 = clf.predict(x_test)

In [None]:
# Making the Confusion Matrix

cm2 = confusion_matrix(y_test, y_pred2)
cm2

In [None]:
# One vs One Classifier with 
# Accuracy without feature scaling

from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
clf1 = OneVsOneClassifier(LinearSVC(random_state=100, max_iter = 100000)).fit(x_train, y_train)
print(accuracy_score(y_train, clf1.predict(x_train)))
print(accuracy_score(y_test, clf1.predict(x_test)))

In [None]:
# Predicted value of test set results

y_pred3 = clf1.predict(x_test)

In [None]:
# Making the Confusion Matrix

cm3 = confusion_matrix(y_test, y_pred3)
cm3

## Feature Scaling

In [None]:
# Feature Scaling

sc_x = StandardScaler()
x_train1 = sc_x.fit_transform(x_train)
x_test1 = sc_x.transform(x_test)
np.round(x_train1)
np.round(x_test1)

In [None]:
classifier = KNeighborsClassifier(14).fit(x_train1, y_train)
print(accuracy_score(y_train, classifier.predict(x_train1)))
print(accuracy_score(y_test, classifier.predict(x_test1)))

In [None]:
clf = OneVsRestClassifier(SVC(max_iter = 10000)).fit(x_train1, y_train)
print(accuracy_score(y_train, clf.predict(x_train1)))
print(accuracy_score(y_test, clf.predict(x_test1)))

In [None]:
clf1 = OneVsOneClassifier(LinearSVC(random_state=100, max_iter = 100000)).fit(x_train1, y_train)
print(accuracy_score(y_train, clf1.predict(x_train1)))
print(accuracy_score(y_test, clf1.predict(x_test1)))

In [None]:
y_train

In [None]:
data.iloc[1,3]

In [None]:
np.unique(data['q58'])

In [None]:
math.sqrt(len(y_test))

## Grid Search

In [None]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV

parameters = [{'n_neighbors': [5,10, 11, 12, 13, 14, 15, 16, 17, 18,19, 20], 'leaf_size': [30, 40, 50, 60, 100], 'p': [1,2]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(x_train, y_train)

In [None]:
# Test accuracy after performing grid search

new_accuracy = grid_search.best_score_

In [None]:
new_accuracy

In [None]:
# The hyperparameters on which we obtain the highest accuracy

grid_search.best_params_

## Removing some features

In [None]:
# Dropping the features which have the maximum NaN (missing) values
# Also we can remove features that are highly correlated with each other as they may be redundant
# Keep the feature which is more correlated with the output variable 'q58'

x=x.drop(['q6', 'q11', 'q12', 'q13', 'q20'], axis=1)
x

In [None]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x,y,test_size=0.2)

In [None]:
# sc_x = StandardScaler()
# x_train1 = sc_x.fit_transform(x_train)
# x_test1 = sc_x.transform(x_test)
# np.round(x_train1)
# np.round(x_test1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors = 13, leaf_size = 30).fit(x_train1, y_train1)

In [None]:
accuracy_score(y_test1, classifier.predict(x_test1))

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [None]:
clf = OneVsRestClassifier(SVC(max_iter = 10000)).fit(x_train1, y_train1)
print(accuracy_score(y_train1, clf.predict(x_train1)))
print(accuracy_score(y_test1, clf.predict(x_test1)))

In [None]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

In [None]:
clf1 = OneVsOneClassifier(LinearSVC(random_state=100, max_iter = 100000)).fit(x_train1, y_train1)
print(accuracy_score(y_train1, clf1.predict(x_train1)))
print(accuracy_score(y_test1, clf1.predict(x_test1)))

## Logistic Regression

In [None]:
# import the logistic regression model

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state = 1, max_iter = 10000)
clf3 = logreg.fit(x_train1, y_train1)

In [None]:
clf3.predict(x_test1)

In [None]:
accuracy_score(y_test1, clf3.predict(x_test1))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(random_state = 1)
clf4 = gnb.fit(x_train1, y_train1)
y_pred = clf4.predict(x_test1)

In [None]:
y_pred

In [None]:
accuracy_score(y_test1, clf4.predict(x_test1))

In [None]:
# After feature scaling

In [None]:
logreg = LogisticRegression(max_iter = 10000)
clf3 = logreg.fit(x_train1, y_train)
accuracy_score(y_test, clf3.predict(x_test1))

In [None]:
gnb = GaussianNB()
clf4 = gnb.fit(x_train1, y_train)
y_pred = clf4.predict(x_test1)
accuracy_score(y_test, clf4.predict(x_test1))

## Support Vector Machine

In [None]:
from sklearn import svm
svc = svm.SVC()
clf5 = svc.fit(x_train, y_train)
accuracy_score(y_test, clf5.predict(x_test))

In [None]:
from sklearn import svm
svc = svm.SVC()
clf5 = svc.fit(x_train1, y_train1)
accuracy_score(y_test1, clf5.predict(x_test1))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

acc = []
acc_test = []
for i in range(50,1000,50):
    rf = RandomForestClassifier(i, max_leaf_nodes=32)
    rf.fit(x_train, y_train)
    acc.append(accuracy_score(y_train, rf.predict(x_train)))
    acc_test.append(accuracy_score(y_test, rf.predict(x_test)))
print(np.max(acc_test))
    
plt.figure()
plt.plot(range(50,1000,50), acc)
plt.plot(range(50,1000,50), acc_test)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

acc = []
acc_test = []
for i in range(50,1000,50):
    rf = RandomForestClassifier(i, max_leaf_nodes=32)
    rf.fit(x_train1, y_train1)
    acc.append(accuracy_score(y_train1, rf.predict(x_train1)))
    acc_test.append(accuracy_score(y_test1, rf.predict(x_test1)))
print(np.max(acc_test))
    
plt.figure()
plt.plot(range(50,1000,50), acc)
plt.plot(range(50,1000,50), acc_test)
plt.show()

## Voting Classifiers

In [8]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import model_selection

In [9]:
clf6 = KNeighborsClassifier(14)
clf7 = RandomForestClassifier(random_state=1, n_estimators=300, max_leaf_nodes=32)
clf8 = svm.SVC()

labels = ['K Nearest Neighbors', 'Random Forest', 'Support Vector Machines']

for clf, label in zip([clf6, clf7, clf8], labels):

    scores = model_selection.cross_val_score(clf, x_train, y_train, 
                                              cv=5, 
                                              scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

Accuracy: 0.57 (+/- 0.03) [K Nearest Neighbors]
Accuracy: 0.62 (+/- 0.06) [Random Forest]
Accuracy: 0.60 (+/- 0.04) [Support Vector Machines]


In [10]:
voting_clf_hard = VotingClassifier(estimators = [(labels[0], clf6),
                                                 (labels[1], clf7),
                                                 (labels[2], clf8)],
                                   voting = 'hard')

In [11]:
voting_clf_soft = VotingClassifier(estimators = [(labels[0], clf6),
                                                 (labels[1], clf7),
                                                 (labels[2], clf8)],
                                   voting = 'soft')

In [12]:
labels_new = ['K Nearest Neighbors', 'Random Forest', 'SVM',
              'Voting_Classifier_Hard', 'Voting_Classifier_Soft']

for (clf, label) in zip([clf6, clf7, clf8, voting_clf_hard,
                        voting_clf_soft], labels_new):
    scores = model_selection.cross_val_score(clf, x_train, y_train, cv=5,
            scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

Accuracy: 0.57 (+/- 0.03) [K Nearest Neighbors]
Accuracy: 0.62 (+/- 0.06) [Random Forest]
Accuracy: 0.60 (+/- 0.04) [SVM]
Accuracy: 0.61 (+/- 0.04) [Voting_Classifier_Hard]


AttributeError: predict_proba is not available when  probability=False