## Q1(a) - Filter

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Load the dataset and discard the area identifier, which is not used as a feature.
data = pd.read_csv('./tesco_17357376.csv')
data = data.drop(columns=['area_id'])

# Drop every column whose name contains one of these fragments
# (h_n, _std, _norm, _ci95). Note this is a substring match, not a strict
# suffix match, despite the variable name.
suffix_of_col_to_remove = ['h_n','_std','_norm','_ci95']
cols_to_remove = {
    col
    for col in data.columns
    if any(suffix in col for suffix in suffix_of_col_to_remove)
}
data = data.drop(columns=list(cols_to_remove))

# Encode the target class as integers (LabelEncoder assigns codes in sorted
# order of the class names).
target_col = "'Diabetes category'"
le = LabelEncoder()
data[target_col] = le.fit_transform(data[target_col].values)

# Remove outlier rows: keep a row only if every *feature* lies within three
# standard deviations of its column mean. The encoded target is deliberately
# excluded from the z-score test - it is a class code, not a measurement.
feature_cols = [col for col in data.columns if col != target_col]
inlier_rows = (np.abs(stats.zscore(data[feature_cols])) < 3).all(axis=1)
data = data[inlier_rows].copy()

# Split into target (pandas Series, keeps the filtered row index) and the
# feature matrix (numpy array, columns in the order of `data.columns`).
y = data.pop(target_col)
X = data.values

X.shape, y.shape

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Plotting to visualize distribution

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

x = ['Low 1', 'Medium 2', 'High 0']
unique, counts = np.unique(y, return_counts=True)
derp = dict(zip(unique, counts))
energy = [80, 302, 118]

x_pos = [i for i, _ in enumerate(x)]

plt.bar(x_pos, energy, color='green')
plt.xlabel("Diabetes")
plt.ylabel("Number of entries")
plt.title("Derp")

plt.xticks(x_pos, x)

plt.show()

In [None]:
# Hold out 40% of the rows as a stratified test set (fixed seed), then fit a
# min-max scaler on the training rows only and scale them.
# Model selection is done afterwards with k-fold cross-validation on the
# scaled training data; the held-out rows give the final accuracy estimate.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    shuffle=True,
    stratify=y,
)

scaler = MinMaxScaler()
normalised_X = scaler.fit(X_train).transform(X_train)
normalised_X.shape

In [None]:
# Rank the features by information gain (mutual information with the target),
# computed on the scaled training data.

from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

# mutual_info_classif is a randomised estimator; fixing the seed makes the
# ranking (and everything tuned on top of it) reproducible across runs.
i_scores = mutual_info_classif(normalised_X, y_train, random_state=42)
X_train_df = pd.DataFrame(data=normalised_X, columns=list(data.columns))

df = pd.DataFrame(i_scores, index=X_train_df.columns, columns=['I-Gain'])

# Any feature whose score exactly repeats an earlier feature's score is pushed
# to the bottom of the ranking by zeroing its gain.
# NOTE(review): presumably meant to discard duplicated columns - confirm.
df.loc[df['I-Gain'].duplicated(), 'I-Gain'] = 0

df = df.sort_values(by=['I-Gain'], ascending=False)
df.head(n=20)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest


# "Best so far" record for the k-NN search that follows: number of neighbours,
# number of top-ranked features, the winning score, and the score curve.
best_k, best_n_features = 1, 1
best_acc = 0
best_score = list()

In [None]:
# Grid search for the filter approach: for each k in 1..9, cross-validate a
# k-NN model on the top-1, top-2, ..., top-all features of the info-gain
# ranking in `df`, and remember the best (k, number of features) pair.

# Reset the running best so that re-running this cell starts a fresh search.
best_n_features = 1
best_acc = 0
best_k = 1
best_score = []

for k in range(1, 10):
    scores = []
    model = KNeighborsClassifier(n_neighbors=k)
    for up_to_index_feature in range(1, len(df.index) + 1):
        top_selected_features = X_train_df[df.index[:up_to_index_feature]].values

        # Use the mean over all 10 folds; a single fold is far too noisy to
        # compare configurations. The metric is macro-averaged F1.
        fold_scores = cross_val_score(model, top_selected_features, y_train, cv=10, scoring='f1_macro')
        curr_acc = fold_scores.mean()

        scores.append(curr_acc)

        if curr_acc > best_acc:
            best_acc = curr_acc
            best_n_features = up_to_index_feature
            best_k = k
            # Same list object as `scores`: once this k finishes, best_score
            # holds the complete score curve of the winning k.
            best_score = scores

    # One curve per k; x starts at 1 so it reads as "number of top features".
    plt.plot(range(1, len(scores) + 1), scores)
    plt.title("k = {}".format(k))
    plt.ylabel('accuracy')
    plt.xlabel('top n features')
    plt.show()

df.head(n=best_n_features)



In [None]:
# Report the outcome of the search: mean of the stored score curve, then the
# winning feature count, score and k.
summary_lines = (
    ("Mean Acc:\n{}", np.mean(best_score)),
    ("\nBest N Features:\n{}", best_n_features),
    ("\nBest Acc:\n{}", best_acc),
    ("\nBest K:\n{}", best_k),
)
for template, value in summary_lines:
    print(template.format(value))

In [None]:
# Scale the held-out rows with the scaler that was fitted on the training
# data. Re-fitting on the test set would map train and test features onto
# different ranges and leak test statistics into the evaluation.
normalised_test_X = scaler.transform(X_test)
normalised_test_X.shape

In [None]:
# Evaluate the tuned k-NN model on the held-out test set, using exactly the
# top-ranked features that the cross-validated search selected.

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Take the columns straight from the ranking in `df`. Re-running a selector
# would recompute mutual information from scratch and could pick a different
# feature set from the one that was tuned.
selected_columns = list(df.index[:best_n_features])
X_features = X_train_df[selected_columns].values
print(X_features.shape)

model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_features, y_train)

X_test_df = pd.DataFrame(data=normalised_test_X, columns=list(data.columns))
X_test_features = X_test_df[selected_columns].values
y_pred = model.predict(X_test_features)

matrix = classification_report(y_test, y_pred, labels=[0, 1, 2])
print('Classification report : \n',matrix)

print("Accuracy:\n{}".format(accuracy_score(y_test, y_pred)))

confusion = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
print("\nConfusion matrix:\n{}".format(confusion))

# plot_confusion_matrix was removed from scikit-learn; ConfusionMatrixDisplay
# draws the same chart from the matrix computed above.
ConfusionMatrixDisplay(confusion_matrix=confusion, display_labels=[0, 1, 2]).plot()
plt.show()

## Q1(a) - Wrapper

## SFS k-NN 

In [None]:
# Wrapper feature selection for k-NN: for each k in 1..9 run sequential
# forward selection (10-fold CV, macro F1) over the scaled training data and
# keep the selector with the best cross-validated score. The winning model is
# then evaluated on the held-out test set and its ROC curve is stored.

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.40, shuffle=True, random_state=42, stratify=y)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
normalised_X = scaler.fit_transform(X_train)
normalised_X_test = scaler.transform(X_test)

best_acc = 0
best_k = 1
best_feat_index = []
best_feat_names = []
best_sfs = None

for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors = k)

    sfs_forward = SFS(knn, 
                  k_features=(1, X.shape[1]),
                  forward=True, 
                  floating=False, 
                  verbose=1, 
                  scoring='f1_macro',
                  cv=10, 
                  n_jobs=-1)

    sfs_forward = sfs_forward.fit(normalised_X, y_train, custom_feature_names=data.columns)

    # The reported score is the cross-validated macro F1, not plain accuracy.
    print('best combination (ACC: %.3f): %s\n' % (sfs_forward.k_score_, sfs_forward.k_feature_idx_))

    # `best_sfs is None` guarantees a selector is kept even if every score is 0.
    if best_sfs is None or sfs_forward.k_score_ > best_acc:
        best_acc = sfs_forward.k_score_
        best_k = k
        best_feat_index = sfs_forward.k_feature_idx_
        best_feat_names = sfs_forward.k_feature_names_
        best_sfs = sfs_forward

print('\n\nBest acc : \n',best_acc)
print('\n\nBest k : \n',best_k)

# Plotting chart for the best SFS
figl = plot_sfs(best_sfs.get_metric_dict(), ylabel='Accuracy', kind='std_dev')

plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

print("Features:")
for feat in best_sfs.k_feature_names_:
    print(feat)

# Testing the performance of the model on the features chosen by the best SFS
X_train_sfs = best_sfs.transform(normalised_X)
X_test_sfs = best_sfs.transform(normalised_X_test)

knn = KNeighborsClassifier(n_neighbors = best_k)

knn.fit(X_train_sfs, y_train)
y_pred = knn.predict(X_test_sfs)

matrix = classification_report(y_test, y_pred,labels=[0, 1, 2])
print('Classification report : \n',matrix)

print("\nModel Prediction Accuracy:\n{}".format(accuracy_score(y_test, y_pred)))

confusion = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
print("\nConfusion matrix:\n{}".format(confusion))

# Binary view of the full target (1 where the encoded class is 0, else 0).
# Kept at module level because later cells read `transformed_y`.
transformed_y = [1 if target_y == 0 else 0 for target_y in y]

# ROC for "class 0 vs. rest". The binary labels are derived from the y_test of
# the split used above, so they line up row-for-row with X_test_sfs; making a
# second split with a different seed would pair scores with the wrong rows.
y_test_is_class0 = (np.asarray(y_test) == 0).astype(int)
class0_column = list(knn.classes_).index(0)

y_score = knn.predict_proba(X_test_sfs)
fprN, tprN, t = roc_curve(y_test_is_class0, y_score[:, class0_column])
roc_aucN = auc(fprN, tprN)

## SFS GaussianNB

In [None]:
# Wrapper feature selection for Gaussian naive Bayes: sequential forward
# selection (10-fold CV, macro F1) on the scaled training data, followed by an
# evaluation of the selected features on the held-out test set and the ROC
# curve for "class 0 vs. rest".

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

sfs_forward = SFS(gnb, 
              k_features=(1, X.shape[1]),
              forward=True, 
              floating=False, 
              verbose=1, 
              scoring='f1_macro',
              cv=10, 
              n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, shuffle=True, random_state=1, stratify=y)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
normalised_X = scaler.fit_transform(X_train)
normalised_X_test = scaler.transform(X_test)

# Plotting SFS
sfs_forward = sfs_forward.fit(normalised_X, y_train, custom_feature_names=data.columns)
figl = plot_sfs(sfs_forward.get_metric_dict(), ylabel='Accuracy', kind='std_dev')
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
# The reported score is the cross-validated macro F1, not plain accuracy.
print('best combination (ACC: %.3f): %s\n' % (sfs_forward.k_score_, sfs_forward.k_feature_idx_))

print("Features:")
for feat in sfs_forward.k_feature_names_:
    print(feat)

# Testing the performance of the model on the features selected for *this*
# classifier (sfs_forward), not on the subset chosen for k-NN in another cell.
X_train_sfs = sfs_forward.transform(normalised_X)
X_test_sfs = sfs_forward.transform(normalised_X_test)

gnb.fit(X_train_sfs, y_train)
y_pred = gnb.predict(X_test_sfs)

# Printing evaluation
matrix = classification_report(y_test, y_pred,labels=[0, 1, 2])
print('Classification report : \n',matrix)

print("\nModel Prediction Accuracy:\n{}".format(accuracy_score(y_test, y_pred)))

confusion = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
print("\nConfusion matrix:\n{}".format(confusion))

# ROC for "class 0 vs. rest". The binary labels come directly from this cell's
# y_test, so they match X_test_sfs row-for-row and the cell does not depend on
# variables created in other cells.
y_test_is_class0 = (np.asarray(y_test) == 0).astype(int)
class0_column = list(gnb.classes_).index(0)

y_score = gnb.predict_proba(X_test_sfs)
fprG, tprG, t = roc_curve(y_test_is_class0, y_score[:, class0_column])
roc_aucG = auc(fprG, tprG)

## SFS Decision Tree Gini

In [None]:
# Wrapper feature selection for a Gini decision tree: sequential forward
# selection (10-fold CV, macro F1) on the scaled training data, followed by an
# evaluation of the selected features on the held-out test set and the ROC
# curve for "class 0 vs. rest".

from sklearn.tree import DecisionTreeClassifier

# Seeded so that tie-breaking between equally good splits is reproducible.
tree = DecisionTreeClassifier(criterion='gini', random_state=1)

sfs_forward = SFS(tree, 
              k_features=(1, X.shape[1]),
              forward=True, 
              floating=False, 
              verbose=1, 
              scoring='f1_macro',
              cv=10, 
              n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, shuffle=True, random_state=1, stratify=y)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
normalised_X = scaler.fit_transform(X_train)
normalised_X_test = scaler.transform(X_test)

# Plotting SFS
sfs_forward = sfs_forward.fit(normalised_X, y_train, custom_feature_names=data.columns)
figl = plot_sfs(sfs_forward.get_metric_dict(), ylabel='Accuracy', kind='std_dev')
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
# The reported score is the cross-validated macro F1, not plain accuracy.
print('best combination (ACC: %.3f): %s\n' % (sfs_forward.k_score_, sfs_forward.k_feature_idx_))

print("Features:")
for feat in sfs_forward.k_feature_names_:
    print(feat)

# Testing the performance of the model on the features selected for *this*
# classifier (sfs_forward), not on the subset chosen for k-NN in another cell.
X_train_sfs = sfs_forward.transform(normalised_X)
X_test_sfs = sfs_forward.transform(normalised_X_test)

tree.fit(X_train_sfs, y_train)
y_pred = tree.predict(X_test_sfs)

# Printing evaluation
matrix = classification_report(y_test, y_pred,labels=[0, 1, 2])
print('Classification report : \n',matrix)

print("\nModel Prediction Accuracy:\n{}".format(accuracy_score(y_test, y_pred)))

confusion = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
print("\nConfusion matrix:\n{}".format(confusion))

# ROC for "class 0 vs. rest". The binary labels come directly from this cell's
# y_test, so they match X_test_sfs row-for-row and the cell does not depend on
# variables created in other cells.
y_test_is_class0 = (np.asarray(y_test) == 0).astype(int)
class0_column = list(tree.classes_).index(0)

y_score = tree.predict_proba(X_test_sfs)
fprT, tprT, t = roc_curve(y_test_is_class0, y_score[:, class0_column])
roc_aucT = auc(fprT, tprT)

## SFS Decision Tree Entropy

In [None]:
# Wrapper feature selection for an entropy decision tree: sequential forward
# selection (10-fold CV, macro F1) on the scaled training data, followed by an
# evaluation of the selected features on the held-out test set.

from sklearn.tree import DecisionTreeClassifier

# Seeded so that tie-breaking between equally good splits is reproducible.
tree = DecisionTreeClassifier(criterion='entropy', random_state=1)

sfs_forward = SFS(tree, 
              k_features=(1, X.shape[1]),
              forward=True, 
              floating=False, 
              verbose=1, 
              scoring='f1_macro',
              cv=10, 
              n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, shuffle=True, random_state=1, stratify=y)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
normalised_X = scaler.fit_transform(X_train)
normalised_X_test = scaler.transform(X_test)

# Run the selection and plot the score against the number of features
sfs_forward = sfs_forward.fit(normalised_X, y_train, custom_feature_names=data.columns)
figl = plot_sfs(sfs_forward.get_metric_dict(), ylabel='Accuracy', kind='std_dev')
plt.ylim([0.5, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
# The reported score is the cross-validated macro F1, not plain accuracy.
print('best combination (ACC: %.3f): %s\n' % (sfs_forward.k_score_, sfs_forward.k_feature_idx_))

print("Features:")
for feat in sfs_forward.k_feature_names_:
    print(feat)

# Testing the performance of the model on the features selected for *this*
# classifier (sfs_forward), not on the subset chosen for k-NN in another cell.
X_train_sfs = sfs_forward.transform(normalised_X)
X_test_sfs = sfs_forward.transform(normalised_X_test)

tree.fit(X_train_sfs, y_train)
y_pred = tree.predict(X_test_sfs)

matrix = classification_report(y_test, y_pred,labels=[0, 1, 2])
print('Classification report : \n',matrix)

print("\nModel Prediction Accuracy:\n{}".format(accuracy_score(y_test, y_pred)))

confusion = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
print("\nConfusion matrix:\n{}".format(confusion))


## ROC (e)

In [None]:
# Overlay the ROC curves of the three classifiers on one figure.
# ROC needs a binary target, so the curves are drawn for "High" (encoded
# class 0) against Low/Medium combined. The fpr*/tpr*/roc_auc* values are
# produced by the k-NN, GaussianNB and decision-tree cells above.

# (false positive rates, true positive rates, line colour, legend entry)
roc_curves = (
    (fprG, tprG, 'blue', 'GNB, Area = %0.2f'% roc_aucG),
    (fprN, tprN, 'green', 'kNN, Area = %0.2f'% roc_aucN),
    (fprT, tprT, 'red', 'DTC, Area = %0.2f'% roc_aucT),
)

plt.figure()
lw = 2
for curve_fpr, curve_tpr, curve_colour, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_colour, lw=lw, label=curve_label)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()