### Comparing the performance of different classical machine learning models in a 2-class classification task.
1. k-Nearest Neighbor
2. Linear Discriminant Analysis
3. Logistic Regression
4. Random Forest

In [1]:
from scipy.io import loadmat

In [2]:
mat = loadmat('twoClassData.mat')

In [3]:
print(mat.keys()) # Which variables mat contains?
# mat contains variable names: X and y

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])


In [4]:
X = mat['X']

In [5]:
X.shape

(400, 2)

In [6]:
y = mat['y'].ravel()

In [7]:
y.shape

(400,)

In [8]:
y

array([1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1.,
       0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
       0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0.,
       0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0.,
       1., 1., 1., 0., 0.

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 42)

In [9]:
X_train.shape

(200, 2)

In [10]:
X_test.shape

(200, 2)

In [11]:
y_train.shape

(200,)

In [12]:
y_test.shape

(200,)

### Nearest Neighbor Classifier

In [13]:
# training the classifier
from sklearn.neighbors import KNeighborsClassifier
nn_classifier = KNeighborsClassifier(n_neighbors = 3 , metric = 'euclidean')
nn_classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=3)

In [14]:
# prediction and accuracy score
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = nn_classifier.predict(X_test)
print('Accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred)))
print('ROC-AUC Score: {:.4f}'.format(roc_auc_score(y_test, y_pred)))

Accuracy score: 0.8700
ROC-AUC Score: 0.8684


### Linear Discriminant Analysis

In [15]:
# training the classifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_classifier = LinearDiscriminantAnalysis()
lda_classifier.fit(X_train, y_train)

LinearDiscriminantAnalysis()

In [16]:
# prediction and accuracy score
y_pred_lda = lda_classifier.predict(X_test)
print('Accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred_lda)))
print('ROC-AUC Score: {:.4f}'.format(roc_auc_score(y_test, y_pred_lda)))

Accuracy score: 0.9200
ROC-AUC Score: 0.9187


### Logistic Regression

In [17]:
# training the classifier
from sklearn.linear_model import LogisticRegression
lg_classifier = LogisticRegression()
lg_classifier.fit(X_train, y_train)

LogisticRegression()

In [18]:
# prediction and accuracy score
y_pred_lg = lg_classifier.predict(X_test)
print('Accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred_lg)))
print('ROC-AUC Score: {:.4f}'.format(roc_auc_score(y_test, y_pred_lg)))

Accuracy score: 0.9200
ROC-AUC Score: 0.9195


### Random Forest

In [19]:
# training the classifier
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=20)
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=20)

In [20]:
# prediction and accuracy score
y_pred_rf = rf_classifier.predict(X_test)
print('Accuracy score: {:.4f}'.format(accuracy_score(y_test, y_pred_rf)))
print('ROC-AUC Score: {:.4f}'.format(roc_auc_score(y_test, y_pred_rf)))

Accuracy score: 0.8800
ROC-AUC Score: 0.8816


|Model                            |Accuracy Score|ROC-AUC Score|
|---------------------------------|--------------|-------------|
|3-Nearest Neighbor               |0.8700        |0.8684       |  
|Linear Discriminant Analysis     |0.9200        |0.9187       |
|Logistic Regression              |0.9200        |0.9195       |
|Random Forest (n estimators=20)  |0.8800        |0.8816       |