In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [95]:
# Load the raw letter-recognition file. header=None tells pandas the file has
# no header row, so columns get integer labels until names are assigned below.
data = pd.read_csv(
    'data/letter-recognition.data',
    header=None,
    sep=",",
)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [96]:
# Labels for the 17 columns of the data file: the letter label first, followed
# by the 16 numeric attributes.
# NOTE(review): "thigh" looks like a typo of the dataset's "high" attribute —
# confirm against the dataset description before renaming.
columnNames = [
    "lettr",
    "x_box", "y_box", "width", "thigh",
    "onpix", "x_bar", "y_bar", "x2_bar",
    "y2_bar", "xy_bar", "x2y_br", "xy2br",
    "x_ege", "xegvy", "y_ege", "yegvx",
]

In [97]:
# Replace the integer column labels with the descriptive names defined above.
data = data.set_axis(columnNames, axis=1)

In [98]:
# Keep only the rows whose label is H or K (binary sub-problem #1).
HandK = data[data['lettr'].isin(['H', 'K'])]
HandK

Unnamed: 0,lettr,x_box,y_box,width,thigh,onpix,x_bar,y_bar,x2_bar,y2_bar,xy_bar,x2y_br,xy2br,x_ege,xegvy,y_ege,yegvx
21,H,4,5,5,4,4,7,7,6,6,7,6,8,3,8,3,8
28,H,6,9,8,7,6,8,6,6,7,7,7,9,6,8,4,8
64,H,3,3,4,1,2,8,7,5,6,7,6,8,5,8,3,7
66,H,3,5,5,4,3,7,8,3,6,10,6,8,3,8,3,8
69,H,8,12,8,6,4,9,8,4,5,8,4,5,6,9,5,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19921,K,7,14,8,8,5,9,6,3,5,11,3,7,5,7,4,8
19931,K,3,1,5,3,3,6,7,4,8,7,6,11,3,8,5,9
19937,H,3,8,5,6,4,8,8,7,7,7,5,7,3,8,3,7
19957,H,5,7,8,5,5,9,7,3,6,10,4,7,5,8,4,9


In [99]:
# Keep only the rows whose label is M or Y (binary sub-problem #2).
MandY = data[data['lettr'].isin(['M', 'Y'])]
MandY

Unnamed: 0,lettr,x_box,y_box,width,thigh,onpix,x_bar,y_bar,x2_bar,y2_bar,xy_bar,x2y_br,xy2br,x_ege,xegvy,y_ege,yegvx
9,M,11,15,13,9,7,13,2,6,2,12,1,9,8,1,1,8
13,M,6,9,8,6,9,7,8,6,5,7,5,8,8,9,8,6
26,M,7,11,11,8,9,3,8,4,5,10,11,10,10,9,5,7
34,M,6,9,9,7,6,5,6,3,5,10,9,9,8,5,2,7
50,Y,7,11,9,8,8,9,5,6,4,7,8,8,3,9,8,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19940,Y,6,9,5,5,2,5,9,3,3,10,9,5,3,10,3,4
19949,M,5,8,7,6,7,8,6,6,5,7,7,10,11,6,2,9
19979,M,6,9,10,7,12,7,5,3,2,7,5,8,15,7,4,6
19982,Y,3,9,5,6,3,7,9,1,6,6,11,8,2,11,2,7


In [100]:
# Split each letter-pair subset into features (every column except the label)
# and target (the "lettr" column).
XDataHandK = HandK.drop(columns="lettr")
yDataHandK = HandK["lettr"]
XDataMandY = MandY.drop(columns="lettr")
yDataMandY = MandY["lettr"]

In [101]:
# Hold out 10% of each subset as a test split; the fixed random_state keeps
# the partition reproducible across runs.
(X_train_MandY, X_test_MandY,
 y_train_MandY, y_test_MandY) = train_test_split(
    XDataMandY, yDataMandY, random_state=42, test_size=0.1
)
(X_train_HandK, X_test_HandK,
 y_train_HandK, y_test_HandK) = train_test_split(
    XDataHandK, yDataHandK, random_state=42, test_size=0.1
)

In [102]:
# profile = ProfileReport(data, title="Pandas Profiling Report")
# profile

## M and Y

#### Random Forest Classifier

In [103]:
# Baseline: a shallow (max_depth=2) random forest scored with 5-fold
# cross-validation on the M/Y training split; prints the mean fold accuracy.
randomForest = RandomForestClassifier(random_state=0, max_depth=2)
cv_results = cross_validate(
    randomForest, X_train_MandY, y_train_MandY, cv=5
)
print(cv_results["test_score"].mean())

0.9788732394366196


In [104]:
# Fit the baseline forest on the full training split and report its accuracy
# on the held-out M/Y test split.
y_pred = randomForest.fit(X_train_MandY, y_train_MandY).predict(X_test_MandY)
accuracy_score(y_true=y_test_MandY, y_pred=y_pred)

0.9746835443037974

In [105]:
# Tuning max_depth Hyperparameter
# Each candidate depth is scored by mean 5-fold CV accuracy on the training
# split; ties resolve to the first (shallowest) depth reaching the best score.
max_depths = [1, 2, 3, 5, 7, 10, 13, 15]
scores = []
for depth in max_depths:
    randomForest = RandomForestClassifier(max_depth=depth, random_state=0)
    cv_results = cross_validate(randomForest, X_train_MandY, y_train_MandY, cv=5)
    mean_score = np.mean(cv_results.get("test_score"))
    scores.append(mean_score)
    print(mean_score)
best_score = max(scores)
print("Best performance: ", best_score)
print("Ideal max depth: ", max_depths[scores.index(best_score)])

0.9570422535211268
0.9788732394366196
0.9880281690140844
0.9943661971830986
0.9964788732394366
0.9971830985915492
0.9971830985915492
0.9971830985915492
Best performance:  0.9971830985915492
Ideal max depth:  10


Let's dive deeper into the performance of our model with a confusion matrix.

In [106]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: M, Y). y_pred still comes from the max_depth=2 forest fitted above;
# the tuning loop does not refit or re-predict.
confusion_matrix(y_true=y_test_MandY, y_pred=y_pred)

array([[82,  0],
       [ 4, 72]], dtype=int64)

#### Decision Tree Classifier

In [107]:
# Baseline: an unconstrained decision tree scored with 5-fold cross-validation
# on the M/Y training split; prints the mean fold accuracy.
# NOTE(review): no random_state is set here (the tuning cell below uses 0), so
# the score may vary slightly between runs.
decisionTree = tree.DecisionTreeClassifier()
cv_results = cross_validate(
    decisionTree, X_train_MandY, y_train_MandY, cv=5
)
print(cv_results["test_score"].mean())

0.9901408450704224


In [108]:
# Fit the baseline tree on the full training split and report its accuracy on
# the held-out M/Y test split.
y_pred = decisionTree.fit(X_train_MandY, y_train_MandY).predict(X_test_MandY)
accuracy_score(y_true=y_test_MandY, y_pred=y_pred)

0.9873417721518988

In [109]:
# Tuning max_depth Hyperparameter
# Each candidate depth is scored by mean 5-fold CV accuracy on the training
# split; ties resolve to the first (shallowest) depth reaching the best score.
max_depths = [1, 2, 3, 5, 7, 10, 13, 15]
scores = []
for depth in max_depths:
    decisionTree = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    cv_results = cross_validate(decisionTree, X_train_MandY, y_train_MandY, cv=5)
    mean_score = np.mean(cv_results.get("test_score"))
    scores.append(mean_score)
    print(mean_score)
best_score = max(scores)
print("Best performance: ", best_score)
print("Ideal max depth: ", max_depths[scores.index(best_score)])

0.95
0.9711267605633804
0.980281690140845
0.9894366197183098
0.9922535211267605
0.9908450704225352
0.9908450704225352
0.9908450704225352
Best performance:  0.9922535211267605
Ideal max depth:  7


Let's dive deeper into the performance of our model with a confusion matrix.

In [110]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: M, Y). y_pred still comes from the unconstrained tree fitted above;
# the tuning loop does not refit or re-predict.
confusion_matrix(y_true=y_test_MandY, y_pred=y_pred)

array([[82,  0],
       [ 2, 74]], dtype=int64)

#### KNN

In [111]:
# Baseline k-NN classifier using the 5 nearest neighbours.
# NOTE(review): the H-and-K section uses n_neighbors=3 for its baseline.
knn = KNeighborsClassifier(n_neighbors=5)

# Train on the training split, then predict the held-out test split
knn.fit(X_train_MandY, y_train_MandY)
y_pred = knn.predict(X_test_MandY)

# Report test-split accuracy
accuracy = accuracy_score(y_test_MandY, y_pred)
print("Accuracy:", accuracy)

print()
print("Cross-validation Average Test Score:")

# Mean accuracy over 10-fold cross-validation on the training split only
cv_results = cross_validate(knn, X_train_MandY, y_train_MandY, cv=10)
print(cv_results["test_score"].mean())

Accuracy: 0.9936708860759493

Cross-validation Average Test Score:
0.9985915492957746


In [112]:
# Tuning k Hyperparameter
# Candidates are k = 1 .. max_k - 1; each is scored by mean 10-fold CV
# accuracy on the training split only (the test split plays no part in the
# selection).
max_k = 15
k_values = list(range(1, max_k))
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_validate(knn, X_train_MandY, y_train_MandY, cv=10)
    scores.append(np.mean(cv_results.get("test_score")))

# First candidate reaching the best score wins (smallest k on ties).
best_k = k_values[scores.index(max(scores))]
print("Best performance: ", max(scores))
print("Ideal k: ", best_k)

# Refit with the selected k so that `knn` and `y_pred` describe the tuned
# model. Previously y_pred was overwritten on every iteration and ended up
# holding the predictions of the last candidate (k = max_k - 1), which is
# what the confusion matrix in the following cell then reported.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_MandY, y_train_MandY)
y_pred = knn.predict(X_test_MandY)

Best performance:  0.9992957746478872
Ideal k:  1


Let's dive deeper into the performance of our model with a confusion matrix.

In [113]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: M, Y). y_pred is whatever the k-tuning cell above left behind.
confusion_matrix(y_true=y_test_MandY, y_pred=y_pred)

array([[82,  0],
       [ 1, 75]], dtype=int64)

#### SVM

In [114]:
# Baseline linear-kernel SVM with regularisation parameter C=1
svm = SVC(C=1, kernel='linear')

# Train on the training split, then predict the held-out test split
svm.fit(X_train_MandY, y_train_MandY)
y_pred = svm.predict(X_test_MandY)

# Report test-split accuracy
accuracy = accuracy_score(y_test_MandY, y_pred)
print("Accuracy:", accuracy)

print()
print("Cross-validation Average Test Score:")

# Mean accuracy over 10-fold cross-validation on the training split only
cv_results = cross_validate(svm, X_train_MandY, y_train_MandY, cv=10)
print(cv_results["test_score"].mean())

Accuracy: 1.0

Cross-validation Average Test Score:
0.9971830985915492


In [115]:
# Tuning C Hyperparameter
# Candidates are C = 1 .. max_C - 1; each is scored by mean 10-fold CV
# accuracy on the training split only (the test split plays no part in the
# selection).
max_C = 15
C_values = list(range(1, max_C))
scores = []
for C in C_values:
    svm = SVC(kernel='linear', C=C)
    cv_results = cross_validate(svm, X_train_MandY, y_train_MandY, cv=10)
    scores.append(np.mean(cv_results.get("test_score")))

# First candidate reaching the best score wins (smallest C on ties).
best_C = C_values[scores.index(max(scores))]
print("Best performance: ", max(scores))
print("Ideal C: ", best_C)

# Refit with the selected C so that `svm` and `y_pred` describe the tuned
# model. Previously y_pred was overwritten on every iteration and ended up
# holding the predictions of the last candidate (C = max_C - 1), which is
# what the confusion matrix in the following cell then reported.
svm = SVC(kernel='linear', C=best_C)
svm.fit(X_train_MandY, y_train_MandY)
y_pred = svm.predict(X_test_MandY)

Best performance:  0.9971830985915492
Ideal C:  1


Let's dive deeper into the performance of our model with a confusion matrix.

In [116]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: M, Y). y_pred is whatever the C-tuning cell above left behind.
confusion_matrix(y_true=y_test_MandY, y_pred=y_pred)

array([[82,  0],
       [ 0, 76]], dtype=int64)

## H and K

#### Random Forest Classifier

In [117]:
# Baseline for H vs K: a shallow (max_depth=2) random forest evaluated with
# 5-fold cross-validation; the full result dict (timings and per-fold scores)
# is displayed.
randomForest = RandomForestClassifier(random_state=0, max_depth=2)
cv_results = cross_validate(
    randomForest, X_train_HandK, y_train_HandK, cv=5
)
cv_results

{'fit_time': array([0.12717438, 0.11025476, 0.11034441, 0.11127448, 0.10530758]),
 'score_time': array([0.00897765, 0.01052523, 0.00896811, 0.00998306, 0.00901985]),
 'test_score': array([0.84150943, 0.90943396, 0.86037736, 0.85283019, 0.8       ])}

In [118]:
# Fit the baseline forest on the full training split and report its accuracy
# on the held-out H/K test split.
y_pred = randomForest.fit(X_train_HandK, y_train_HandK).predict(X_test_HandK)
accuracy_score(y_true=y_test_HandK, y_pred=y_pred)

0.8851351351351351

In [None]:
# Tuning max_depth Hyperparameter
# Each candidate depth is scored by mean 5-fold CV accuracy on the training
# split; ties resolve to the first (shallowest) depth reaching the best score.
max_depths = [1, 2, 3, 5, 7, 10, 13, 15]
scores = []
for depth in max_depths:
    randomForest = RandomForestClassifier(max_depth=depth, random_state=0)
    cv_results = cross_validate(randomForest, X_train_HandK, y_train_HandK, cv=5)
    mean_score = np.mean(cv_results.get("test_score"))
    scores.append(mean_score)
    print(mean_score)
best_score = max(scores)
print("Best performance: ", best_score)
print("Ideal max depth: ", max_depths[scores.index(best_score)])

0.8226415094339623
0.8528301886792453
0.8747169811320754
0.9162264150943397
0.9486792452830188


Let's dive deeper into the performance of our model with a confusion matrix.

In [None]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: H, K). y_pred still comes from the max_depth=2 forest fitted above;
# the tuning loop does not refit or re-predict.
confusion_matrix(y_true=y_test_HandK, y_pred=y_pred)

#### Decision Tree Classifier

In [None]:
# Baseline: an unconstrained decision tree scored with 5-fold cross-validation
# on the H/K training split; prints the mean fold accuracy.
# NOTE(review): no random_state is set here (the tuning cell below uses 0), so
# the score may vary slightly between runs.
decisionTree = tree.DecisionTreeClassifier()
cv_results = cross_validate(
    decisionTree, X_train_HandK, y_train_HandK, cv=5
)
print(cv_results["test_score"].mean())

In [None]:
# Fit the baseline tree on the full training split and report its accuracy on
# the held-out H/K test split.
y_pred = decisionTree.fit(X_train_HandK, y_train_HandK).predict(X_test_HandK)
accuracy_score(y_true=y_test_HandK, y_pred=y_pred)

In [None]:
# Tuning max_depth Hyperparameter
# Each candidate depth is scored by mean 5-fold CV accuracy on the training
# split; ties resolve to the first (shallowest) depth reaching the best score.
max_depths = [1, 2, 3, 5, 7, 10, 13, 15]
scores = []
for depth in max_depths:
    decisionTree = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    cv_results = cross_validate(decisionTree, X_train_HandK, y_train_HandK, cv=5)
    mean_score = np.mean(cv_results.get("test_score"))
    scores.append(mean_score)
    print(mean_score)
best_score = max(scores)
print("Best performance: ", best_score)
print("Ideal max depth: ", max_depths[scores.index(best_score)])

Let's dive deeper into the performance of our model with a confusion matrix.

In [None]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: H, K). y_pred still comes from the unconstrained tree fitted above;
# the tuning loop does not refit or re-predict.
confusion_matrix(y_true=y_test_HandK, y_pred=y_pred)

#### KNN

In [None]:
# Baseline k-NN classifier using the 3 nearest neighbours
knn = KNeighborsClassifier(n_neighbors=3)

# Train on the training split, then predict the held-out test split
knn.fit(X_train_HandK, y_train_HandK)
y_pred = knn.predict(X_test_HandK)

# Report test-split accuracy
accuracy = accuracy_score(y_test_HandK, y_pred)
print("Accuracy:", accuracy)

print()
print("Cross-validation Average Test Score:")

# Mean accuracy over 10-fold cross-validation on the training split only
cv_results = cross_validate(knn, X_train_HandK, y_train_HandK, cv=10)
print(cv_results["test_score"].mean())

In [None]:
# Tuning k Hyperparameter
# Candidates are k = 1 .. max_k - 1; each is scored by mean 10-fold CV
# accuracy on the training split only (the test split plays no part in the
# selection).
max_k = 15
k_values = list(range(1, max_k))
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_validate(knn, X_train_HandK, y_train_HandK, cv=10)
    scores.append(np.mean(cv_results.get("test_score")))

# First candidate reaching the best score wins (smallest k on ties).
best_k = k_values[scores.index(max(scores))]
print("Best performance: ", max(scores))
print("Ideal k: ", best_k)

# Refit with the selected k so that `knn` and `y_pred` describe the tuned
# model. Previously y_pred was overwritten on every iteration and ended up
# holding the predictions of the last candidate (k = max_k - 1), which is
# what the confusion matrix in the following cell then reported.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_HandK, y_train_HandK)
y_pred = knn.predict(X_test_HandK)

Let's dive deeper into the performance of our model with a confusion matrix.

In [None]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: H, K). y_pred is whatever the k-tuning cell above left behind.
confusion_matrix(y_true=y_test_HandK, y_pred=y_pred)

#### SVM

In [None]:
# Baseline linear-kernel SVM with regularisation parameter C=1
svm = SVC(C=1, kernel='linear')

# Train on the training split, then predict the held-out test split
svm.fit(X_train_HandK, y_train_HandK)
y_pred = svm.predict(X_test_HandK)

# Report test-split accuracy
accuracy = accuracy_score(y_test_HandK, y_pred)
print("Accuracy:", accuracy)

print()
print("Cross-validation Average Test Score:")

# Mean accuracy over 10-fold cross-validation on the training split only
cv_results = cross_validate(svm, X_train_HandK, y_train_HandK, cv=10)
print(cv_results["test_score"].mean())

In [None]:
# Tuning C Hyperparameter
# Candidates are C = 1 .. max_C - 1; each is scored by mean 10-fold CV
# accuracy on the training split only (the test split plays no part in the
# selection).
max_C = 15
C_values = list(range(1, max_C))
scores = []
for C in C_values:
    svm = SVC(kernel='linear', C=C)
    cv_results = cross_validate(svm, X_train_HandK, y_train_HandK, cv=10)
    scores.append(np.mean(cv_results.get("test_score")))

# First candidate reaching the best score wins (smallest C on ties).
best_C = C_values[scores.index(max(scores))]
print("Best performance: ", max(scores))
print("Ideal C: ", best_C)

# Refit with the selected C so that `svm` and `y_pred` describe the tuned
# model. Previously y_pred was overwritten on every iteration and ended up
# holding the predictions of the last candidate (C = max_C - 1), which is
# what the confusion matrix in the following cell then reported.
svm = SVC(kernel='linear', C=best_C)
svm.fit(X_train_HandK, y_train_HandK)
y_pred = svm.predict(X_test_HandK)

Let's dive deeper into the performance of our model with a confusion matrix.

In [None]:
# Rows are the true letters, columns the predicted letters (sorted label
# order: H, K). y_pred is whatever the C-tuning cell above left behind.
confusion_matrix(y_true=y_test_HandK, y_pred=y_pred)

## L and Y

#### Random Forest Classifier

#### Decision Tree Classifier

#### KNN

#### SVM

# Feature Selection

#### RFE for Random Forest

In [None]:
# Recursive feature elimination driven by a shallow random forest: one feature
# is dropped per step until 4 remain. Fitted on the H/K training split only.
estimator = RandomForestClassifier(random_state=0, max_depth=2)
selector = RFE(estimator, step=1, n_features_to_select=4).fit(
    X_train_HandK, y_train_HandK
)

In [None]:
# Boolean mask over the feature columns: True marks a feature kept by RFE.
selector.get_support()

In [None]:
# Training features restricted to the columns RFE kept.
selected = X_train_HandK.loc[:, selector.get_support()]
selected

In [None]:
# Same shallow forest as the baseline, now cross-validated (5 folds) on the
# RFE-selected features only; the full result dict is displayed.
randomForest = RandomForestClassifier(random_state=0, max_depth=2)
cv_results = cross_validate(
    randomForest, selected, y_train_HandK, cv=5
)
cv_results

In [None]:
# Apply the same RFE column mask to the test split, fit on the reduced
# training features, and report accuracy on the reduced test features.
test_selected = X_test_HandK.loc[:, selector.support_]
y_pred = randomForest.fit(selected, y_train_HandK).predict(test_selected)
accuracy_score(y_true=y_test_HandK, y_pred=y_pred)

In [None]:
# Tuning max_depth Hyperparameter
# Each candidate depth is scored by mean 5-fold CV accuracy on the
# RFE-selected training features; ties resolve to the first best depth.
max_depths = [1, 2, 3, 5, 7, 10, 13, 15]
scores = []
for depth in max_depths:
    randomForest = RandomForestClassifier(max_depth=depth, random_state=0)
    cv_results = cross_validate(randomForest, selected, y_train_HandK, cv=5)
    mean_score = np.mean(cv_results.get("test_score"))
    scores.append(mean_score)
    print(mean_score)
best_score = max(scores)
print("Best performance: ", best_score)
print("Ideal max depth: ", max_depths[scores.index(best_score)])

#### RFE for Decision Tree

In [None]:
# Recursive feature elimination driven by an unconstrained decision tree: one
# feature is dropped per step until 4 remain. Fitted on the H/K training split.
estimator = tree.DecisionTreeClassifier()
selector = RFE(estimator, step=1, n_features_to_select=4).fit(
    X_train_HandK, y_train_HandK
)

In [None]:
# Boolean mask over the feature columns: True marks a feature kept by RFE.
selector.get_support()

In [None]:
# Training features restricted to the columns the tree-based RFE kept
# (this rebinds `selected` from the random-forest section).
selected = X_train_HandK.loc[:, selector.get_support()]
selected

In [None]:
# Unconstrained tree cross-validated (5 folds) on the RFE-selected features;
# return_train_score=True adds per-fold training accuracy to the displayed
# result dict.
decisionTree = tree.DecisionTreeClassifier()
cv_results = cross_validate(
    decisionTree,
    selected,
    y_train_HandK,
    return_train_score=True,
    cv=5,
)
cv_results

In [None]:
# Apply the same RFE column mask to the test split, fit on the reduced
# training features, and report accuracy on the reduced test features.
test_selected = X_test_HandK.loc[:, selector.support_]
y_pred = decisionTree.fit(selected, y_train_HandK).predict(test_selected)
accuracy_score(y_true=y_test_HandK, y_pred=y_pred)

In [None]:
# Tuning max_depth Hyperparameter
# Each candidate depth is scored by mean 5-fold CV accuracy on the
# RFE-selected training features; ties resolve to the first best depth.
max_depths = [1, 2, 3, 5, 7, 10, 13, 15]
scores = []
for depth in max_depths:
    decisionTree = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    cv_results = cross_validate(decisionTree, selected, y_train_HandK, cv=5)
    mean_score = np.mean(cv_results.get("test_score"))
    scores.append(mean_score)
    print(mean_score)
best_score = max(scores)
print("Best performance: ", best_score)
print("Ideal max depth: ", max_depths[scores.index(best_score)])

#### Scoring Methods for KNN

In [None]:
# Perform Univariate Feature Selection
# The chi2 selector is fitted on the training split only and then applied
# unchanged to the test split. Calling fit_transform on the test split (as
# before) re-ran the selection using the test labels, which leaks test
# information and can pick different columns than the ones the classifier
# was trained on.
selector = SelectKBest(chi2, k=2)
X_train_HandK_selected = selector.fit_transform(X_train_HandK, y_train_HandK)
X_test_HandK_selected = selector.transform(X_test_HandK)

# Tuning k Hyperparameter
# Candidates are k = 1 .. max_k - 1; each is scored by mean 10-fold CV
# accuracy on the reduced training features.
max_k = 15
k_values = list(range(1, max_k))
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_validate(knn, X_train_HandK_selected, y_train_HandK, cv=10)
    scores.append(np.mean(cv_results.get("test_score")))

# First candidate reaching the best score wins (smallest k on ties).
best_k = k_values[scores.index(max(scores))]
print("Best performance: ", max(scores))
print("Ideal k: ", best_k)

# Refit with the selected k so `knn` / `y_pred` describe the tuned model
# rather than the last candidate tried in the loop.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_HandK_selected, y_train_HandK)
y_pred = knn.predict(X_test_HandK_selected)


#### RFE for SVM

In [None]:
# Recursive feature elimination driven by a linear SVM (C=1): one feature is
# dropped per step until 4 remain. Fitted on the H/K training split only.
estimator = SVC(C=1, kernel='linear')
selector = RFE(estimator, step=1, n_features_to_select=4).fit(
    X_train_HandK, y_train_HandK
)

In [None]:
# Training features restricted to the columns the SVM-based RFE kept (this
# rebinds the name previously used for the SelectKBest output).
X_train_HandK_selected = X_train_HandK.loc[:, selector.get_support()]

In [None]:
# Tuning C Hyperparameter with selected features
# Candidates are C = 1 .. max_C - 1; each is scored by mean 10-fold CV
# accuracy on the RFE-selected training features.
max_C = 15
C_values = list(range(1, max_C))
scores = []
for C in C_values:
    svm = SVC(kernel='linear', C=C)
    cv_results = cross_validate(svm, X_train_HandK_selected, y_train_HandK, cv=10)
    scores.append(np.mean(cv_results.get("test_score")))

# First candidate reaching the best score wins (smallest C on ties).
best_C = C_values[scores.index(max(scores))]
print("Best performance: ", max(scores))
print("Ideal C: ", best_C)

# Evaluate the tuned model on the held-out test split, restricted to the same
# RFE-selected columns. Previously y_pred was computed on the training
# features themselves (and for the last candidate C only), so it said nothing
# about generalisation.
X_test_HandK_selected = X_test_HandK.loc[:, selector.support_]
svm = SVC(kernel='linear', C=best_C)
svm.fit(X_train_HandK_selected, y_train_HandK)
y_pred = svm.predict(X_test_HandK_selected)
accuracy_score(y_test_HandK, y_pred)