# Target Data Partitioned into Three Categories

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the cleaned three-category dataset from disk and preview its first rows.
premise_file = "../Data/threecleaneddata.csv"
premise_df = pd.read_csv(filepath_or_buffer=premise_file)
premise_df.head(n=5)

In [None]:
# Build the feature table: remove the columns that are not used as inputs,
# including the "Category" column, which serves as the prediction target.
data = premise_df.drop(
    columns=["Unnamed: 0", "url", "Shares", "Category"],
)
data.head()

In [None]:
# The "Category" column is the label the classifiers are trained to predict.
target = premise_df.loc[:, "Category"]
target.head()

In [None]:
# Partition features and labels into train/test subsets.
# random_state is pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

### KNN Model without Scaling and without One-Hot-Encoding

In [None]:
# Sweep odd k values on the unscaled features and record the train/test
# accuracy for each, to see which k scores highest.
# Odd k lowers the chance of a tied vote; NOTE(review): with three target
# categories a tie is still possible (e.g. a 1/1/1 split at k=3).
train_scores, test_scores = [], []
for k in range(1, 50, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_score, test_score = (
        knn.score(X_train, y_train),
        knn.score(X_test, y_test),
    )
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Plot train and test accuracy against k for the unscaled features.
# The x values must use the same range as the sweep that filled the score lists.
# NOTE(review): the y-axis label reads "Testing accuracy Score" although both
# the training and testing curves are drawn on this axis.
plt.plot(
    range(1, 50, 2),
    train_scores,
    label="Training Data",
    color="cornflowerblue",
    marker="o",
)
plt.plot(
    range(1, 50, 2),
    test_scores,
    label="Testing Data",
    color="firebrick",
    marker="x",
)
plt.title("Accuracy of Training Set vs Test Set (Non-Scaled Values)")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.grid()
plt.ylim(0, 1.2)
plt.legend(loc="upper right")
# Save the figure to disk, then display it.
plt.savefig(
    "../graphics/knn_graphics/3cat_noScaling_noOHE.png",
    bbox_inches="tight",
)
plt.show()

In [None]:
# Refit a single classifier at k=43 on the unscaled features and report
# its accuracy on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=43).fit(X_train, y_train)
print(f"k=43 Test Acc: {knn.score(X_test, y_test):.3f}")

In [None]:
# Predicted categories for the test split, from the most recently fitted knn.
y_pred = knn.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the unscaled model's predictions with the true test labels:
# first the confusion matrix, then the classification report.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

### KNN Model with Scaling and without One-Hot-Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Fit the scaler on the training features only, then apply the same fitted
# transformation to the test features.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Sweep odd k values on the scaled features and record the train/test
# accuracy for each, to see which k scores highest.
# Odd k lowers the chance of a tied vote; NOTE(review): with three target
# categories a tie is still possible (e.g. a 1/1/1 split at k=3).
train_scores_scaled, test_scores_scaled = [], []
for k in range(1, 36, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score, test_score = (
        knn.score(X_train_scaled, y_train),
        knn.score(X_test_scaled, y_test),
    )
    train_scores_scaled.append(train_score)
    test_scores_scaled.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Plot train and test accuracy against k for the scaled features.
# The x values must use the same range as the sweep that filled the score lists.
# NOTE(review): the y-axis label reads "Testing accuracy Score" although both
# the training and testing curves are drawn on this axis.
plt.plot(
    range(1, 36, 2),
    train_scores_scaled,
    label="Training Data",
    color="cornflowerblue",
    marker="o",
)
plt.plot(
    range(1, 36, 2),
    test_scores_scaled,
    label="Testing Data",
    color="firebrick",
    marker="x",
)
plt.title("Accuracy of Training Set vs Test Set (Scaled Values)")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.grid()
plt.ylim(0, 1.2)
plt.legend(loc="upper right")
# Save the figure to disk, then display it.
plt.savefig(
    "../graphics/knn_graphics/3cat_Scaling_noOHE.png",
    bbox_inches="tight",
)
plt.show()

In [None]:
# Refit a single classifier on the scaled features at the k picked from the
# sweep above, and report its accuracy on the held-out test split.
# NOTE(review): the previous comment here cited k: 31 while the code used 33;
# the code's value is kept unchanged — confirm which k was intended.
knn = KNeighborsClassifier(n_neighbors=33)
knn.fit(X_train_scaled, y_train)
# Read k back from the fitted estimator so the printed label cannot drift
# away from the value actually used by the model.
print(f"k={knn.n_neighbors} Test Acc: {knn.score(X_test_scaled, y_test):.3f}")

In [None]:
# Predicted categories for the scaled test split, from the most recently fitted knn.
y_pred_scaled = knn.predict(X=X_test_scaled)

In [None]:
# Compare the scaled model's predictions with the true test labels:
# first the confusion matrix, then the classification report.
print(confusion_matrix(y_true=y_test, y_pred=y_pred_scaled))
print(classification_report(y_true=y_test, y_pred=y_pred_scaled))

### KNN Model with One-Hot-Encoding and without scaling

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the train and test labels.
# Without num_classes, to_categorical sizes its output from the largest label
# in the array it is given, so encoding each split independently could yield
# different widths if one split happened to lack the highest category.
# Pin the width from the full target column so both splits always agree.
# Assumes Category holds non-negative integer codes — TODO confirm.
n_categories = int(target.max()) + 1
y_train_categorical = to_categorical(y_train, num_classes=n_categories)
y_test_categorical = to_categorical(y_test, num_classes=n_categories)
y_test_categorical

In [None]:
# Sweep odd k values on the unscaled features, fitting against the one-hot
# encoded targets, and record the train/test score for each k.
# NOTE(review): with a 2-D indicator target, score() is presumably the subset
# accuracy (every column must match) — confirm against the sklearn docs.
train_scores_categorical, test_scores_categorical = [], []
for k in range(1, 75, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train_categorical)
    train_score, test_score = (
        knn.score(X_train, y_train_categorical),
        knn.score(X_test, y_test_categorical),
    )
    train_scores_categorical.append(train_score)
    test_scores_categorical.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Plot train and test accuracy against k for the one-hot encoded targets on
# unscaled features.
# The x values must use the same range as the sweep that filled the score lists.
# NOTE(review): the y-axis label reads "Testing accuracy Score" although both
# the training and testing curves are drawn on this axis.
plt.plot(
    range(1, 75, 2),
    train_scores_categorical,
    label="Training Data",
    color="cornflowerblue",
    marker="o",
)
plt.plot(
    range(1, 75, 2),
    test_scores_categorical,
    label="Testing Data",
    color="firebrick",
    marker="x",
)
plt.title("Accuracy of Training Set vs Test Set (One-Hot Encoded and Non-Scaled Values)")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.grid()
plt.ylim(0, 1.2)
plt.legend(loc="upper right")
# Save the figure to disk, then display it.
plt.savefig(
    "../graphics/knn_graphics/3cat_noScaling_OHE.png",
    bbox_inches="tight",
)
plt.show()

In [None]:
# Refit a single classifier on the unscaled features and one-hot encoded
# targets at the k picked from the sweep above, and report its test score.
# NOTE(review): the previous comment here cited k: 31 while the code used 63;
# the code's value is kept unchanged — confirm which k was intended.
knn = KNeighborsClassifier(n_neighbors=63)
knn.fit(X_train, y_train_categorical)
# Read k back from the fitted estimator so the printed label cannot drift
# away from the value actually used by the model.
print(f"k={knn.n_neighbors} Test Acc: {knn.score(X_test, y_test_categorical):.3f}")

In [None]:
# Predictions for the test split from the most recently fitted knn
# (trained on the one-hot encoded targets).
y_pred_ohe = knn.predict(X=X_test)

### KNN Model with scaling and One-Hot-Encoding

In [None]:
# Sweep odd k values on the scaled features, fitting against the one-hot
# encoded targets, and record the train/test score for each k.
# NOTE(review): with a 2-D indicator target, score() is presumably the subset
# accuracy (every column must match) — confirm against the sklearn docs.
train_scores_scaled_categorical, test_scores_scaled_categorical = [], []
for k in range(1, 36, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(
        X_train_scaled, y_train_categorical
    )
    train_score, test_score = (
        knn.score(X_train_scaled, y_train_categorical),
        knn.score(X_test_scaled, y_test_categorical),
    )
    train_scores_scaled_categorical.append(train_score)
    test_scores_scaled_categorical.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Plot train and test accuracy against k for the one-hot encoded targets on
# scaled features.
# The x values must use the same range as the sweep that filled the score lists.
# NOTE(review): the y-axis label reads "Testing accuracy Score" although both
# the training and testing curves are drawn on this axis.
plt.plot(
    range(1, 36, 2),
    train_scores_scaled_categorical,
    label="Training Data",
    color="cornflowerblue",
    marker="o",
)
plt.plot(
    range(1, 36, 2),
    test_scores_scaled_categorical,
    label="Testing Data",
    color="firebrick",
    marker="x",
)
plt.title("Accuracy of Training Set vs Test Set (One-Hot Encoded and Scaled Values)")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.grid()
plt.ylim(0, 1.2)
plt.legend(loc="upper right")
# Save the figure to disk, then display it.
plt.savefig(
    "../graphics/knn_graphics/3cat_Scaling_OHE.png",
    bbox_inches="tight",
)
plt.show()