In [597]:
#!pip install ucimlrepo

In [598]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder



In [599]:
# UCI ML Repository dataset IDs, keyed by the short names used in this notebook.
_UCI_DATASET_IDS = {
    "yeast": 110,
    "spambase": 94,
    "rice": 545,
    "churn": 563,
    "health_nutri": 887,
    "zoo": 111,          # Good small ds. (was 602, flagged in the original as "should be 111")
    "parkinsons": 174,   # 197x22 Good for all. Fit for comparisons
    "glass": 42,         # not that useful
    "scale": 12,
    # NOTE(review): the results notes at the end of the notebook call id 257
    # "UserKnowledge" — confirm that "wholesale" is the intended key for it.
    "wholesale": 257,
}


def _replace_with_onehot(features, col):
    """Return a copy of 2-D `features` with column `col` replaced by its one-hot encoding."""
    encoded = OneHotEncoder(sparse_output=False).fit_transform(
        features[:, col].reshape(-1, 1)
    )
    return np.concatenate((features[:, :col], encoded, features[:, col + 1:]), axis=1)


def get_data(name):
    """Load a dataset by short name and return it as NumPy arrays.

    Names listed in `_UCI_DATASET_IDS` are downloaded with `ucimlrepo`
    (network access required). "thyroid", "marketing" and "diabetes-pima" are
    read from CSV files in the current working directory; for those the last
    column is taken as the target and every other column as a feature.

    Parameters
    ----------
    name : str
        Dataset key: one of the `_UCI_DATASET_IDS` keys, "thyroid",
        "marketing" or "diabetes-pima".

    Returns
    -------
    features : numpy.ndarray
        2-D array with one row per sample.
    targets : numpy.ndarray
        1-D (raveled) array with one label per sample.

    Raises
    ------
    ValueError
        If `name` is not a known dataset key.
    """
    if name in _UCI_DATASET_IDS:
        # Imported lazily so the local-CSV datasets work without ucimlrepo installed.
        from ucimlrepo import fetch_ucirepo

        dataset = fetch_ucirepo(id=_UCI_DATASET_IDS[name])
        features = dataset.data.features.to_numpy()
        targets = dataset.data.targets.to_numpy().ravel()
        return features, targets

    if name == "thyroid":  # useless
        data = pd.read_csv('ann-train.data', header=None)
    elif name == "marketing":  # awesome for ANN
        # Tab-separated file; rows with missing values are dropped.
        data = pd.read_csv('marketing_campaign.csv', sep="\t").dropna()
    elif name == "diabetes-pima":
        data = pd.read_csv('diabetes.csv')
    else:
        known = sorted(list(_UCI_DATASET_IDS) + ["thyroid", "marketing", "diabetes-pima"])
        raise ValueError(f"Unknown dataset name {name!r}; expected one of {known}")

    # Last column is the class label, everything before it is a feature.
    features = data.iloc[:, :-1].to_numpy()
    targets = data.iloc[:, -1].to_numpy().ravel()

    if name == "marketing":
        # One-hot encode the categorical column at index 2.
        features = _replace_with_onehot(features, 2)
        # Now encode the marital status. Index 7 is only correct if the first
        # encoding expanded into five columns — presumably true for this file,
        # verify if the CSV changes.
        features = _replace_with_onehot(features, 7)
        # Remove the date column (index 18 after both encodings).
        features = np.concatenate((features[:, :18], features[:, 19:]), axis=1)

    return features, targets

In [600]:
def encode_onehot(labels):
    """One-hot encode an array of labels.

    `labels` is reshaped into a single column and passed through a freshly
    fitted `OneHotEncoder`; the dense (non-sparse) encoded matrix is returned.
    """
    as_column = labels.reshape(-1, 1)
    return OneHotEncoder(sparse_output=False).fit_transform(as_column)

In [601]:
def run_knn(data_name, encode_type=None, n_neighbors=3, weights="uniform", test_size=0.20):
    """Train a k-nearest-neighbours classifier on a dataset and print its accuracy.

    The data is loaded with `get_data`, optionally target-encoded, split into
    train/test sets with a fixed `random_state=0`, and standardised with a
    scaler fitted on the training split only.

    Parameters
    ----------
    data_name : str
        Dataset key passed to `get_data`.
    encode_type : {None, "onehot", "label"}
        How to encode the targets before training; `None` leaves them as loaded.
    n_neighbors : int
        Number of neighbours (k) used by the classifier.
    weights : str
        Neighbour weighting passed to `KNeighborsClassifier`.
    test_size : float
        Fraction of the samples held out for testing.

    Prints the train and test accuracy; returns nothing.
    """
    X, y = get_data(data_name)

    if encode_type == "onehot":
        y = encode_onehot(y)
    elif encode_type == "label":
        from sklearn.preprocessing import LabelEncoder
        y = LabelEncoder().fit_transform(y)

    # splitting into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    # feature scaling: fit on the training split only so the test split stays unseen
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # An earlier grid search reported
    # {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 30, 'p': 1, 'weights': 'distance'};
    # pass n_neighbors / weights to try such settings.
    classifier = KNeighborsClassifier(
        n_neighbors=n_neighbors, weights=weights, algorithm="auto", leaf_size=10
    )
    classifier.fit(X_train, y_train)

    # predicting on training and test sets
    y_pred_train = classifier.predict(X_train)
    y_pred_test = classifier.predict(X_test)

    print("pridiction done.")
    print(f"KNN Train-Test for {data_name}= {accuracy_score(y_train, y_pred_train)}, {accuracy_score(y_test, y_pred_test)}")

In [602]:
def run_svm(data_name, encode_type=None, kernel="poly", degree=1, test_size=0.20):
    """Train a support-vector classifier on a dataset and print its accuracy.

    The data is loaded with `get_data`, optionally target-encoded, split into
    train/test sets with a fixed `random_state=0`, and standardised with a
    scaler fitted on the training split only.

    Parameters
    ----------
    data_name : str
        Dataset key passed to `get_data`.
    encode_type : {None, "onehot", "label"}
        How to encode the targets before training; `None` leaves them as loaded.
        `SVC` only accepts 1-D targets, so a one-hot encoding is collapsed back
        to one class index per sample.
    kernel : str
        Kernel passed to `SVC`.
    degree : int
        Polynomial degree passed to `SVC`.
    test_size : float
        Fraction of the samples held out for testing.

    Prints the train and test accuracy; returns nothing.
    """
    X, y = get_data(data_name)

    if encode_type == "onehot":
        # A 2-D one-hot matrix makes SVC.fit raise; argmax recovers the class
        # index of each row, which carries the same information in 1-D form.
        y = encode_onehot(y).argmax(axis=1)
    elif encode_type == "label":
        from sklearn.preprocessing import LabelEncoder
        y = LabelEncoder().fit_transform(y)

    # splitting into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    # feature scaling: fit on the training split only so the test split stays unseen
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # training on train set
    classifier = SVC(kernel=kernel, degree=degree, random_state=0)
    classifier.fit(X_train, y_train)

    # predicting on training and test sets
    y_pred_train = classifier.predict(X_train)
    y_pred_test = classifier.predict(X_test)

    print(f"SVM TrainTest for {data_name}= {accuracy_score(y_train, y_pred_train)}, {accuracy_score(y_test, y_pred_test)}")

In [603]:
def run_ann(data_name, encode_type=None, epochs=90, batch_size=32, test_size=0.25):
    """Train a small dense neural network on a dataset and print its accuracy.

    Seeds Python's `random`, NumPy and TensorFlow with 0, loads the data with
    `get_data`, optionally encodes the targets, splits into train/test sets
    with `random_state=0`, and standardises the features with a scaler fitted
    on the training split only.

    The output layer and loss are chosen from the targets:
      * 2-D (one-hot) targets -> softmax + categorical cross-entropy;
      * 1-D targets with at most two classes -> one sigmoid unit + binary
        cross-entropy;
      * 1-D targets with more than two classes -> softmax + sparse categorical
        cross-entropy.
    1-D targets are first mapped to class indices 0..k-1, so string labels and
    labels that do not start at 0 are handled.

    Parameters
    ----------
    data_name : str
        Dataset key passed to `get_data`.
    encode_type : {None, "onehot", "label"}
        How to encode the targets before training; `None` leaves them as loaded.
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size used for training.
    test_size : float
        Fraction of the samples held out for testing.

    Prints the train and test accuracy; returns nothing.
    """
    import random
    seed_value = 0
    # Seed every source of randomness used below.
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)

    X, y = get_data(data_name)

    if encode_type == "onehot":
        y = encode_onehot(y)
    elif encode_type == "label":
        from sklearn.preprocessing import LabelEncoder
        y = LabelEncoder().fit_transform(y)

    n_classes = None
    if y.ndim == 1:
        # Map arbitrary labels to 0..k-1. For labels that are already 0/1
        # (or label-encoded) this is the identity.
        classes, y = np.unique(y, return_inverse=True)
        n_classes = len(classes)

    # splitting into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    # feature scaling: fit on the training split only so the test split stays unseen
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # create the ANN model
    ann = tf.keras.models.Sequential()
    ann.add(tf.keras.layers.Dense(units=X.shape[1], activation='relu'))  # input and first hidden layer
    ann.add(tf.keras.layers.Dense(units=64, activation='relu'))  # 2nd hidden layer
    ann.add(tf.keras.layers.Dense(units=64, activation='relu'))  # 3rd hidden layer
    ann.add(tf.keras.layers.Dense(units=64, activation='relu'))  # 4th hidden layer

    if y.ndim != 1:
        # one-hot targets: one output unit per class
        ann.add(tf.keras.layers.Dense(units=y.shape[1], activation='softmax'))
        ann.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    elif n_classes <= 2:
        ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
        ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    else:
        # A single sigmoid unit cannot represent more than two classes;
        # integer class indices pair with the sparse categorical loss.
        ann.add(tf.keras.layers.Dense(units=n_classes, activation='softmax'))
        ann.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    ann.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=False)

    # Evaluate the model
    loss_train, accuracy_train = ann.evaluate(X_train, y_train)
    loss_test, accuracy_test = ann.evaluate(X_test, y_test)
    print(f"ANN TrainTest Accuracy: {accuracy_train:.2f}, {accuracy_test:.2f}")

In [604]:
# Earlier experiments, kept for reference:
# run_ann("zoo")   # perfect against ann
# run_knn("marketing")
# run_ann("marketing")  # best for ANN
# run_svm("parkinsons")
# run_svm("glass")    # don't encode y here
# run_ann("glass", encode_type="onehot")

# Current comparison: all three models on the "rice" dataset, in this order.
for runner, options in (
    (run_knn, {"encode_type": "onehot"}),
    (run_svm, {}),
    (run_ann, {"encode_type": "onehot"}),
):
    runner("rice", **options)


DatasetNotFoundError: 
Invalid `prisma.donated_datasets.findFirst()` invocation:


Timed out fetching a new connection from the connection pool. More info: http://pris.ly/d/connection-pool (Current connection pool timeout: 10, connection limit: 17)

In [None]:
# X, y = get_data("wholesale")
# print(y[0:5])
# X[0]
#data.isna().sum()
#list(set(features[:,3]))

# data1 = pd.read_csv('diabetes.csv')
# # drop missing values
# #data = data.dropna()
# data1.isna().sum()

# ft = data1.iloc[:, :-1].to_numpy()
# tg = data1.iloc[:, -1].to_numpy()

In [None]:
#ft[0]

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# onehotencoder = OneHotEncoder(sparse_output=False)
# labels = features[:,2]
# encoded = onehotencoder.fit_transform(labels.reshape(-1, 1))
# #print(encoded[0])

# ft1 = np.concatenate((features[:, :2], encoded, features[:, 3:]), axis=1)
# print("new features \n", ft1[0])

# print("Encoding shape: ", ft1[:,7].shape, list(set(ft1[:,7])))
# # now we encode the marital status
# encoded = onehotencoder.fit_transform(ft1[:,7].reshape(-1, 1))
# print("Shape2: ", encoded.shape)
# ft2 = np.concatenate((ft1[:, :7], encoded, ft1[:, 8:]), axis=1)
# print("new features2 \n", ft2[0])

# # removing the date column
# ft2 = np.concatenate((ft2[:, :18], ft2[:, 19:]), axis=1)
# print("final features2 \n", ft2[0])

# TODOS:
- plot validation curve for KNN, SVM and ANN (for each dataset)
- hyperparameter tuning:
  - knn at least k,
  - svm at least kernel, p etc
  - ann: ? number of hidden layers, epoch count
- performance vs training size for knn and svm
- wall-clock times (fit and predict) for each algorithm and dataset.

# RESULTS
# KNN
  # (.94,.96), (.95, 1.0)
  # marketing:
   # (.88,.85), (92, 84)

# SVM
  #(1,.9), (1,1 rbf),

# ANN:
  # (0.43, .35)
  # marketing: (92,89), (1, 87), (.99, 86)
============== RESULTS
SCALE:
	KNN: .94,.83
	SVM: .91, .95
	ANN: .00, .97
UserKnowledge(257):  encoding ruins knn. svm poor with poly
	knn: 86,82
	svm: 93,85
	ann: 96,85

MARKETING:
	KNN: (87, 81) on k4, (92,85) on k2
	SVM: (94, 86) on poly4, (92, 86) rbf
	ANN: (1,86)

	
IONOSPHERE:
	ANN and SVM in 90s, knn in 80s.
	SVM linear best. Poly is poor