# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from numpy import *
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
from sklearn import tree
from sklearn.metrics import roc_auc_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')

# Preprocessing the data

In [None]:
# Attribute names: the second space-separated token of every line.
with open("Data/attributes.txt", "r") as in_f:
    attrs_ = [line.strip().split(" ")[1] for line in in_f]

# Feature matrix: one row per line, space-separated floats.
with open("Data/featureData.txt", "r") as in_f:
    raw_data = in_f.read()
    temp = np.array(
        [
            [float(token) for token in line.strip().split(" ")]
            for line in raw_data.strip().split("\n")
        ]
    )

    # One DataFrame column per attribute name (columns of `temp`).
    data_df = pd.DataFrame(
        {attr: values.tolist() for attr, values in zip(attrs_, temp.T)}
    )

# Class labels: the second space-separated token of every line, as int.
with open("Data/image_class_labels.txt", "r") as in_f:
    raw_data_ = in_f.read()
    temp_ = [
        int(line.strip().split(" ")[1])
        for line in raw_data_.strip().split("\n")
    ]
data_df["labels"] = temp_

# Everything except the last column is a feature; the last column is the label.
X = data_df.iloc[:, :-1].values
Y = data_df.iloc[:, -1].values

# Number of samples, used to build the cross-validation partitions.
size = len(X)

# Model Selection

In [None]:
def roc_auc_score_multiclass(actual_class, pred_class, average="macro"):
    """Return the mean one-vs-rest ROC AUC over the classes in ``actual_class``.

    For every distinct label in ``actual_class`` both label sequences are
    binarised (1 for that label, 0 for anything else) and passed to
    ``roc_auc_score``. The per-class scores that are non-zero are then
    averaged.

    Args:
        actual_class: Iterable of true labels.
        pred_class: Iterable of predicted labels, aligned with
            ``actual_class``.
        average: Forwarded unchanged to ``roc_auc_score``.

    Returns:
        The arithmetic mean of the non-zero per-class scores, or ``0.0`` when
        every per-class score is zero.

    Raises:
        ValueError: Propagated from ``roc_auc_score`` (callers in this file
            catch it).
    """
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        # Binarise by comparing with the current class directly. Testing
        # membership in "all the other actual classes" would flag a predicted
        # label that never occurs in actual_class as positive for every class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        roc_auc_dict[per_class] = roc_auc_score(
            new_actual_class, new_pred_class, average=average
        )

    # Scores that are exactly zero are left out of the mean.
    filtered_vals = [v for v in roc_auc_dict.values() if v != 0]
    if not filtered_vals:
        # Nothing left to average: report 0.0 rather than dividing by zero.
        return 0.0

    return sum(filtered_vals) / len(filtered_vals)

# 10 Fold Cross Validation

In [None]:
def ten_fold_cross_validation(size):
    """Build the slice boundaries for a contiguous 10-fold split.

    Args:
        size: Total number of samples.

    Returns:
        A ``(10, 6)`` float array. Each row is
        ``[train1_start, train1_end, test_start, test_end,
        train2_start, train2_end]``; the training set of a fold is the
        concatenation of the two train slices. Callers cast rows to ``int``
        before slicing.

    Note:
        Every test fold has ``size // 10`` samples. The last fold is anchored
        at the end of the data, so when ``size`` is not a multiple of 10 the
        samples in ``[9 * (size // 10), size - size // 10)`` are never part
        of a test fold.
    """
    n_folds = 10

    # Integer floor division instead of math.floor(size * 0.1): the `math`
    # module is never imported explicitly in this file.
    partition_size = size // n_folds

    index = np.zeros((n_folds, 6))

    for i in range(n_folds):
        test_start = partition_size * i
        test_end = partition_size * (i + 1)

        if i == 0:
            # First fold: no training data before the test slice.
            index[i] = [0, 0, 0, partition_size, partition_size, size]
        elif i == n_folds - 1:
            # Last fold: test slice is the final partition_size samples.
            index[i] = [0, size - partition_size, size - partition_size, size, 0, 0]
        else:
            index[i] = [0, test_start, test_start, test_end, test_end, size]

    return index

# PCA

In [None]:
def PCA(X_train, X_test, retained_variance=99):
    """Project train and test data onto the leading principal components.

    The components are the eigenvectors of the covariance matrix of
    ``X_train``. Enough leading components are kept for their cumulative
    share of the total variance to reach ``retained_variance`` percent.
    The same projection matrix is applied to both inputs; the data is not
    mean-centred before projecting.

    Args:
        X_train: 2-D array, samples in rows and features in columns. Used to
            estimate the covariance matrix.
        X_test: 2-D array with the same number of columns as ``X_train``.
        retained_variance: Minimum percentage (0-100) of the total variance
            the kept components must explain. Defaults to 99.

    Returns:
        Tuple ``(X_train_reduced, X_test_reduced)``.
    """
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    # Covariance between features (columns). atleast_2d covers the
    # single-feature case, where np.cov returns a 0-d array.
    covariance = np.atleast_2d(np.cov(X_train, rowvar=False))

    # The covariance matrix is symmetric, so eigh is the appropriate solver:
    # it returns real eigenvalues and orthonormal eigenvectors.
    eigen_values, eigen_vectors = np.linalg.eigh(covariance)

    # Sort in descending order of eigenvalue. Eigenvectors are the COLUMNS
    # of the returned matrix, so the columns are what must be reordered.
    order = np.argsort(eigen_values)[::-1]
    eigen_values = eigen_values[order]
    eigen_vectors = eigen_vectors[:, order]

    # Round-off can leave tiny negative eigenvalues; treat them as zero.
    eigen_values = np.clip(eigen_values, 0, None)

    total = np.sum(eigen_values)
    if total > 0:
        # Cumulative percentage of variance explained by the first k components.
        cumulative = np.cumsum(eigen_values / total * 100)
        # First position where the cumulative share reaches the threshold.
        n_components = int(np.searchsorted(cumulative, retained_variance)) + 1
        # Never ask for more components than exist (e.g. threshold missed by
        # floating-point round-off).
        n_components = min(n_components, eigen_values.size)
    else:
        # Constant data: there is no variance to rank, keep one component.
        n_components = 1

    # Columns of this matrix are the retained principal directions.
    principal_components = eigen_vectors[:, :n_components]

    # Reducing the dimensionality of the data
    X_train = np.dot(X_train, principal_components)
    X_test = np.dot(X_test, principal_components)

    return X_train, X_test

# KNN

In [None]:

# Per-fold metrics, collected as lists and converted to arrays after the loop.
area = []
acc = []

for ind in ten_fold_cross_validation(size):

    # Slice boundaries for this fold: two training ranges around one test range.
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_train = np.vstack((X[tn1_s:tn1_e], X[tn2_s:tn2_e]))
    Y_train = np.hstack((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))
    X_test = X[te_s:te_e]
    Y_test = Y[te_s:te_e]

    # Reduce dimensionality with the projection learned on the training fold.
    X_train, X_test = PCA(X_train, X_test)

    # k-nearest-neighbours classifier with k = 100.
    knn = KNeighborsClassifier(n_neighbors=100)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)

    # Area under ROC for this fold.
    score = roc_auc_score_multiclass(Y_test, Y_pred)
    area.append(score)

    # Accuracy for this fold.
    acc.append(accuracy_score(Y_test, Y_pred))

area = np.array(area)
acc = np.array(acc)

print("Results with PCA:")
print("Area under ROC curve:", area.mean())
print("Accuracy:", acc.mean())

# SVM

In [None]:

# Per-fold metrics, collected as lists and converted to arrays after the loop.
area = []
acc = []

for ind in ten_fold_cross_validation(size):

    # Slice boundaries for this fold: two training ranges around one test range.
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_train = np.vstack((X[tn1_s:tn1_e], X[tn2_s:tn2_e]))
    Y_train = np.hstack((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))
    X_test = X[te_s:te_e]
    Y_test = Y[te_s:te_e]

    # Reduce dimensionality with the projection learned on the training fold.
    X_train, X_test = PCA(X_train, X_test)

    # Support vector classifier with an RBF kernel.
    clf_svm = svm.SVC(kernel='rbf')
    clf_svm.fit(X_train, Y_train)
    Y_pred = clf_svm.predict(X_test)

    # Area under ROC for this fold.
    score = roc_auc_score_multiclass(Y_test, Y_pred)
    area.append(score)

    # Accuracy for this fold.
    acc.append(accuracy_score(Y_test, Y_pred))

area = np.array(area)
acc = np.array(acc)

print("Results with PCA:")
print("Area under ROC curve:", area.mean())
print("Accuracy:", acc.mean())

# Naive Bayes

In [None]:

# Per-fold metrics, collected as lists and converted to arrays after the loop.
area = []
acc = []

for ind in ten_fold_cross_validation(size):

    # Slice boundaries for this fold: two training ranges around one test range.
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_train = np.vstack((X[tn1_s:tn1_e], X[tn2_s:tn2_e]))
    Y_train = np.hstack((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))
    X_test = X[te_s:te_e]
    Y_test = Y[te_s:te_e]

    # Reduce dimensionality with the projection learned on the training fold.
    X_train, X_test = PCA(X_train, X_test)

    # Gaussian Naive Bayes classifier.
    gnb = GaussianNB()
    gnb.fit(X_train, Y_train)
    Y_pred = gnb.predict(X_test)

    # Area under ROC for this fold.
    score = roc_auc_score_multiclass(Y_test, Y_pred)
    area.append(score)

    # Accuracy for this fold.
    acc.append(accuracy_score(Y_test, Y_pred))

area = np.array(area)
acc = np.array(acc)

print("Results with PCA:")
print("Area under ROC curve:", area.mean())
print("Accuracy:", acc.mean())

# Logistic Regression

In [None]:

# Per-fold metrics for logistic regression on the raw (non-PCA) features.
area = np.array([])
acc = np.array([])

for ind in ten_fold_cross_validation(size):

    # Creating Partitions of X and Y Data
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_test = X[te_s:te_e, :]
    Y_test = Y[te_s:te_e]
    X_train = np.concatenate((X[tn1_s:tn1_e, :], X[tn2_s:tn2_e, :]))
    Y_train = np.concatenate((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))

    # Performing Logistic Regression
    logmodel = LogisticRegression()
    logmodel.fit(X_train, Y_train)
    Y_pred = logmodel.predict(X_test)

    # Computing Area under ROC. When the score cannot be computed for a fold,
    # that fold is left out of the average; appending unconditionally would
    # re-record whatever value `score` held from an earlier fold or cell.
    try:
        score = roc_auc_score_multiclass(Y_test, Y_pred)
    except ValueError as err:
        print("Skipping ROC AUC for this fold:", err)
    else:
        area = np.append(area, score)

    # Computing Accuracy at each step
    acc = np.append(acc, accuracy_score(Y_test, Y_pred))

# PCA is not applied in this cell, so the banner says so.
print("Results without PCA:")
print("Area under ROC curve:", np.mean(area))
print("Accuracy:", np.mean(acc))

# Decision Tree

In [None]:


# Per-fold metrics for a decision tree on the raw (non-PCA) features.
area = np.array([])
acc = np.array([])

for ind in ten_fold_cross_validation(size):

    # Creating Partitions of X and Y Data
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_test = X[te_s:te_e, :]
    Y_test = Y[te_s:te_e]
    X_train = np.concatenate((X[tn1_s:tn1_e, :], X[tn2_s:tn2_e, :]))
    Y_train = np.concatenate((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))

    # Performing Decision Tree Classification
    clf_tree = tree.DecisionTreeClassifier()
    clf_tree.fit(X_train, Y_train)
    Y_pred = clf_tree.predict(X_test)

    # Computing Area under ROC. When the score cannot be computed for a fold,
    # that fold is left out of the average; appending unconditionally would
    # re-record whatever value `score` held from an earlier fold or cell.
    try:
        score = roc_auc_score_multiclass(Y_test, Y_pred)
    except ValueError as err:
        print("Skipping ROC AUC for this fold:", err)
    else:
        area = np.append(area, score)

    # Computing Accuracy at each step
    acc = np.append(acc, accuracy_score(Y_test, Y_pred))

# PCA is not applied in this cell, so the banner says so.
print("Results without PCA:")
print("Area under ROC curve:", np.mean(area))
print("Accuracy:", np.mean(acc))

# Gradient Boosting

In [None]:

# Per-fold metrics for gradient boosting on the raw (non-PCA) features.
area = np.array([])
acc = np.array([])

for ind in ten_fold_cross_validation(size):

    # Creating Partitions of X and Y Data
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_test = X[te_s:te_e, :]
    Y_test = Y[te_s:te_e]
    X_train = np.concatenate((X[tn1_s:tn1_e, :], X[tn2_s:tn2_e, :]))
    Y_train = np.concatenate((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))

    # Performing Gradient Boosting
    model_gc = GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42
    )
    model_gc.fit(X_train, Y_train)
    Y_pred = model_gc.predict(X_test)

    # Computing Area under ROC
    score = roc_auc_score_multiclass(Y_test, Y_pred)
    area = np.append(area, score)

    # Computing Accuracy at each step
    acc = np.append(acc, accuracy_score(Y_test, Y_pred))

# PCA is not applied in this cell, so the banner says so.
print("Results without PCA:")
print("Area under ROC curve:", np.mean(area))
print("Accuracy:", np.mean(acc))

# Random Forest

In [None]:

# Per-fold metrics for a random forest on the raw (non-PCA) features.
area = np.array([])
acc = np.array([])

for ind in ten_fold_cross_validation(size):

    # Creating Partitions of X and Y Data
    tn1_s, tn1_e, te_s, te_e, tn2_s, tn2_e = ind.astype(int)
    X_test = X[te_s:te_e, :]
    Y_test = Y[te_s:te_e]
    X_train = np.concatenate((X[tn1_s:tn1_e, :], X[tn2_s:tn2_e, :]))
    Y_train = np.concatenate((Y[tn1_s:tn1_e], Y[tn2_s:tn2_e]))

    # Performing Random Forest (5 trees)
    clf = RandomForestClassifier(n_estimators=5)
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    # Computing Area under ROC
    score = roc_auc_score_multiclass(Y_test, Y_pred)
    area = np.append(area, score)

    # Computing Accuracy at each step
    acc = np.append(acc, accuracy_score(Y_test, Y_pred))

# PCA is not applied in this cell, so the banner says so.
print("Results without PCA:")
print("Area under ROC curve:", np.mean(area))
print("Accuracy:", np.mean(acc))