In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the survey responses (file is decoded as cp1252) and discard the
# 'Response Id' column in a single chained expression
df = (
    pd.read_csv('16P.csv', encoding='cp1252')
    .drop(columns=['Response Id'])
)

# Show the first 5 rows as the cell output
df.head()

Unnamed: 0,You regularly make new friends.,You spend a lot of your free time exploring various random topics that pique your interest,Seeing other people cry can easily make you feel like you want to cry too,You often make a backup plan for a backup plan.,"You usually stay calm, even under a lot of pressure","At social events, you rarely try to introduce yourself to new people and mostly talk to the ones you already know",You prefer to completely finish one project before starting another.,You are very sentimental.,You like to use organizing tools like schedules and lists.,Even a small mistake can cause you to doubt your overall abilities and knowledge.,...,You believe that pondering abstract philosophical questions is a waste of time.,"You feel more drawn to places with busy, bustling atmospheres than quiet, intimate places.",You know at first glance how someone is feeling.,You often feel overwhelmed.,You complete things methodically without skipping over any steps.,You are very intrigued by things labeled as controversial.,You would pass along a good opportunity if you thought someone else needed it more.,You struggle with deadlines.,You feel confident that things will work out for you.,Personality
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,-1,0,0,0,0,0,ENFP
1,0,0,-2,-3,-1,2,-2,0,3,0,...,0,-2,0,2,0,-1,-1,-1,3,ISFP
2,0,0,2,0,-1,2,0,0,1,0,...,0,2,0,2,-1,0,1,2,1,INFJ
3,0,-1,3,-1,0,0,-2,0,-2,0,...,0,0,-1,-1,0,1,0,-2,-1,ISTP
4,0,0,-1,0,2,-1,-2,0,1,0,...,0,1,0,2,0,1,-1,2,-1,ENFJ


In [3]:
# Convert the DataFrame to a NumPy array.
# The frame mixes numeric answer columns with the string 'Personality' column,
# so the resulting array has dtype=object (see the repr in the next cell).
array = df.to_numpy()

In [4]:
# Preview the converted array; the last entry of each row is the personality label
array

array([[0, 0, 0, ..., 0, 0, 'ENFP'],
       [0, 0, -2, ..., -1, 3, 'ISFP'],
       [0, 0, 2, ..., 2, 1, 'INFJ'],
       ...,
       [0, 0, 1, ..., 0, -1, 'ISTP'],
       [0, 0, 1, ..., 1, 0, 'ISTJ'],
       [0, 0, 2, ..., 0, -1, 'INFJ']], dtype=object)

In [5]:
# Number of columns per row (the feature columns plus the trailing label column)
array.shape[1]

61

In [6]:
# Start from the DataFrame rather than from `array`, so that re-running this
# cell is idempotent and can never strip a second column from `array`.
data = df.to_numpy()

# Index of the last column, which holds the personality label
n = data.shape[1] - 1

# Extract the labels as a NumPy string array
labels = np.array(data[:, n].tolist())

# Feature matrix: every column except the label column (np.delete returns a new array)
array = np.delete(data, n, axis=1)

In [7]:
# Preview the feature array after the label column has been removed
array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, -2, ..., -1, -1, 3],
       [0, 0, 2, ..., 1, 2, 1],
       ...,
       [0, 0, 1, ..., -1, 0, -1],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 2, ..., 1, 0, -1]], dtype=object)

In [8]:
# Preview the extracted label array (one personality code per row)
labels

array(['ENFP', 'ISFP', 'INFJ', ..., 'ISTP', 'ISTJ', 'INFJ'], dtype='<U4')

In [9]:
# The sixteen personality type codes.
# A code's position in this list is its integer encoding.
personality_types = [
    "ESTJ", "ENTJ", "ESFJ", "ENFJ",
    "ISTJ", "ISFJ", "INTJ", "INFJ",
    "ESTP", "ESFP", "ENTP", "ENFP",
    "ISTP", "ISFP", "INTP", "INFP",
]


def encode_personality_type(personality_type):
    """Return the integer code (list position) of a personality type string.

    Raises ValueError when the string is not one of `personality_types`.
    """
    position = personality_types.index(personality_type)
    return position


def decode_personality_type(index):
    """Return the personality type string stored at position `index`.

    Raises IndexError when `index` is outside the list.
    """
    name = personality_types[index]
    return name

In [10]:
# Divide the given array into 5 folds for cross-validation.
def five_fold(array, fold_labels=None, size=12000):
    """Split features and labels into five consecutive folds of `size` rows.

    Split i (i = 0..4) holds out fold 4-i as the test set and trains on the
    other four folds, so the last fold is the first test set.

    Parameters
    ----------
    array : sliceable sequence of rows (e.g. a 2-D NumPy array)
        Feature rows. Only the first ``5 * size`` rows are used; if fewer rows
        are available the trailing folds are shorter (or empty).
    fold_labels : sequence, optional
        One label per row of `array`. When omitted, the notebook-level global
        `labels` is used, which keeps the original call ``five_fold(array)``
        working.
    size : int, optional
        Number of rows per fold (default 12000).

    Returns
    -------
    (X_trains, X_tests, y_trains, y_tests) : tuple of four 5-tuples
        The same objects are also stored in the globals of the same names,
        which is how the rest of the notebook reads them.

    Raises
    ------
    ValueError
        If `size` is not positive or the number of labels differs from the
        number of rows.
    """
    global X_trains,X_tests,y_trains,y_tests

    if fold_labels is None:
        # Backward-compatible default: fall back to the notebook-level labels
        fold_labels = labels

    if size <= 0:
        raise ValueError("size must be a positive integer")
    if len(fold_labels) != len(array):
        raise ValueError("array and labels must have the same number of rows")

    n_folds = 5
    bounds = [(k * size, (k + 1) * size) for k in range(n_folds)]
    folds = [array[start:stop] for start, stop in bounds]
    label_folds = [fold_labels[start:stop] for start, stop in bounds]

    X_train_sets, X_test_sets, y_train_sets, y_test_sets = [], [], [], []
    last = n_folds - 1
    # Hold out the last fold first, then the fourth, ... down to the first
    for held_out in range(last, -1, -1):
        # Training order: folds 0..3 with the held-out position filled by the last fold
        order = [last if k == held_out else k for k in range(last)]
        X_train_sets.append(np.concatenate([folds[k] for k in order]))
        y_train_sets.append(np.concatenate([label_folds[k] for k in order]))
        X_test_sets.append(folds[held_out])
        y_test_sets.append(label_folds[held_out])

    # Store the training and test sets for each fold in global variables
    X_trains = tuple(X_train_sets)
    X_tests = tuple(X_test_sets)
    y_trains = tuple(y_train_sets)
    y_tests = tuple(y_test_sets)

    return X_trains, X_tests, y_trains, y_tests

In [11]:
# Build the cross-validation splits; five_fold stores them in the globals
# X_trains, X_tests, y_trains and y_tests
five_fold(array)

In [12]:
def scale(array):
    """Min-max scale every column of `array` to the range [0, 1], in place.

    Parameters
    ----------
    array : mutable 2-D sequence (list of lists or NumPy array)
        Rows of numeric values. Each element is overwritten with
        ``(value - column_min) / (column_max - column_min)``.
        NOTE: for a NumPy array with an integer dtype the assigned fractions
        would be truncated; the notebook passes an object-dtype array.

    A column whose values are all equal has no range to scale by; its entries
    are set to 0.0 instead of dividing by zero. Empty input is left untouched.

    Returns None; the result is visible through the mutated argument.
    """
    # Get the number of rows and columns in the array
    n = len(array)
    if n == 0:
        return
    m = len(array[0])

    for i in range(m):
        # Read the column once, then derive its minimum and range
        column = [array[j][i] for j in range(n)]
        min_val = min(column)
        span = max(column) - min_val

        # Scale the values in the column
        for j in range(n):
            if span:
                array[j][i] = (column[j] - min_val) / span
            else:
                # Constant column: avoid ZeroDivisionError
                array[j][i] = 0.0

In [13]:
# Find the most common element in a list
def majority_vote(neighbors):
    """Return the label that occurs most often in `neighbors`.

    Parameters
    ----------
    neighbors : list of hashable labels
        knn passes the labels ordered from nearest to farthest.

    Ties are resolved in favour of the label that appears first in the list
    (i.e. the nearest neighbour), so the result does not depend on set
    iteration order. Counting is a single pass over the list.

    Raises ValueError when `neighbors` is empty.
    """
    counts = {}
    for label in neighbors:
        counts[label] = counts.get(label, 0) + 1

    # dicts keep insertion order and max() returns the first maximal key,
    # so among tied labels the earliest-seen one wins
    return max(counts, key=counts.get)

In [14]:
def majority_vote_weighted(neighbors):
    """Return the label with the largest inverse-distance-weighted vote.

    Parameters
    ----------
    neighbors : iterable of (distance, label) pairs
        Each neighbour contributes ``1 / distance`` to its label's total.

    A neighbour at distance 0 (an exact match) cannot be weighted by
    ``1 / distance``. When such neighbours exist, the vote is decided among
    them alone by simple count, instead of raising ZeroDivisionError.

    Ties are resolved in favour of the label encountered first.
    Raises ValueError when `neighbors` is empty.
    """
    weighted_totals = {}
    exact_matches = {}
    for distance, label in neighbors:
        if distance == 0:
            # Exact match: count it separately, it outranks any finite weight
            exact_matches[label] = exact_matches.get(label, 0) + 1
        else:
            # Accumulate the inverse-distance weight for this label
            weighted_totals[label] = weighted_totals.get(label, 0.0) + 1 / distance

    # Exact matches, if any, decide the vote on their own
    tally = exact_matches or weighted_totals

    # Return the label with the highest total
    return max(tally, key=tally.get)

In [15]:
# Perform k-nearest neighbors classification on the given data
def knn(X_train, y_train, X_test, y_test, k_array, standard):
    """Classify `X_test` with k-NN for every k in `k_array` and score the result.

    Parameters
    ----------
    X_train, X_test : 2-D arrays of feature rows
    y_train, y_test : sequences of personality type strings
        Encoded to integers with `encode_personality_type` before use.
    k_array : sequence of int
        Neighbourhood sizes to evaluate.
    standard : bool
        True  -> unweighted majority vote over the k nearest labels.
        False -> inverse-distance-weighted vote over (distance, label) pairs.

    Returns
    -------
    (accuracies, precisions, recalls) : three lists with one entry per k, in
        the order of `k_array`. Each precisions/recalls entry is itself the
        per-class list returned by `metrics`.
    """
    # Encode the personality types
    y_train = [encode_personality_type(y) for y in y_train]
    y_test = [encode_personality_type(y) for y in y_test]

    # No vote ever looks past the largest k, so only that many neighbours are
    # kept per test sample instead of the whole sorted training set
    max_k = max(k_array, default=0)

    neighbors = []
    for x in X_test:
        # Euclidean distances between the test sample and every training sample
        distances = (np.sum((x - X_train)**2, axis=1))**(1/2)

        # Nearest-first (distance, label) pairs, truncated to the largest k
        ranked = sorted(zip(distances, y_train))[:max_k]

        if standard:
            # The unweighted vote only needs the labels
            neighbors.append([label for _, label in ranked])
        else:
            # The weighted vote needs the distances as well
            neighbors.append(ranked)

    accuracies, precisions, recalls = [], [], []

    # Calculate the metrics for each value of k
    for k in k_array:
        nearest = [ranked[:k] for ranked in neighbors]
        accuracy, precision, recall = metrics(X_train, y_train, X_test, y_test, nearest, standard)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)

    return accuracies, precisions, recalls

In [16]:
def metrics(X_train, y_train, X_test, y_test, neighbors, standard):
    """Compute accuracy plus per-class precision and recall for one value of k.

    Parameters
    ----------
    X_train, y_train, X_test :
        Not used by the computation; kept so existing callers keep working.
    y_test : sequence of encoded (integer) true labels
    neighbors : sequence with one entry per test sample
        The nearest neighbours of that sample: labels when `standard` is
        True, (distance, label) pairs otherwise.
    standard : bool
        Selects `majority_vote` (True) or `majority_vote_weighted` (False).

    Returns
    -------
    (accuracy, precisions, recalls)
        `accuracy` is a float; `precisions` and `recalls` are lists with one
        value per entry of `personality_types`, in that order.

    A ratio whose denominator is zero (empty test set, a class absent from
    the test set, or a class that was never predicted) is reported as 0.0
    instead of raising ZeroDivisionError.
    """
    # Use the matching vote function to classify the test samples
    vote = majority_vote if standard else majority_vote_weighted
    y_pred = [vote(candidates) for candidates in neighbors]

    # One pass over the predictions to collect per-class confusion counts
    true_pos, false_pos, false_neg = {}, {}, {}
    for actual, predicted in zip(y_test, y_pred):
        if actual == predicted:
            true_pos[actual] = true_pos.get(actual, 0) + 1
        else:
            # A miss is a false negative for the true class and a false
            # positive for the predicted class
            false_neg[actual] = false_neg.get(actual, 0) + 1
            false_pos[predicted] = false_pos.get(predicted, 0) + 1

    # Calculate the accuracy
    total = len(y_test)
    accuracy = sum(true_pos.values()) / total if total else 0.0

    precisions, recalls = [], []
    for types in personality_types:
        # Encode the types label
        types_encoded = encode_personality_type(types)

        TP = true_pos.get(types_encoded, 0)
        FP = false_pos.get(types_encoded, 0)
        FN = false_neg.get(types_encoded, 0)

        # Calculate the recall and precision for the personality type
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        precision = TP / (TP + FP) if (TP + FP) else 0.0

        # Add the recall and precision to the lists
        precisions.append(precision)
        recalls.append(recall)

    return accuracy, precisions, recalls

In [17]:
# Print the results
def print1(accuracies, precisions, recalls):
    """Write the evaluation results to standard output.

    The accuracies go on a single line, each followed by a space; every item
    of `precisions` and of `recalls` is printed on a line of its own.
    """
    def show_rows(header, rows):
        # Header line followed by one line per row
        print(header)
        for row in rows:
            print(row)

    print("Accuracies: ")
    print("".join(str(value) + " " for value in accuracies), end="")

    show_rows("\n\nPrecisions: ", precisions)
    show_rows("\nRecalls: ", recalls)

    print()

In [18]:
# Odd neighbourhood sizes 1, 3, ..., 9 to evaluate
k_array = list(range(1, 10, 2))

In [19]:
# Standard (unweighted majority vote) k-NN on each cross-validation split
for fold in zip(X_trains, y_trains, X_tests, y_tests):
    accuracies, precisions, recalls = knn(*fold, k_array, True)
    print1(accuracies, precisions, recalls)

Accuracies: 
0.9616666666666667 0.9833333333333333 0.9841666666666666 0.9833333333333333 0.9825 

Precisions: 
[0.9620253164556962, 0.8902439024390244, 0.9285714285714286, 0.975609756097561, 0.9753086419753086, 0.9662921348314607, 0.9473684210526315, 0.9605263157894737, 0.9866666666666667, 0.9692307692307692, 0.9875, 0.958904109589041, 1.0, 0.9861111111111112, 0.9210526315789473, 0.9696969696969697]
[0.9871794871794872, 0.9868421052631579, 0.9636363636363636, 0.9878048780487805, 0.9761904761904762, 0.9888888888888889, 1.0, 0.9863013698630136, 0.9868421052631579, 0.9565217391304348, 0.9876543209876543, 0.9726027397260274, 1.0, 1.0, 0.9487179487179487, 1.0]
[0.9746835443037974, 0.9868421052631579, 0.9473684210526315, 1.0, 0.9759036144578314, 0.978021978021978, 1.0, 0.9864864864864865, 0.9868421052631579, 0.9701492537313433, 0.9876543209876543, 0.9861111111111112, 1.0, 1.0, 0.961038961038961, 1.0]
[0.9625, 0.9868421052631579, 0.9818181818181818, 1.0, 0.9759036144578314, 0.9777777777777777

In [20]:
# Weighted k-NN on each cross-validation split: standard=False selects the
# inverse-distance-weighted vote rather than the plain majority vote
for fold in zip(X_trains, y_trains, X_tests, y_tests):
    accuracies, precisions, recalls = knn(*fold, k_array, False)
    print1(accuracies, precisions, recalls)

Accuracies: 
0.9616666666666667 0.9841666666666666 0.985 0.985 0.9833333333333333 

Precisions: 
[0.9620253164556962, 0.8902439024390244, 0.9285714285714286, 0.975609756097561, 0.9753086419753086, 0.9662921348314607, 0.9473684210526315, 0.9605263157894737, 0.9866666666666667, 0.9692307692307692, 0.9875, 0.958904109589041, 1.0, 0.9861111111111112, 0.9210526315789473, 0.9696969696969697]
[0.9871794871794872, 0.9868421052631579, 0.9642857142857143, 1.0, 0.9759036144578314, 0.978021978021978, 1.0, 0.9733333333333334, 0.9868421052631579, 0.9705882352941176, 0.9876543209876543, 0.9861111111111112, 1.0, 1.0, 0.9487179487179487, 1.0]
[0.9871794871794872, 0.9868421052631579, 0.9473684210526315, 1.0, 0.9759036144578314, 0.978021978021978, 1.0, 0.9864864864864865, 0.9868421052631579, 0.9705882352941176, 0.9876543209876543, 0.9861111111111112, 1.0, 1.0, 0.961038961038961, 1.0]
[0.9871794871794872, 0.9868421052631579, 0.9818181818181818, 1.0, 0.9759036144578314, 0.978021978021978, 1.0, 0.9864864864

In [21]:
# Min-max scale the feature array (scale rewrites `array` in place), then
# rebuild the splits so they are cut from the scaled values.
# NOTE: the column minima/maxima are taken over all rows, including the rows
# that later serve as test folds.
scale(array)
five_fold(array)

# Standard (unweighted majority vote) k-NN on each scaled split
for fold in zip(X_trains, y_trains, X_tests, y_tests):
    accuracies, precisions, recalls = knn(*fold, k_array, True)
    print1(accuracies, precisions, recalls)

Accuracies: 
0.955 0.9733333333333334 0.9783333333333334 0.98 0.9833333333333333 

Precisions: 
[0.9487179487179487, 0.9240506329113924, 0.9473684210526315, 0.9642857142857143, 0.987012987012987, 0.9438202247191011, 0.96, 0.9726027397260274, 0.9866666666666667, 0.9538461538461539, 0.9873417721518988, 0.9466666666666667, 0.948051948051948, 0.9452054794520548, 0.8846153846153846, 0.9848484848484849]
[0.9390243902439024, 0.974025974025974, 0.9642857142857143, 0.9875, 0.9876543209876543, 0.9775280898876404, 1.0, 0.9861111111111112, 0.9868421052631579, 0.9558823529411765, 0.9876543209876543, 0.9342105263157895, 1.0, 0.972972972972973, 0.9240506329113924, 1.0]
[0.9390243902439024, 0.974025974025974, 0.9818181818181818, 0.9875, 0.975609756097561, 0.9777777777777777, 1.0, 0.9859154929577465, 0.9868421052631579, 0.9705882352941176, 0.9876543209876543, 0.9726027397260274, 1.0, 0.9864864864864865, 0.9367088607594937, 1.0]
[0.9625, 0.9868421052631579, 0.9818181818181818, 1.0, 0.963855421686747, 0.

In [22]:
# Weighted k-NN on each scaled split: standard=False selects the
# inverse-distance-weighted vote rather than the plain majority vote
for fold in zip(X_trains, y_trains, X_tests, y_tests):
    accuracies, precisions, recalls = knn(*fold, k_array, False)
    print1(accuracies, precisions, recalls)

Accuracies: 
0.955 0.9758333333333333 0.9808333333333333 0.9816666666666667 0.9833333333333333 

Precisions: 
[0.9487179487179487, 0.9240506329113924, 0.9473684210526315, 0.9642857142857143, 0.987012987012987, 0.9438202247191011, 0.96, 0.9726027397260274, 0.9866666666666667, 0.9538461538461539, 0.9873417721518988, 0.9466666666666667, 0.948051948051948, 0.9452054794520548, 0.8846153846153846, 0.9848484848484849]
[0.9506172839506173, 0.9866666666666667, 0.9642857142857143, 1.0, 0.9876543209876543, 0.9666666666666667, 1.0, 0.9864864864864865, 1.0, 0.9558823529411765, 0.9876543209876543, 0.9594594594594594, 0.972972972972973, 0.972972972972973, 0.9240506329113924, 1.0]
[0.9625, 0.9868421052631579, 0.9818181818181818, 1.0, 0.975609756097561, 0.9775280898876404, 0.9864864864864865, 0.9861111111111112, 0.9868421052631579, 0.9705882352941176, 0.9876543209876543, 0.9861111111111112, 0.9864864864864865, 0.9864864864864865, 0.9367088607594937, 1.0]
[0.9746835443037974, 0.9868421052631579, 0.98181