In [29]:
import numpy as np

# Step 1: Function to load data from file and process it into a matrix and a list of labels
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is stripped and split on tab characters. The first
    three fields are stored as one row of the feature matrix and the last
    field is kept, as a string, as that row's class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``:
        ``returnMat`` is a float array of shape ``(n_rows, 3)`` and
        ``classLabelVector`` is a list with one label string per row.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature field cannot be converted to a float
            (raised by NumPy when the row is assigned).
    """
    # Read the file once; the context manager guarantees the handle is closed.
    with open(filename) as fr:
        # Blank lines (for example a trailing newline at end of file) carry no
        # sample, so they are skipped instead of failing the float conversion.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # One matrix row per sample, three features per sample.
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # NumPy converts the three string fields to floats on assignment.
        returnMat[index, :] = listFromLine[0:3]
        # The label is whatever comes last on the line.
        classLabelVector.append(listFromLine[-1])
    return returnMat, classLabelVector

# Step 2: Function to normalize the dataset
def autoNorm(dataSet):
    """Min-max scale each feature column of ``dataSet`` into the range [0, 1].

    Each value is transformed as ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
        ``normDataSet`` is the scaled array with the same shape as ``dataSet``;
        ``ranges`` is the per-column divisor that was used (``max - min``, or
        1 for a constant column); ``minVals`` is the per-column minimum.
        The three values satisfy
        ``normDataSet == (dataSet - minVals) / ranges``, so new samples can be
        scaled the same way.
    """
    # Column-wise extremes (axis 0 runs down the rows).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; dividing by it would fill the column with
    # NaN. Use 1 instead so such a column is scaled to all zeros.
    ranges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no explicit
    # tiling of minVals / ranges is needed.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals

# Step 3: k-Nearest Neighbors algorithm for classification
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distance is the Euclidean distance between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify (sequence or 1-D array with one value
            per column of ``dataSet``).
        dataSet: 2-D NumPy array of known samples, one per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to
            ``dataSet[i]``.
        k: Number of neighbours that vote. Values larger than the number of
            samples are reduced to the number of samples.

    Returns:
        The label with the most votes. On a tie, the tied label that was
        encountered first when walking the neighbours from nearest to
        farthest wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are samples.
    k = min(k, dataSetSize)

    # Broadcasting subtracts every row of dataSet from inX in one step.
    diffMat = np.asarray(inX) - dataSet
    # Row-wise sum of squared differences, then square root -> Euclidean distance.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Sample indices ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()

    # Tally the labels of the k closest samples. Dict insertion order records
    # which label was seen first, which is what decides ties below.
    classCount = {}
    for neighbourIndex in sortedDistIndices[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first key holding the highest count, matching a stable
    # descending sort of the tally.
    return max(classCount, key=classCount.get)

# Step 4: Function to test classifier accuracy
def datingClassTest(filename='datingTestSet.txt', hoRatio=0.10, k=3):
    """Estimate the classifier's error rate with a simple hold-out split.

    The data file is loaded and normalized, the first ``hoRatio`` fraction of
    the rows is used as the test set and the remaining rows as the reference
    set for ``classify0``. One line is printed per test sample, followed by
    the overall error rate.

    Args:
        filename: Path of the tab-separated data file to load.
        hoRatio: Fraction of the rows (taken from the top of the file) held
            out for testing.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The error rate (misclassified test samples / test samples) as a float.

    Raises:
        ValueError: If the split leaves no test samples or no reference
            samples.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of leading rows held out as the test set.
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        # Avoids a division by zero below (no test rows) and an empty
        # reference set (all rows held out).
        raise ValueError(
            f"hoRatio={hoRatio} with {m} samples leaves {numTestVecs} test "
            f"samples; both the test set and the reference set must be non-empty"
        )

    # The reference set is the same for every test sample, so slice it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print(f"the classifier came back with: {classifierResult}, the real answer is: {datingLabels[i]}")
        if classifierResult != datingLabels[i]:
            errorCount += 1

    errorRate = errorCount / float(numTestVecs)
    print(f"the total error rate is: {errorRate}")
    return errorRate

# Step 5: Function to predict the match type for a new person based on input features
def classifyPerson(filename='datingTestSet.txt', k=3):
    """Interactively classify one person from three typed-in features.

    Prompts for the three feature values, loads and normalizes the reference
    data, scales the new sample with the same minimums and ranges, and prints
    the label predicted by ``classify0``.

    Args:
        filename: Path of the tab-separated reference data file.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If an answer cannot be parsed as a float.
    """
    # Answers are collected first, in the same order as the original prompts.
    percentGames = float(input("percentage of time spent playing video games? "))
    ffMiles = float(input("frequent flier miles earned per year? "))
    iceCream = float(input("liters of ice cream consumed per year? "))

    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Column order here is miles, gaming percentage, ice cream, which differs
    # from the prompt order.
    # NOTE(review): assumes the data file stores its columns in that order - confirm.
    inArr = np.array([ffMiles, percentGames, iceCream])
    # The new sample must be scaled exactly like the reference data before
    # distances are computed.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, k)
    print("You will probably like this person:", classifierResult)

# Step 6: Load and normalize data, test classifier accuracy, and classify a new person
# NOTE: these calls are live and run whenever the cell executes. datingClassTest()
# prints one line per test sample and classifyPerson() waits for keyboard input.
# Comment out the last two calls to only define the functions and load the data.
datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
normMat, ranges, minVals = autoNorm(datingDataMat)
datingClassTest()
classifyPerson()


the classifier came back with: largeDoses, the real answer is: largeDoses
the classifier came back with: smallDoses, the real answer is: smallDoses
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: largeDoses, the real answer is: largeDoses
the classifier came back with: largeDoses, the real answer is: largeDoses
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: largeDoses, the real answer is: largeDoses
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: didntLike, the real answer is: didntLike
the classifier came back with: smallDoses, the real answer is: smallDoses
the classifier came back with: didntLike, the real a

percentage of time spent playing video games?  10
frequent flier miles earned per year?  10000
liters of ice cream consumed per year?  0.5


You will probably like this person: smallDoses


In [5]:
# Show the raw (un-normalized) feature matrix returned by file2matrix.
# Depends on the cell above having been run first, since that is where
# datingDataMat is assigned.
datingDataMat

array([[4.0920000e+04, 8.3269760e+00, 9.5395200e-01],
       [1.4488000e+04, 7.1534690e+00, 1.6739040e+00],
       [2.6052000e+04, 1.4418710e+00, 8.0512400e-01],
       ...,
       [2.6575000e+04, 1.0650102e+01, 8.6662700e-01],
       [4.8111000e+04, 9.1345280e+00, 7.2804500e-01],
       [4.3757000e+04, 7.8826010e+00, 1.3324460e+00]])

In [None]:
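# Sample answers for classifyPerson(), in prompt order
# (gaming %, frequent flier miles, liters of ice cream), with the noted outcome:
# small dose:     10, 10000, 0.5
# large dose:     10, 50000, 0.4
# didn't like it: 13, 75000, 1.2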
