In [1]:
import math

import numpy as np
import pandas as pd

In [2]:
def find_maxes(array):
    """
    This function takes in a 3D array of intensity values for pixels in an AFM micrograph and returns the maximum
    value for each of the scan types as a vector
    
    input: numpy ndarray of 3 dimensions containing AFM scan data
    output: numpy array of 1 dimension containing the maximum value of each scan type
    """
    x, y, z = array.shape
    maxVec = np.empty(z)
    
    #initialize the maxVec to contain the 1st pixel's values
    for h in range(z):
        maxVec[h] = array[0, 0, h]
    
    #look for the maximum value for each scan type
    for i in range(z):
        for j in range(x):
            for k in range(y):
                if maxVec[i, j, k] < array[i, j, k]:
                    maxVec[i, j, k] = array[i, k, j]
                elif maxVec[i, j, k] > array[i, j, k]:
                    pass
                else:
                    pass
    
    return maxVec

In [3]:
def euc_dist(maxVec, pixel1, pixel2):
    """
    This function takes in a vector of maximum values for the sample's different data types and two pixels as
    vectors of their features and calculates the Euclidean distance. It then normalizes these differences for 
    each scan type so there isn't uneven weighting for a given feature type. Finally, it returns the adjusted 
    Euclidean distance.
    
    This function assumes that only numeric data is in the pixel features.
    
    inputs: a 1D numpy array containing the maximum values of a given sample's scan types
            a 1D numpy array containing the feature values of the pixel being examined
            a 1D numpy array containing the feature values of the neighboring pixel
    
    outputs: a numeric value describing the normalized euclidean distance between the pixel and its neighbor
    """
    dist_sqrd = 0
    
    #calculate the normalized square of the euclidean distance. 
    for i in range(len(pixel1 + 1)):
        diff = 0
        diff = (pixel1[i] - pixel2[i])/maxVec[i]
        
        dist_sqrd += diff
    
    dist = math.sqrt(abs(dist_sqrd))
    
    return dist

In [4]:
def neighbor_locater(x, y):
    """
    This function takes in the xy location of a pixel in a 3D array of information and locates its neighbors, 
    returning these locations as a 2D array of 8 different xy coordinates.
    
    input: x - The row of pixels that contains the pixel in question
           y - The column of pixels that contains the pixel in question
    
    output: neighbor_locations - a 2D array containing the xy location of the 8 nearest neighbors
    """
    
    neighbors = np.empty([8, 2])
    
    neighbors[0, 0] = x - 1
    neighbors[0, 1] = y - 1
    
    neighbors[1, 0] = x - 1
    neighbors[1, 1] = y
    
    neighbors[2, 0] = x - 1
    neighbors[2, 1] = y + 1
    
    neighbors[3, 0] = x
    neighbors[3, 1] = y - 1
    
    neighbors[4, 0] = x
    neighbors[4, 1] = y + 1
    
    neighbors[5, 0] = x + 1
    neighbors[5, 1] = y - 1
    
    neighbors[6, 0] = x + 1
    neighbors[6, 1] = y
    
    neighbors[7, 0] = x + 1
    neighbors[7, 1] = y + 1
    
    return neighbor_locations

In [5]:
def neighbor_properties(scan, neighbor_locations):
    """
    This function takes in a 3D np.ndarray of AFM data and a 2D np.array of pixel xy locations. These pixels are the
    8 neighbors of a given pixel. It returns a 3D np.ndarray of the 8 neighbors' AFM values
    
    inputs: scan - a 3D np.ndarray with 5 mechanical properties and classifications (1 or more) of an xy array of
                    pixels.
            neighbor_locations - a 2D np.array of the xy locations of pixels surrounding a given pixel
            
    outputs: neighbor_props - a 2D np.array of the mechanical properties and classifications of a given pixel's
                    8 neighboring pixels
    """
    x, y, z = scan.shape
    
    neighbor_props = np.empty([8, z])
    
    for i in range(8):
        xx, yy = neighbor_locations[i]
        for j in range(z):
            neighbor_props[i, j] = scan[xx, yy, j]
    
    return neighbor_props

In [7]:
def euclidean_classifier_pixel(pixel, neighbor_props, maxVec):
    """
    This function takes in a pixel as a vector of the scan types of a nanomechanical mapping AFM micrograph.
    Using a normalized euclidean distance, the similarity of the pixel in question with its neighboring pixels
    is factored into its own classification.
    
    inputs: pixel - a 1D np.array containing its mechanical properties (0-4) and its own classification (5) and any
                    previous classifications (6+)
            neighbors - a 2D np.array containing the mechanical properties of the 8 neighbors of the pixel in question
            maxVec - a 1D np.array containing the maximum values for each scan type within the whole scan
    
    outputs: euc_classified_pixel - a 1D np.array containing the same 6 previous properties, the old classification, 
                plus this new classification
    """
    
    neighbor_distances = np.empty([8, 6])
    
    
    
    
    for i in range(8):
        for j in range(5):
            neighbor_distances[i, j] = euc_dist(maxVec, pixel, neighbor_props[i, j])
    
    similar_neighbor_classifications = 0 
    
    for i in range(8):
        total_distance = 0
        
        for j in range(5):
            total_distance += neighbor_distances[i, j]
        
        if (total_distance/5) <= 0.3:       #assume that there can be an overall 30% fluctuation pixel-to-pixel
            similar_neighbor_classifications += 1
            neighbor_distances[i, 5] = 1    #flag the similar pixels for later
        else:
            neighbor_distances[i, 5] = 0
            
    if similar_neighbor_classifications >=3: #if at least 3 neighbors are similar, the pixel is not abberant
        pass
    else: #if the pixel is abberant, it is replaced with the average of its neighbors and then classified
        
        
        
    return euc_classified_pixel

IndentationError: expected an indented block (<ipython-input-7-647918ec08d1>, line 41)

In [None]:
def euclidean_classifier_scan(sampleName):
    """
    This function takes in the name of the sample, retrieves the file, and then calls other functions to complete a
    classification of each pixel in the scan that incorporates its 8 nearest neighbors' euclidean distance. The edge
    pixels are excluded from classification, but included as neighbors, for increased accuracy.
    
    inputs: sampleName - a string of the sample's name, excluding the filetype extension. It is assumed that all files
                            are .txt filetype
    outputs: euc_classified_scan - a 3D np.ndarray that contains the aggregated AFM scans, any previous
                            classifications, and the euclidean classification.
                    *also writes a 2D .txt file containing all of the above information
    """
    
    scan = np.loadtxt('../Data/AFM/AggregatedData/%s.txt'% (sampleName))
    
    x2, z = scan.shape
    x = y = int(math.sqrt(x2))
    scan = scan.reshape(x, y, z)
    
    euc_classified_scan = np.empty(x, y)
    
    maxVec = find_maxes(scan)
    
    for i in range(1, x-1):
        for j in range(1, y-1):
            neighbors = neighbor_locater(i, j)
            neighbor_props = neighbor_properties(scan, neighbors)
            
            euc_classified_scan[i][j] = euclidean_classifier_pixel(scan[i][j], neighbor_props, maxVec)
    
    scan_and_classif = np.dstack((scan, euc_classified_scan))
    
    scan_and_classif.reshape(x2, z+1)
    
    np.savetxt('../Data/AFM/AggregatedData/%s.txt'%(sampleName), two_dim_aggr)
    
    return euc_classified_scan

## Beyond this point are the remaining parts of the knn classifier that I created for the SEDS Homework and haven't used above

In [None]:
def neighbor_ident(database, inputdata):
    """Takes in a dataframe and locates each entry's k nearest neighbors in the archived database"""
    
    dist_matrix = pd.DataFrame() #initialize empty dataframe to hold the distance information
    knns = pd.DataFrame() #initialize empty dataframe to hold the sorted nearest neighbors
    
    for index1, row1 in inputdata.iterrows(): #calculate euclidean distance and save it
        for index2, row2 in database.iterrows():
            dist_matrix.loc[index1, index2] = knn.euc_dist(row1, row2)
        
        knn_index1 = dist_matrix.loc[index1].sort_values() #sort the euclidean distances 
        sorted_neighbors = list(knn_index1.index) #the indexes refer to the neighbors, save these not their dist.
        knns[index1] = sorted_neighbors

    return knns

In [None]:
def class_prediction(inputdata, database, k):
    """Takes in a test matrix, the training matrix, and user choice of k
    then makes a class prediction and returns a series of these predictions"""
    
    k_neighbors = pd.DataFrame()
    
    for i in range(k):
        k_neighbors = k_neighbors.append(knn.neighbor_ident(database, inputdata).loc[i])
    
    levels, labels = pd.factorize(database.Type) #Turns string type class titles and makes unique integers out of them
    y = levels #index of the unique class integers

    predictions = pd.Series(len(inputdata))
    
    for i in range(len(inputdata)): #Determine class of each of the k nearest neighbors
        type1 = 0
        type2 = 0
        type3 = 0
        
        for j in range(k):
            if y[k_neighbors[i][j]] == y[0]:
                type1 += 1
            elif y[k_neighbors[i][j]] == y[5]:
                type2 += 1
            elif y[k_neighbors[i][j]] == y[7]:
                type3 += 1
            else:
                pass

        type_count = {y[0]:type1, y[5]:type2, y[7]:type3} # Classify the testing point based on the majority type
        predictions[i] = max(type_count, key=type_count.get)  # around it

    return predictions

In [None]:
def accuracy_check(k, inputdata):
    """Takes in a k value and testing dataframe. Compares the predicted types of the testing data
        versus their actual types. Returns the number of correct guesses"""
    
    number_correct = 0
    inputdata_classes = pd.Series(len(inputdata))
    
    for i in range(len(inputdata)): # Makes a series of the actual class types
        if inputdata.loc[i][3] == 'PT':
            inputdata_classes[i] = 0
        elif inputdata.loc[i][3] == 'TM':
            inputdata_classes[i] = 1
        elif inputdata.loc[i][3] == 'Alk':
            inputdata_classes[i] = 2
        else:
            pass
    
    for i in range(len(inputdata)): # Counts the number of predictions that are correct for a given k
        if knn.class_prediction(inputdata, database, k)[i] == inputdata_classes[i]:
            number_correct += 1
        else:
            pass
        
    return number_correct

In [None]:
def k_choice(database, inputdata):
    """Takes in training data and testing data, then returns a dictionary
        of possible k values and their number of correct predictions"""
    
    possible_k = {}
    
    for i in range (1, len(database)+1):
        possible_k[i] = (knn.accuracy_check(i, inputdata)/len(inputdata))*100
        
    return

In [None]:
def knn_interface(database, inputdata, k):
    """A function to handle user input and interface with the kNN functions."""
    
    classifications = pd.Series(len(inputdata))
    predictions = pd.Series(len(inputdata))
    
    for i in range(len(inputdata)):
        predictions[i] = knn.class_prediction(inputdata, database, k)[i]
        
    for i in range(len(inputdata)):
        if predictions[i] == 0:
            classifications[i] = 'PT'
        elif predictions[i]== 1:
            classifications[i] = 'TM'
        elif predictions[i] == 2:
            classifications[i] = 'Alk'
        else:
            classifications[i] = 'Unknown'
    
    return classifications