In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer


In [2]:
masses_data = pd.read_csv('mammographic_masses.data.txt', na_values=['?'], names = ['BI-RADS', 'age', 'shape', 'margin', 'density', 'severity'])
masses_data.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [3]:
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [4]:
masses_data.loc[(masses_data['age'].isnull()) |
              (masses_data['shape'].isnull()) |
              (masses_data['margin'].isnull()) |
              (masses_data['density'].isnull())]

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
...,...,...,...,...,...,...
778,4.0,60.0,,4.0,3.0,0
819,4.0,35.0,3.0,,2.0,0
824,6.0,40.0,,3.0,4.0,1
884,5.0,,4.0,4.0,3.0,1


In [5]:
masses_data.dropna(inplace=True)
masses_data.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [6]:
all_features = masses_data[['age', 'shape',
                             'margin', 'density' , 'severity']]



feature_names = ['age', 'shape', 'margin', 'density']

all_features.head()


Unnamed: 0,age,shape,margin,density,severity
0,67.0,3.0,5.0,3.0,1
2,58.0,4.0,5.0,3.0,1
3,28.0,1.0,1.0,3.0,0
8,57.0,1.0,5.0,3.0,1
10,76.0,1.0,4.0,3.0,1


# Now, we want best k-value for imputation using Severity as a target value

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rmse = lambda y, yhat: np.sqrt(mean_squared_error(y, yhat))

In [8]:
def optimize_k(data, target):
    errors = []
    for k in range(1, 20, 2):
        imputer = KNNImputer(n_neighbors=k)
        imputed = imputer.fit_transform(data)
        df_imputed = pd.DataFrame(imputed, columns=data.columns)
        
        X = df_imputed.drop(target, axis=1)
        y = df_imputed[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        error = rmse(y_test, preds)
        errors.append({'K': k, 'RMSE': error})
        
    return errors

In [9]:
k_errors = optimize_k(data=all_features, target='severity')

In [10]:
k_errors

[{'K': 1, 'RMSE': 0.4067693241903559},
 {'K': 3, 'RMSE': 0.4088236337764397},
 {'K': 5, 'RMSE': 0.40801620677998374},
 {'K': 7, 'RMSE': 0.41306260058855354},
 {'K': 9, 'RMSE': 0.4071006872578334},
 {'K': 11, 'RMSE': 0.4124152283544194},
 {'K': 13, 'RMSE': 0.40887166007685},
 {'K': 15, 'RMSE': 0.4086359606591112},
 {'K': 17, 'RMSE': 0.40684920742002706},
 {'K': 19, 'RMSE': 0.40888352342626466}]

Using Target Variable as Severity, K_value = '17' seems optimal value in the given range.

In [11]:
imputer = KNNImputer(n_neighbors=17)
imputed = imputer.fit_transform(all_features)
data_imputed = pd.DataFrame(imputed, columns=all_features.columns)

In [12]:
data_imputed.isna().sum()

age         0
shape       0
margin      0
density     0
severity    0
dtype: int64

In [13]:
data_imputed.head()

Unnamed: 0,age,shape,margin,density,severity
0,67.0,3.0,5.0,3.0,1.0
1,58.0,4.0,5.0,3.0,1.0
2,28.0,1.0,1.0,3.0,0.0
3,57.0,1.0,5.0,3.0,1.0
4,76.0,1.0,4.0,3.0,1.0


In [14]:
data_imputed.to_csv('Imputed_for_knn.csv')

In [17]:
import csv
import random
import math
import operator

def loadDataset(fname, split, trainingSet=[], testSet=[]):
    with open(fname, "rt") as csvfile:
        lines = csv.reader(csvfile)
        next(lines)
        dataset = list(lines)
        for x in range(len(dataset)-1):
            for y in range(6):
                dataset[x][y] = float(dataset[x][y])

            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])


# euclidean distance calculation
def euclideanDist(in1, in2, length):
    distance = 0
    for x in range(length):
        distance += pow((in1[x] - in2[x]), 2)

    return math.sqrt(distance)


# Neighbours
def getNeighbors(trainingSet, testIn, k):
    distance = []
    length = len(testIn)-1
    for x in range(len(trainingSet)):
        dist = euclideanDist(testIn, trainingSet[x], length)
        distance.append((trainingSet[x], dist))

    distance.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distance[x][0])

    return neighbors

def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1

    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

def getAcuuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1

    return correct/float(len(testSet))*100



for x in range(len(dataset)-1):

In [19]:
trainingSet = []
testSet = []
split = 0.67

loadDataset('Imputed_for_knn.csv', split, trainingSet, testSet)
print('Train set: ' + repr(len(trainingSet)))
print('Test Set: ' + repr(len(testSet)))

# Generate Predictions
predictions = []
k = 3
for x in range(len(testSet)):
    neighbors = getNeighbors(trainingSet, testSet[x], k)
    res = getResponse(neighbors)
    predictions.append(res)

    print("predicted = " + repr(res) + ", Actual = " + repr(testSet[x][-1]))

accuracy = getAcuuracy(testSet, predictions)
print("Accuracy: " + repr(accuracy) + " %")

Train set: 557
Test Set: 272
predicted = 1.0, Actual = 1.0
predicted = 1.0, Actual = 0.0
predicted = 1.0, Actual = 1.0
predicted = 1.0, Actual = 0.0
predicted = 1.0, Actual = 1.0
predicted = 1.0, Actual = 0.0
predicted = 0.0, Actual = 0.0
predicted = 1.0, Actual = 0.0
predicted = 0.0, Actual = 1.0
predicted = 0.0, Actual = 0.0
predicted = 0.0, Actual = 1.0
predicted = 0.0, Actual = 0.0
predicted = 0.0, Actual = 1.0
predicted = 0.0, Actual = 0.0
predicted = 1.0, Actual = 1.0
predicted = 1.0, Actual = 1.0
predicted = 0.0, Actual = 0.0
predicted = 0.0, Actual = 0.0
predicted = 0.0, Actual = 1.0
predicted = 0.0, Actual = 0.0
predicted = 1.0, Actual = 1.0
predicted = 1.0, Actual = 0.0
predicted = 1.0, Actual = 1.0
predicted = 0.0, Actual = 1.0
predicted = 1.0, Actual = 1.0
predicted = 1.0, Actual = 0.0
predicted = 0.0, Actual = 0.0
predicted = 0.0, Actual = 0.0
predicted = 1.0, Actual = 1.0
predicted = 0.0, Actual = 1.0
predicted = 0.0, Actual = 0.0
predicted = 0.0, Actual = 0.0
predicted =