In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', names = ["Class", "AGE GP", "menopause", "tumor-size", "inv-nodes", "node-caps", "deg-malig", "breast", "breast-quad", "irradiat"])

In [3]:
df.head()

Unnamed: 0,Class,AGE GP,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [4]:
df = df.replace({'AGE GP' : {'10-19' : 0, '20-29' : 1, '30-39' : 2, '40-49' : 3, '50-59' : 4, '60-69' : 5, '70-79' : 6, '80-89' : 7, '90-99' : 8, '?' : np.NaN}})
df = df.replace({'menopause' : {'lt40' : 0, 'ge40' : 1, 'premeno' : 2, '?' : np.NaN}})
df = df.replace({'tumor-size' : {'0-4' : 0, '5-9' : 1, '10-14' : 2, '15-19' : 3, '20-24' : 4, '25-29' : 5, '30-34' : 6, '35-39' : 7, '40-44' : 8, '45-49' : 9, '50-54' : 10, '55-59' : 11, '?' : np.NaN}})
df = df.replace({'inv-nodes' : {'0-2' : 0, '3-5' : 1, '6-8' : 2, '9-11' : 3, '12-14' : 4, '15-17' : 5, '18-20' : 6, '21-23' : 7, '24-26' : 8, '27-29' : 9, '30-32' : 10, '33-35' : 11, '36-39' : 12, '?' : np.NaN}})
df = df.replace({'node-caps' : {'yes' : 1, 'no' : 0, '?' : np.NaN}})
df = df.replace({'breast' : {'left' : 0, 'right' : 1, '?' : np.NaN}})
df = df.replace({'breast-quad' : {'left_up' : 0, 'left_low' : 1, 'right_up' : 2, 'right_low' : 3, 'central' : 4, '?' : np.NaN}})
df = df.replace({'irradiat' : {'no' : 0, 'yes' : 1, '?' : np.NaN}})
df = df.replace({'Class' : {'no-recurrence-events' : 0, 'recurrence-events' : 1, '?' : np.NaN}})

In [5]:
df.head()

Unnamed: 0,Class,AGE GP,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,2,2,6,0,0.0,3,0,1.0,0
1,0,3,2,4,0,0.0,2,1,2.0,0
2,0,3,2,4,0,0.0,2,0,1.0,0
3,0,5,1,3,0,0.0,2,1,0.0,0
4,0,3,2,0,0,0.0,2,1,3.0,0


In [6]:
df = df[pd.notnull(df['node-caps'])]

In [7]:
for i in range(1, 10) :
    mini = min(df.iloc[:, i])
    maxi = max(df.iloc[:, i])
    df.iloc[:, i] = (df.iloc[:, i] - mini) / (maxi - mini)
df.head()

Unnamed: 0,Class,AGE GP,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,0.2,1.0,0.6,0.0,0.0,1.0,0.0,0.25,0.0
1,0,0.4,1.0,0.4,0.0,0.0,0.5,1.0,0.5,0.0
2,0,0.4,1.0,0.4,0.0,0.0,0.5,0.0,0.25,0.0
3,0,0.8,0.5,0.3,0.0,0.0,0.5,1.0,0.0,0.0
4,0,0.4,1.0,0.0,0.0,0.0,0.5,1.0,0.75,0.0


In [8]:
train, test = train_test_split(df, test_size = 0.5)

In [9]:
def predict(weights, test) :
    value = 0
    for i in range(len(test) - 1) :
        value += weights[i] * test[i]
        
    if value >= 0:
        return 1
    else:
        return 0

def training(weights, train, l_rate, epochs) :
    for i in range(epochs) :
        for j in range(len(train)) :
            prediction = predict(weights, train.iloc[j])
            loss = train.iloc[j, -1] - prediction
            weights += l_rate * loss * train.iloc[j, :10]

    
    return weights

In [10]:
weights = np.zeros(10)
epochs = 1000
l_rate = 0.1
weights = training(weights, train, l_rate, epochs)
correct = 0
for i in range(len(test)) :
    result = predict(weights, test.iloc[i])
    if result == test.iloc[i, -1] : 
        correct += 1

accuracy = correct / len(test)
print(accuracy)

0.762589928057554
