In [1]:
import pandas as pd
import sklearn
import numpy as np
import tensorflow as tf

In [2]:
# Read the raw data file. header=None tells pandas the first line is data,
# so the six column names are attached explicitly afterwards.
data_path = "mammographic_masses.data.txt"
column_names = ["BI_RADS", "age", "shape", "margin", "density", "severity"]

df = pd.read_csv(data_path, header=None)
df.columns = column_names
df

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5,67,3,5,3,1
1,4,43,1,1,?,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,?,1
...,...,...,...,...,...,...
956,4,47,2,1,3,0
957,4,56,4,5,3,1
958,4,64,4,5,3,0
959,5,66,4,5,3,1


In [3]:
# Missing entries appear in the file as the literal string "?". Convert them
# to NaN so pandas' null handling (isnull / dropna) can see them.
df = df.replace(to_replace="?", value=np.nan)
df

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5,67,3,5,3,1
1,4,43,1,1,,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,,1
...,...,...,...,...,...,...
956,4,47,2,1,3,0
957,4,56,4,5,3,1
958,4,64,4,5,3,0
959,5,66,4,5,3,1


In [4]:
# Select every row that has at least one missing value so the incomplete
# records can be inspected.
# The mask checks ALL columns at once: a hand-written OR over individual
# columns is easy to get wrong (e.g. listing 'density' twice while never
# testing 'BI_RADS', 'age' or 'shape'), which would hide rows that are
# missing only those fields. Using any(axis=1) also keeps this view
# consistent with what a plain df.dropna() removes.
params = df.isnull().any(axis=1)
dl = df[params]
dl

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
1,4,43,1,1,,1
4,5,74,1,5,,1
5,4,65,1,,3,0
6,4,70,,,3,0
7,5,42,1,,3,0
...,...,...,...,...,...,...
691,4,72,3,,3,0
723,4,60,3,,4,0
745,6,76,3,,3,0
752,5,48,,4,,1


In [5]:
# Discard every row containing a NaN, then renumber the index from 0 so it
# is contiguous again (drop=True throws the old index away instead of
# keeping it as a column).
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5,67,3,5,3,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,57,1,5,3,1
4,5,76,1,4,3,1
...,...,...,...,...,...,...
825,4,47,2,1,3,0
826,4,56,4,5,3,1
827,4,64,4,5,3,0
828,5,66,4,5,3,1


In [6]:
# Build the feature matrix and target vector as plain NumPy arrays.
# 'BI_RADS' is deliberately not part of the selected feature columns here.
#
# The builtin float / int are used as dtypes: the np.float and np.int aliases
# were deprecated in NumPy 1.20 and removed in 1.24 (AttributeError there).
# They were plain aliases of the builtins, so the resulting arrays are the same.
X = np.array(df[["age", "shape", "margin", "density"]]).astype(float)
y = np.array(df['severity']).astype(int)
# NOTE(review): human-readable class names; confirm the order matches the
# 0/1 encoding of 'severity' before using them to label plots or reports.
labels = ['positive', 'negative']

In [7]:
from sklearn import preprocessing

# Standardise each feature column with StandardScaler: learn the per-column
# statistics first, then apply them to the same matrix.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split and before cross-validation - confirm that is acceptable.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split (and therefore the same hold-out score) every time.
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [9]:
from sklearn import tree

# Baseline model: a single decision tree with default hyper-parameters,
# trained on the training split and scored (mean accuracy) on the held-out split.
# random_state makes the fitted tree reproducible between runs; scikit-learn
# permutes features randomly at each split, so an unseeded tree can differ
# from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(trainX, trainY)
clf.score(testX, testY)

0.7451923076923077

In [10]:
from IPython.display import Image
# StringIO comes from the standard library: sklearn.externals.six was
# deprecated in scikit-learn 0.21 and removed in 0.23, so importing it from
# there raises ImportError on current releases.
from io import StringIO
from sklearn import tree
import pydotplus

# Render the fitted decision tree as a PNG.
# export_graphviz(out_file=None) returns the DOT source as a string, which
# pydotplus turns into a graph object.
# create_png() shells out to the Graphviz executables; they must be installed
# and on PATH, otherwise pydotplus raises
# "InvocationException: GraphViz's executables not found".
dot = tree.export_graphviz(clf, out_file=None, feature_names=["age", "shape", "margin", "density"])
graph = pydotplus.graph_from_dot_data(dot)
Image(graph.create_png())



InvocationException: GraphViz's executables not found

In [17]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of whichever classifier `clf` currently refers to,
# run over the complete (X, y) data rather than the single train/test split.
# The mean is printed; the per-fold array is the cell's displayed value.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)
print(scores.mean())
scores

0.7349460560010075


array([0.71428571, 0.73809524, 0.76190476, 0.73493976, 0.78313253,
       0.6746988 , 0.72289157, 0.76829268, 0.75609756, 0.69512195])

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees, evaluated with 10-fold cross-validation.
# class_weight='balanced' re-weights the classes inversely to their frequency.
# random_state seeds the bootstrap sampling and feature selection so the
# cross-validation scores are reproducible from run to run.
clf = RandomForestClassifier(n_estimators=10, class_weight='balanced', random_state=42)
scores = cross_val_score(clf, X, y, cv=10)
print(scores.mean())
scores

0.745833508249024


array([0.70238095, 0.71428571, 0.79761905, 0.72289157, 0.80722892,
       0.69879518, 0.75903614, 0.76829268, 0.7804878 , 0.70731707])

In [24]:
from sklearn import svm

# Support-vector classifier with a linear kernel, scored by 10-fold
# cross-validation on the full data. The mean is printed and the per-fold
# scores are displayed as the cell result.
clf = svm.SVC(kernel="linear")
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)
print(scores.mean())
scores

0.7964988875362076


array([0.71428571, 0.77380952, 0.86904762, 0.80722892, 0.84337349,
       0.69879518, 0.80722892, 0.80487805, 0.90243902, 0.74390244])

In [25]:
from sklearn import neighbors

# k-nearest-neighbours classifier with k fixed at 10, scored by 10-fold
# cross-validation on the full data. The mean is printed and the per-fold
# scores are displayed as the cell result.
clf = neighbors.KNeighborsClassifier(n_neighbors=10)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)
print(scores.mean())
scores

0.7854795488574507


array([0.77380952, 0.76190476, 0.83333333, 0.74698795, 0.87951807,
       0.72289157, 0.81927711, 0.79268293, 0.81707317, 0.70731707])

In [27]:
# Sweep k = 1..49 for the k-nearest-neighbours classifier and remember the k
# with the highest mean 10-fold cross-validation score.
best_K = [0, 0]  # [k, mean score] of the best candidate seen so far

for k in range(1, 50):
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(clf, X, y, cv=10)
    mean_score = scores.mean()

    # Strict '>' means that on a tie the smaller (earlier) k is kept.
    if mean_score > best_K[1]:
        best_K = [k, mean_score]

    print(mean_score)

print(f'\n\n BEST K FOR: {best_K[0]}, {best_K[1]}')

0.7239123742356184
0.6889838098036746
0.7541080699103032
0.7300813008130081
0.7735464506108056
0.7626163189342738
0.7940595133145824
0.7747082406280172
0.7880200243482641
0.7854795488574507
0.7915333809104012
0.7794257168045002
0.7819084701174035
0.7915039950743742
0.7878748443250353
0.7794411093852764
0.7818073688482151
0.775681121699341
0.7805147418944068
0.7828666582707136
0.7853927906748946
0.7817342540895289
0.7805588206484475
0.780587506821712
0.7878171221471251
0.7866269957880302
0.7854365195975539
0.7902271105327232
0.7865979597833844
0.7878314652337574
0.7914172368918182
0.7878314652337574
0.7865976099520032
0.7866119530386354
0.7866262961252677
0.7854358199347914
0.7866843681345592
0.7866553321299133
0.7878891874116676
0.7854791990260694
0.7854645061080558
0.7818500482767305
0.7830692106404713
0.783054867553839
0.783054867553839
0.7854648559394373
0.7866843681345591
0.7890653205155116
0.7902995256286471


 BEST K FOR: 7, 0.7940595133145824
