In [1]:
#The main idea of the project is to identify the best ML method - Decision Trees, Random Forest, KNN, Naive Bayes, Support Vector Machines, or Logistic Regression - for recognizing whether a mammogram mass is benign or malignant,
#based on the "mammographic masses" public dataset from the UCI repository (source: https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass)
import pandas as pd
import numpy as np
input_file="mammographic_masses.data.txt"
col_names=['BI_RADS', 'age', 'shape', 'margin', 'density', 'severity']
# Column names are supplied explicitly via `names=`, and '?' cells are parsed
# as NaN through `na_values`. Because the parser already performs that
# conversion, the former follow-up `df.replace('?', np.NaN)` was a no-op; it
# also fails on NumPy >= 2.0, where the `np.NaN` alias no longer exists.
df=pd.read_csv(input_file, na_values=['?'], names=col_names)
# Preview the first five rows.
df.head()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [3]:
# Per-column summary statistics. `count` is the number of non-missing entries,
# so columns whose counts differ have missing values and the data needs to be
# cleaned up before modelling.
df.describe()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [4]:
# Display every row that lacks a value in at least one of the four predictor
# columns (age, shape, margin, density).
df.loc[df[['age', 'shape', 'margin', 'density']].isnull().any(axis=1)]

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
...,...,...,...,...,...,...
778,4.0,60.0,,4.0,3.0,0
819,4.0,35.0,3.0,,2.0,0
824,6.0,40.0,,3.0,4.0,1
884,5.0,,4.0,4.0,3.0,1


In [5]:
# Missing values are assumed to be randomly distributed across the rows
# (NOTE(review): this is not verified in the notebook), so any row with at
# least one missing value is simply discarded.
# The cleaned frame is rebound to `df` instead of mutating the loaded frame
# in place with `inplace=True`.
df = df.dropna()
# Summary statistics of the cleaned data; every column now has the same count.
df.describe()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [6]:
# BI-RADS is an assessment of how confident the severity classification is;
# it is not a "predictive" attribute, so it is left out of the feature set.
name_lables = ['age', 'shape', 'margin', 'density']

# Feature matrix (one column per entry in name_lables) and the target vector.
features = df[name_lables].values
classes = df['severity'].values

In [7]:
# Some of the techniques used further down need normalized inputs, so the
# features are standardized here with StandardScaler.
# NOTE(review): the scaler is fit on the full data set before the
# cross-validation runs that consume `features_scaled`, so every fold's
# held-out rows contribute to the scaling statistics. Wrapping scaler and
# model in a Pipeline would avoid that.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_scaled= scaler.fit_transform(features)
# Display the scaled feature matrix.
features_scaled


array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [8]:
#Decision Trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Seed NumPy's global random number generator.
np.random.seed(1234)

# Train on 75% of the rows and hold out the remaining 25% for testing.
# The previous test_size=0.75 inverted that split and left only a quarter of
# the data for training.
(features_train, features_test, classes_train, classes_test) = train_test_split(
    features_scaled, classes, test_size=0.25, random_state=1)

# Single tree fitted on the training split only.
# NOTE(review): this fitted tree is never scored on the test split before
# `clf` is rebound below.
clf = DecisionTreeClassifier(random_state=1)
clf = clf.fit(features_train, classes_train)

# Fresh tree evaluated with 10-fold cross-validation over the whole data set.
# (cross_val_score was already imported at the top of this cell, so the
# duplicate import that used to sit here has been removed.)
clf = DecisionTreeClassifier(random_state=1)
cv_scores = cross_val_score(clf, features_scaled, classes, cv=10)

# Mean accuracy across the 10 folds.
cv_scores.mean()

0.7373493975903613

In [9]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Forest of 20 trees with a fixed seed, scored by 10-fold cross-validation
# on the standardized features.
clf = RandomForestClassifier(random_state=1, n_estimators=20)
cv_scores = cross_val_score(
    estimator=clf,
    X=features_scaled,
    y=classes,
    cv=10,
)

# Mean accuracy across the 10 folds.
cv_scores.mean()

0.7566265060240964

In [10]:
#Support Vector Machine
#With linear kernel
from sklearn import svm, datasets

# Regularization strength passed to the classifier.
C = 1.0

# Build the linear-kernel classifier and fit it on the full standardized set.
svc = svm.SVC(C=C, kernel='linear')
svc.fit(features_scaled, classes)

# 10-fold cross-validated accuracy of the same estimator configuration.
cv_scores = cross_val_score(
    estimator=svc,
    X=features_scaled,
    y=classes,
    cv=10,
)
cv_scores.mean()


0.7975903614457832

In [24]:
#SVM with rbf
from sklearn import svm, datasets

# Regularization strength passed to the classifier.
C = 1.0

# Build the RBF-kernel classifier and fit it on the full standardized set.
svc = svm.SVC(C=C, kernel='rbf')
svc.fit(features_scaled, classes)

# 10-fold cross-validated accuracy of the same estimator configuration.
cv_scores = cross_val_score(
    estimator=svc,
    X=features_scaled,
    y=classes,
    cv=10,
)
cv_scores.mean()

0.8012048192771084

In [26]:
#SVM with sigmoid kernel
from sklearn import svm, datasets

# Regularization strength passed to the classifier.
C = 1.0

# Build the sigmoid-kernel classifier and fit it on the full standardized set.
svc = svm.SVC(C=C, kernel='sigmoid')
svc.fit(features_scaled, classes)

# 10-fold cross-validated accuracy of the same estimator configuration.
cv_scores = cross_val_score(
    estimator=svc,
    X=features_scaled,
    y=classes,
    cv=10,
)
cv_scores.mean()

0.7457831325301204

In [27]:
#SVM with polynomial kernel
from sklearn import svm, datasets

# Regularization strength passed to the classifier.
C = 1.0

# Build the polynomial-kernel classifier and fit it on the full standardized set.
svc = svm.SVC(C=C, kernel='poly')
svc.fit(features_scaled, classes)

# 10-fold cross-validated accuracy of the same estimator configuration.
cv_scores = cross_val_score(
    estimator=svc,
    X=features_scaled,
    y=classes,
    cv=10,
)
cv_scores.mean()

0.7903614457831326

In [21]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

# Sweep the neighbourhood size K from 1 to 49 and record the mean 10-fold
# cross-validated accuracy for each K.
# The classifier that used to be built and fitted with n_neighbors=7 before
# this loop was overwritten on the first iteration without ever being used,
# so it has been removed.
scores=[]
for i in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=i)
    # Explicit fit kept so that `knn` is still a fitted estimator after the loop.
    knn.fit(features_scaled, classes)
    cv_scores = cross_val_score(knn, features_scaled, classes, cv=10)
    scores.append(cv_scores.mean())

# scores[0] belongs to K=1, so the best K is the arg-max position plus one.
index_max = np.argmax(scores)
print(index_max+1, max(scores))



7 0.7951807228915662


In [23]:
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing

# Rescale the raw (unstandardized) features with MinMaxScaler for the
# multinomial model; the standardized matrix used elsewhere contains
# negative values.
scaler = preprocessing.MinMaxScaler()
scaler.fit(features)
features_minmax = scaler.transform(features)

# 10-fold cross-validated accuracy of multinomial naive Bayes.
clf = MultinomialNB()
cv_scores = cross_val_score(
    estimator=clf,
    X=features_minmax,
    y=classes,
    cv=10,
)

# Mean accuracy across the 10 folds.
cv_scores.mean()

0.7855421686746988