#Multi-Model Approach


> This solution includes the following:

*   Data extraction and cleaning
*   Preprocessing
*   Decision Trees
*   Support Vector Machines
*   K Nearest Neighbors
*   Naive Bayes








#Data Extraction
Load the mammographic data with proper column names, reading missing values as NaN


In [106]:
import pandas as pd
import numpy as np

# Column names in file order; they are supplied via `names=`, so the first
# line of the file is read as data rather than as a header.
columns = [
    'BI-RADS',
    'age',
    'shape',
    'margin',
    'density',
    'severity',
]

# '?' entries in the raw file are read as NaN.
df = pd.read_csv(
    '/content/drive/My Drive/Machine Learning/skillshare project/mammographic_masses.data.txt',
    names=columns,
    na_values='?',
)
df.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [107]:
# Correlation of every column with the 'severity' target column
# (the last expression of the cell is what gets displayed).
df.corrwith(other=df['severity'])

BI-RADS     0.231601
age         0.432066
shape       0.563308
margin      0.574919
density     0.064010
severity    1.000000
dtype: float64

In [108]:
# Summary statistics per column; differing `count` values reveal which
# columns contain missing entries.
df.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


#Data cleaning
Removing the rows (records) that contain missing values

In [109]:
# Keep only the rows in which every column has a value, then re-check the
# summary statistics on the cleaned frame.
df = df.dropna(axis=0, how='any')
df.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


#Preprocessing
Identification of features and targets.

We remove BI-RADS as it is a non-predictive feature

In [0]:
# Predictor columns; 'BI-RADS' is deliberately left out.
feature_names = ['age', 'shape', 'margin', 'density']

# Plain NumPy arrays for scikit-learn: feature matrix and label vector.
features = np.array(df.loc[:, feature_names])
target = np.array(df.loc[:, 'severity'])

Standardising the features so that each has zero mean and unit variance

In [111]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split and before cross-validation, so held-out samples
# contribute to the scaling statistics. Consider fitting inside a Pipeline
# if a strict evaluation is required.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)
scaled_features

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

Splitting the data into train and test sets  
test fraction = 0.25 (75% train / 25% test)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed seed makes the split
# reproducible.
train_X, test_X, train_Y, test_Y = train_test_split(
    scaled_features,
    target,
    test_size=0.25,
    random_state=1,
)

#Decision Trees Classifier

In [113]:
from sklearn.tree import DecisionTreeClassifier as DTC

# Fit a single decision tree (fixed seed, otherwise default settings) on the
# training split; the fitted estimator is displayed as the cell output.
clf = DTC(random_state=1)
clf.fit(X=train_X, y=train_Y)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [114]:
# Score the fitted tree on the held-out test split.
clf.score(X=test_X, y=test_Y)

0.7355769230769231

K fold cross validation

(k = 10, as specified in the question)

In [115]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier as DTC

# 10-fold cross-validation of a fresh decision tree over the whole dataset;
# report the mean of the ten fold scores.
clf = DTC(random_state=1)
scores = cross_val_score(estimator=clf, X=scaled_features, y=target, cv=10)

print(scores.mean())

0.7373493975903613


Random Forest Classifier (trees = 50)

In [116]:
from sklearn.ensemble import RandomForestClassifier as RFC

# 10-fold cross-validation of a 50-tree random forest (fixed seed);
# report the mean of the ten fold scores.
clf = RFC(n_estimators=50, random_state=1)
scores = cross_val_score(estimator=clf, X=scaled_features, y=target, cv=10)

print(scores.mean())


0.7614457831325301


The highest mean score is 0.7614, obtained with the Random Forest (the single Decision Tree scored 0.7373)

#Support Vector Machines
with linear kernel

In [117]:
import sklearn.svm as svm

# Linear-kernel SVM, evaluated with 10-fold cross-validation.
svc = svm.SVC(C=1.0, kernel='linear')

scores = cross_val_score(estimator=svc, X=scaled_features, y=target, cv=10)
print(scores.mean())

0.7975903614457832


with polynomial kernel

In [118]:
import sklearn.svm as svm

# Polynomial-kernel SVM, evaluated with 10-fold cross-validation.
svc = svm.SVC(C=1.0, kernel='poly')

scores = cross_val_score(estimator=svc, X=scaled_features, y=target, cv=10)
print(scores.mean())

0.7903614457831326


with rbf kernel

In [119]:
import sklearn.svm as svm

# RBF-kernel SVM, evaluated with 10-fold cross-validation.
svc = svm.SVC(C=1.0, kernel='rbf')

scores = cross_val_score(estimator=svc, X=scaled_features, y=target, cv=10)
print(scores.mean())

0.8012048192771084


with sigmoid kernel

In [120]:
import sklearn.svm as svm

# Sigmoid-kernel SVM, evaluated with 10-fold cross-validation.
svc = svm.SVC(C=1.0, kernel='sigmoid')

scores = cross_val_score(estimator=svc, X=scaled_features, y=target, cv=10)
print(scores.mean())

0.7457831325301204


The highest score, 0.8012, is obtained with the rbf kernel

#K Nearest Neighbours
k=10

In [121]:
from sklearn.neighbors import KNeighborsClassifier as KNC

# K-nearest-neighbours with k=10, evaluated with 10-fold cross-validation;
# the mean fold score is displayed as the cell output.
clf = KNC(n_neighbors=10)
scores = cross_val_score(estimator=clf, X=scaled_features, y=target, cv=10)

scores.mean()

0.7915662650602409

In [122]:
# Sweep k = 1..49 and record the 10-fold mean score (rounded to 4 d.p.)
# for each k.
s = []
for k in range(1, 50):
    clf = KNC(n_neighbors=k)
    scores = cross_val_score(clf, scaled_features, target, cv=10)
    s.append(round(scores.mean(), 4))

# s[0] belongs to k=1, so the printed position is zero-based:
# best k = printed position + 1.
best_score = max(s)
print(best_score)
print(s.index(best_score))

0.7952
6


In [123]:
# Re-evaluate KNN at the best k found by the sweep in the previous cell.
# `s` holds one mean score per k starting at k=1, so list position + 1 is k
# (position 6 -> k=7 on the current data). Deriving k from `s` instead of
# hard-coding 7 keeps this cell correct if the data or sweep results change.
best_k = s.index(max(s)) + 1

clf = KNC(n_neighbors=best_k)
scores = cross_val_score(clf, scaled_features, target, cv=10)

scores.mean()

0.7951807228915662

The highest score, 0.7952, is obtained with k = 7 neighbors



#Naive Bayes

In [124]:
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn import preprocessing

# Rescale the raw features with MinMaxScaler for the multinomial model; the
# standardised features used above contain negative values.
# NOTE(review): MultinomialNB is documented to need non-negative inputs --
# confirm that this is the reason for switching scalers.
# `scaler` is rebound here and no longer refers to the StandardScaler.
scaler = preprocessing.MinMaxScaler()
scaler.fit(features)
features_minmax = scaler.transform(features)

# 10-fold cross-validation; the mean fold score is displayed as the output.
clf = MNB()
scores = cross_val_score(estimator=clf, X=features_minmax, y=target, cv=10)

scores.mean()

0.7855421686746988

Naive Bayes gives a mean score of 0.7855

#The clear winner is the Support Vector Machine with the rbf kernel, which has the highest mean score of 0.8012