In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



In [None]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
# Preview the first rows (rendered as the cell's output).
df.head()

In [None]:
# Summary statistics of the loaded frame (rendered as the cell's output).
df.describe()

In [None]:
# Features: every column from the third one onward, without the
# "Unnamed: 32" column.
X_cancer = df.iloc[:, 2:].drop(columns="Unnamed: 32")
# Target: the diagnosis label column.
y_cancer = df['diagnosis']

In [None]:
# Slice the feature matrix into three consecutive groups of ten columns:
# columns 0-9, 10-19 and 20-29.
X_cancer_mean, X_cancer_se, X_cancer_worst = (
    X_cancer.iloc[:, start:start + 10] for start in (0, 10, 20)
)



In [None]:
# Class balance of the target: vectorized comparison instead of a Python-level
# generator over every label.
print("Number of Malignant: " + str((y_cancer == 'M').sum()))
print("Number of Benign: " + str((y_cancer == 'B').sum()))

# Pass the labels by keyword. In recent seaborn releases the first positional
# argument of countplot is `data`, so a positional Series is not treated as x.
ax = sns.countplot(x=y_cancer, label="Count")


In [None]:
def normalize(data):
    """Standardize ``data`` to zero mean and unit standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Values to standardize. For a DataFrame, each column is centred and
        scaled with its own mean and standard deviation.

    Returns
    -------
    Same type as ``data``
        ``(data - data.mean()) / data.std()``. A standard deviation of exactly
        zero is treated as 1, so a constant column comes back as all zeros
        instead of NaN.
    """
    center = data.mean()
    spread = data.std()
    if isinstance(spread, pd.Series):
        # Per-column spread: only exact zeros are replaced; NaN is left as is.
        spread = spread.where(spread != 0, 1)
    elif spread == 0:
        # Scalar spread (Series input) of a constant vector.
        spread = 1
    return (data - center) / spread

In [None]:
# Standardize all features so they can share one value axis.
X_cancer_norm = normalize(X_cancer)

# Long format for the first ten columns: one row per (diagnosis, feature, value).
data = pd.concat([y_cancer, X_cancer_norm.iloc[:, :10]], axis=1).melt(
    id_vars="diagnosis", var_name="features", value_name="value"
)

plt.figure(figsize=(9, 9))
# Split violins: one half per diagnosis, quartiles drawn inside.
sns.violinplot(data=data, x="features", y="value", hue="diagnosis",
               split=True, inner="quart")
plt.xticks(rotation=90)



In [None]:
# Long format for columns 10-19: one row per (diagnosis, feature, value).
data = pd.concat([y_cancer, X_cancer_norm.iloc[:, 10:20]], axis=1).melt(
    id_vars="diagnosis", var_name="features", value_name="value"
)

plt.figure(figsize=(9, 9))
# Split violins: one half per diagnosis, quartiles drawn inside.
sns.violinplot(data=data, x="features", y="value", hue="diagnosis",
               split=True, inner="quart")
plt.xticks(rotation=90)

In [None]:
# Long format for columns 20-29: one row per (diagnosis, feature, value).
data = pd.concat([y_cancer, X_cancer_norm.iloc[:, 20:30]], axis=1).melt(
    id_vars="diagnosis", var_name="features", value_name="value"
)

plt.figure(figsize=(9, 9))
# Split violins: one half per diagnosis, quartiles drawn inside.
sns.violinplot(data=data, x="features", y="value", hue="diagnosis",
               split=True, inner="quart")
plt.xticks(rotation=90)

In [None]:
# Regression joint plot of two features. x and y are passed by keyword because
# recent seaborn releases no longer accept them positionally.
sns.jointplot(x=X_cancer.loc[:, 'concavity_mean'],
              y=X_cancer.loc[:, 'concave points_worst'],
              kind="reg", color="#ce1414")


In [None]:
# Pairwise correlation of every feature, drawn as an annotated heatmap on a
# fixed [-1, 1] colour scale.
corr = X_cancer.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    fmt='.1f',
    cmap="coolwarm",
    vmin=-1.0,
    vmax=1.0,
    square=True,
    linewidths=.5,
)

In [None]:
# Long format for the first ten standardized columns.
data = pd.concat([y_cancer, X_cancer_norm.iloc[:, :10]], axis=1).melt(
    id_vars="diagnosis", var_name="features", value_name="value"
)

plt.figure(figsize=(9, 9))
# One point per observation, coloured by diagnosis.
sns.swarmplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90)

In [None]:
# Long format for standardized columns 10-19.
data = pd.concat([y_cancer, X_cancer_norm.iloc[:, 10:20]], axis=1).melt(
    id_vars="diagnosis", var_name="features", value_name="value"
)

plt.figure(figsize=(9, 9))
# One point per observation, coloured by diagnosis.
sns.swarmplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90)

In [None]:
# Long format for standardized columns 20-29.
data = pd.concat([y_cancer, X_cancer_norm.iloc[:, 20:30]], axis=1).melt(
    id_vars="diagnosis", var_name="features", value_name="value"
)

plt.figure(figsize=(9, 9))
# One point per observation, coloured by diagnosis.
sns.swarmplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90)

In [None]:
# Columns removed from the feature matrix before modelling.
drop_list = [
    'radius_mean',
    'perimeter_worst',
    'area_worst',
    'smoothness_mean',
    'perimeter_mean',
    'texture_worst',
    'compactness_worst',
    'compactness_mean',
    'concavity_worst',
    'concave points_mean',
    'perimeter_se',
    'radius_se',
]
X_filtered = X_cancer.drop(columns=drop_list)


In [None]:
# Correlation heatmap of the features that remain after the drop.
corr = X_filtered.corr()
plt.figure(figsize=(18, 18))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1.0, vmax=1.0)

In [None]:
# 70/30 train/test split of the filtered features, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_cancer, test_size=0.3, random_state=42
)

# Baseline random forest on all filtered features.
clf = RandomForestClassifier(random_state=43)
clr = clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clr.predict(X_test)

score = accuracy_score(y_test, y_pred)
print("accuracy score:", score)

mat = confusion_matrix(y_test, y_pred)
# fmt="d" prints the counts as integers; the default annotation format shows
# values of 100 or more in scientific notation.
sns.heatmap(mat, annot=True, fmt="d")


In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Univariate chi-squared feature scoring on the training split; keeps the
# five best-scoring features.
bestKFeatures = SelectKBest(score_func=chi2, k=5)
bestKFeatures.fit(X_train, y_train)

print('Score list:', bestKFeatures.scores_)
print('Feature list:', X_train.columns)

In [None]:
# Tabulate the chi-squared scores, indexed by feature name.
score_frame = pd.DataFrame(data=bestKFeatures.scores_, index=X_train.columns)
score_frame

In [None]:
# Keep only the features chosen by the fitted SelectKBest selector.
X_train_2 = bestKFeatures.transform(X_train)
X_test_2 = bestKFeatures.transform(X_test)

# Seeded so the reported score is the same on every run.
clf_2 = RandomForestClassifier(random_state=43)
clr_2 = clf_2.fit(X_train_2, y_train)

# Predict once and reuse the result for both metrics.
y_pred_2 = clr_2.predict(X_test_2)

score = accuracy_score(y_test, y_pred_2)
print("Score = ", score)

mat = confusion_matrix(y_test, y_pred_2)
# fmt="d" prints the counts as integers instead of scientific notation.
sns.heatmap(mat, annot=True, fmt="d")

**Recursive Feature Elimination (RFE)**

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination down to five features, removing one feature
# per step. The forest is seeded so the selected set is reproducible.
clf_3 = RandomForestClassifier(random_state=43)
rfe = RFE(clf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(X_train, y_train)

# Names of the features RFE kept.
print(X_train.columns[rfe.support_])

In [None]:
# Keep only the features chosen by the fitted RFE selector.
X_train_4 = rfe.transform(X_train)
X_test_4 = rfe.transform(X_test)

# Seeded so the reported score is the same on every run.
clf_4 = RandomForestClassifier(random_state=43)
clr_4 = clf_4.fit(X_train_4, y_train)

# Predict once and reuse the result for both metrics.
y_pred_4 = clr_4.predict(X_test_4)

score = accuracy_score(y_test, y_pred_4)
print("Score = ", score)

mat = confusion_matrix(y_test, y_pred_4)
# fmt="d" prints the counts as integers instead of scientific notation.
sns.heatmap(mat, annot=True, fmt="d")

**Recursive Feature Elimination with Cross-Validation (RFECV)**

In [None]:
from sklearn.feature_selection import RFECV

# RFE with 2-fold cross-validation choosing the number of features by
# accuracy. The forest is seeded so the chosen subset is reproducible.
clf_5 = RandomForestClassifier(random_state=43)
refcv = RFECV(clf_5, step=1, scoring='accuracy', cv=2)
refcv = refcv.fit(X_train, y_train)

print("Number of features: " , refcv.n_features_)
print("best features: ", X_train.columns[refcv.support_])

In [None]:
# Mean cross-validated score for each number of selected features.
# `grid_scores_` was removed from RFECV in scikit-learn 1.2, where the scores
# are in `cv_results_["mean_test_score"]`; fall back to the old attribute on
# releases that do not have `cv_results_`.
if hasattr(refcv, "cv_results_"):
    cv_scores = refcv.cv_results_["mean_test_score"]
else:
    cv_scores = refcv.grid_scores_

plt.figure()

plt.xlabel("No of features")
plt.ylabel("CV Score")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()


In [None]:
# Seeded forest so the importance ranking is reproducible.
clf_6 = RandomForestClassifier(random_state=43)
clr_6 = clf_6.fit(X_train, y_train)
importances = clr_6.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in clf_6.estimators_],
             axis=0)
# Column positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# `rank`/`idx` are used instead of `f`, which an earlier cell binds to a figure.
for rank, idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, idx, importances[idx]))

# Plot the feature importances of the forest

n_features = X_train.shape[1]
plt.figure(1, figsize=(14, 13))
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="g", yerr=std[indices], align="center")
plt.xticks(range(n_features), X_train.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
from sklearn.decomposition import PCA

X_train_norm = normalize(X_train)
# Scale the test split with the TRAINING mean and standard deviation, so both
# splits go through the same transformation and no test-set statistics are used.
X_test_norm = (X_test - X_train.mean()) / X_train.std()

# Full PCA on the standardized training data to inspect explained variance.
pca = PCA()
pca.fit(X_train_norm)
cumsum = np.cumsum(pca.explained_variance_ratio_)
features_in_order = np.argsort(pca.explained_variance_ratio_)


plt.figure(figsize=(10,8))
plt.grid()
# x starts at 1 so the axis reads as "number of components kept".
plt.plot(range(1, len(cumsum) + 1), cumsum, linewidth=2)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance ratio")
# Reference line at 95% explained variance.
plt.axhline(y=0.95, color='r')



In [None]:
# Use a separate PCA instance so the `pca` object fitted on X_train_norm is not
# overwritten. X_cancer has 12 more columns than the training frame `pca` was
# fitted on, so refitting `pca` here would make it unusable on the train/test
# splits.
X_reduced = PCA().fit_transform(X_cancer)

In [None]:

# Fit the projection on the standardized training split only, and apply it to
# the standardized train and test splits so the PCA sees data on the scale it
# was fitted on. n_components=0.95 keeps the components that explain 95% of
# the variance, matching the threshold drawn on the explained-variance plot.
pca_95 = PCA(n_components=0.95).fit(X_train_norm)
X_train_6 = pca_95.transform(X_train_norm)
X_test_6 = pca_95.transform(X_test_norm)

# Seeded so the reported score is the same on every run.
clf_6 = RandomForestClassifier(random_state=43)
clr_6 = clf_6.fit(X_train_6, y_train)

# Predict once and reuse the result for both metrics.
y_pred_6 = clr_6.predict(X_test_6)

score = accuracy_score(y_test, y_pred_6)
print("Score = ", score)

mat = confusion_matrix(y_test, y_pred_6)
# fmt="d" prints the counts as integers instead of scientific notation.
sns.heatmap(mat, annot=True, fmt="d")

<font size = "5"> SVM </font>

In [None]:
# Six hand-picked features used by the models that follow.
features = [
    'area_mean',
    'area_se',
    'radius_worst',
    'texture_mean',
    'concavity_mean',
    'concave points_worst',
]
X = X_cancer[features]


In [None]:
# Seeded 70/30 split: the hyper-parameters hard-coded in later cells and every
# reported score depend on which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cancer, test_size=0.3, random_state=42
)

In [None]:
X_train_norm = normalize(X_train)
# Scale the test split with the TRAINING mean and standard deviation, so both
# splits go through the same transformation and no test-set statistics are used.
X_test_norm = (X_test - X_train.mean()) / X_train.std()


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Exhaustive search over C, gamma and kernel for an SVC, using
# GridSearchCV's default cross-validation and scoring.
clf = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
grid = GridSearchCV(estimator=clf, param_grid=param_grid)
grid.fit(X_train_norm, y_train)
print(grid.best_estimator_)

In [None]:
from sklearn.svm import SVC


# SVC with fixed hyper-parameters. probability=True enables predict_proba;
# random_state seeds the data shuffling used for the probability estimates, so
# the probabilities are the same on every run.
svm = SVC(C=10, probability=True, gamma=0.01, random_state=42)
svm.fit(X_train_norm, y_train)

print("train score: ", svm.score(X_train_norm, y_train))
print("test score: ", svm.score(X_test_norm, y_test))

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Class probabilities on the test split; column 1 is used as the score for 'M'.
y_scores = svm.predict_proba(X_test_norm)

# ROC curve with 'M' as the positive class.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test, y_score=y_scores[:, 1], pos_label='M'
)
plt.plot(fpr, tpr)
plt.show()

auc_score = roc_auc_score(y_true=y_test, y_score=y_scores[:, 1])
print("auc score: ", auc_score)

<font size = "5"> Decision Tree Classifier </font>

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded tree: features are permuted at random at each split, so without a
# seed equally good splits can be chosen differently from run to run and the
# selected hyper-parameters may change.
clf = DecisionTreeClassifier(random_state=42)
# Search depth 1-9 and 2-99 leaf nodes.
param_grid = {
    'max_depth': list(range(1, 10)),
    'max_leaf_nodes': list(range(2, 100)),
}

grid = GridSearchCV(clf, param_grid)
grid.fit(X_train_norm, y_train)
print(grid.best_estimator_)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with fixed depth and leaf limits, seeded so the fitted tree
# and its scores are the same on every run.
clf_dt = DecisionTreeClassifier(max_depth=4, max_leaf_nodes=13, random_state=42)
clf_dt.fit(X_train_norm, y_train)

print("Train score = ", clf_dt.score(X_train_norm, y_train))
print("Test score = ", clf_dt.score(X_test_norm, y_test))

In [None]:
# Class probabilities on the test split; column 1 is used as the score for 'M'.
y_scores = clf_dt.predict_proba(X_test_norm)

# ROC curve with 'M' as the positive class.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test, y_score=y_scores[:, 1], pos_label='M'
)
plt.plot(fpr, tpr)
plt.grid()
plt.show()

auc_score = roc_auc_score(y_true=y_test, y_score=y_scores[:, 1])
print("auc score: ", auc_score)

<font size = "5"> Logistic Regression </font>

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an L2 penalty, fitted on the normalized training split.
clf_lr = LogisticRegression(penalty="l2")
# As the last expression of the cell, the fitted estimator is echoed as output.
clf_lr.fit(X_train_norm, y_train)

In [None]:
# Report the logistic regression's score() on the train and test splits.
print("train score: " , clf_lr.score(X_train_norm, y_train))
print("test score: ", clf_lr.score(X_test_norm, y_test))

In [None]:
# Class probabilities on the test split; column 1 is used as the score for 'M'.
y_scores = clf_lr.predict_proba(X_test_norm)

# ROC curve with 'M' as the positive class.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test, y_score=y_scores[:, 1], pos_label='M'
)
plt.plot(fpr, tpr)
plt.grid()
plt.show()

auc_score = roc_auc_score(y_true=y_test, y_score=y_scores[:, 1])
print("auc score: ", auc_score)

<font size = "5">  KNN Classifier </font>

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Search the number of neighbours from 1 to 19 with GridSearchCV's defaults.
clf = KNeighborsClassifier()
param_grid = {'n_neighbors': list(range(1, 20))}
grid = GridSearchCV(estimator=clf, param_grid=param_grid)

grid.fit(X_train_norm, y_train)
print(grid.best_estimator_)


In [None]:
# k-nearest-neighbours classifier with k fixed at 8.
# NOTE(review): presumably taken from the grid search in the previous cell - confirm.
clf_knn = KNeighborsClassifier(n_neighbors = 8)
clf_knn.fit(X_train_norm, y_train)

# Report score() on the train and test splits.
print("train score: ", clf_knn.score(X_train_norm, y_train))
print("test score: ", clf_knn.score(X_test_norm, y_test))

In [None]:
# Class probabilities on the test split; column 1 is used as the score for 'M'.
y_scores = clf_knn.predict_proba(X_test_norm)

# ROC curve with 'M' as the positive class.
fpr, tpr, thresholds = roc_curve(
    y_true=y_test, y_score=y_scores[:, 1], pos_label='M'
)
plt.plot(fpr, tpr)
plt.grid()
plt.show()

auc_score = roc_auc_score(y_true=y_test, y_score=y_scores[:, 1])
print("auc score: ", auc_score)