# BREAST_CANCER_CLASSIFICATION

This notebook mainly talks about the data classification of the breast cancer data 

# 1 - IMPORTS FOR NOTEBOOK

In [277]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# 2 - LOAD BREAST CANCER DATA 

In [278]:
from sklearn.datasets import load_breast_cancer
breast_cancer_data = load_breast_cancer()

# 3 - CONVERT TO PANDAS DATAFRAME

### 3.2 - DATA OVERVIEW
<p>Attribute Information:</p>
<ol>
    <li><b>RADIUS</b>      &emsp;(mean of distances from center to points on the perimeter)</li>
    <li><b>TEXTURE</b>        &emsp;(standard deviation of gray-scale values)</li>
    <li><b>PERIMETER</b>     &emsp;proportion of non-retail business acres per town</li>
    <li><b>AREA</b>      &emsp;Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)</li>
    <li><b>SMOOTHNESS</b>       &emsp;(local variation in radius lengths)</li>
    <li><b>COMPACTNESS</b>        &emsp;(perimeter^2 / area - 1.0)</li>
    <li><b>CONCAVITY</b>       &emsp;(severity of concave portions of the contour)</li>
    <li><b>CONCAVE POINTS</b>       &emsp;(number of concave portions of the contour)</li>
    <li><b>SYMMETRY</b>       &emsp;index of accessibility to radial highways</li>
    <li><b>FRACTAL DIMENSION</b>       &emsp;(“coastline approximation” - 1)</li>
</ol>

# READING THE DATA 

In [279]:
patient_data = pd.read_csv('wdbc.data', names=['PID','CLASS','RADI-M','TEXT-M','PERI-M','AREA-M','SMOO-M','COMP-M','CONC-M','CONP-M','SYMM-M','FDIM-M','RADI-SE','TEXT-SE','PERI-SE','AREA-SE','SMOO-SE','COMP-SE','CONC-SE','CONP-SE','SYMM-SE','FDIM-SE','RADI-MX','TEXT-MX','PERI-MX','AREA-MX','SMOO-MX','COMP-MX','CONC-MX','CONP-MX','SYMM-MX','FDIM-MX'])

In [280]:
patient_data

Unnamed: 0,PID,CLASS,RADI-M,TEXT-M,PERI-M,AREA-M,SMOO-M,COMP-M,CONC-M,CONP-M,...,RADI-MX,TEXT-MX,PERI-MX,AREA-MX,SMOO-MX,COMP-MX,CONC-MX,CONP-MX,SYMM-MX,FDIM-MX
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


# PATIENT ID IS NOT NEEDED

In [281]:
del patient_data['PID']

In [282]:
patient_data

Unnamed: 0,CLASS,RADI-M,TEXT-M,PERI-M,AREA-M,SMOO-M,COMP-M,CONC-M,CONP-M,SYMM-M,...,RADI-MX,TEXT-MX,PERI-MX,AREA-MX,SMOO-MX,COMP-MX,CONC-MX,CONP-MX,SYMM-MX,FDIM-MX
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


# LABEL ENCODING

In [283]:
patient_data["CLASS"] = patient_data["CLASS"].astype('category')
patient_data["CLASS_CAT"] = patient_data["CLASS"].cat.codes

In [284]:
patient_data

Unnamed: 0,CLASS,RADI-M,TEXT-M,PERI-M,AREA-M,SMOO-M,COMP-M,CONC-M,CONP-M,SYMM-M,...,TEXT-MX,PERI-MX,AREA-MX,SMOO-MX,COMP-MX,CONC-MX,CONP-MX,SYMM-MX,FDIM-MX,CLASS_CAT
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,1
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,1
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,1
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,1
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,1
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,1
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,1
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,1


In [285]:
del patient_data['CLASS']

In [286]:
patient_data

Unnamed: 0,RADI-M,TEXT-M,PERI-M,AREA-M,SMOO-M,COMP-M,CONC-M,CONP-M,SYMM-M,FDIM-M,...,TEXT-MX,PERI-MX,AREA-MX,SMOO-MX,COMP-MX,CONC-MX,CONP-MX,SYMM-MX,FDIM-MX,CLASS_CAT
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,1
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,1
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,1
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,1
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,1
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,1
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,1


In [287]:
patient_data.columns

Index(['RADI-M', 'TEXT-M', 'PERI-M', 'AREA-M', 'SMOO-M', 'COMP-M', 'CONC-M',
       'CONP-M', 'SYMM-M', 'FDIM-M', 'RADI-SE', 'TEXT-SE', 'PERI-SE',
       'AREA-SE', 'SMOO-SE', 'COMP-SE', 'CONC-SE', 'CONP-SE', 'SYMM-SE',
       'FDIM-SE', 'RADI-MX', 'TEXT-MX', 'PERI-MX', 'AREA-MX', 'SMOO-MX',
       'COMP-MX', 'CONC-MX', 'CONP-MX', 'SYMM-MX', 'FDIM-MX', 'CLASS_CAT'],
      dtype='object')

In [288]:
cancer_target = patient_data.iloc[:,30:31]

In [289]:
cancer_target

Unnamed: 0,CLASS_CAT
0,1
1,1
2,1
3,1
4,1
...,...
564,1
565,1
566,1
567,1


In [290]:
del patient_data['CLASS_CAT']

In [291]:
patient_data

Unnamed: 0,RADI-M,TEXT-M,PERI-M,AREA-M,SMOO-M,COMP-M,CONC-M,CONP-M,SYMM-M,FDIM-M,...,RADI-MX,TEXT-MX,PERI-MX,AREA-MX,SMOO-MX,COMP-MX,CONC-MX,CONP-MX,SYMM-MX,FDIM-MX
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [292]:
patient_data.describe()

Unnamed: 0,RADI-M,TEXT-M,PERI-M,AREA-M,SMOO-M,COMP-M,CONC-M,CONP-M,SYMM-M,FDIM-M,...,RADI-MX,TEXT-MX,PERI-MX,AREA-MX,SMOO-MX,COMP-MX,CONC-MX,CONP-MX,SYMM-MX,FDIM-MX
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


# CHECK FOR NA WITH COUNT

Above it shows that all of them are 569 values so we do not have any nan values 

# TRAIN TEST SPLIT

In [293]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(patient_data, cancer_target, test_size=0.33, random_state=42)

In [294]:
train_x.shape

(381, 30)

# CLASSIFICATION METRICS 

In [295]:
metrics_list={}

In [296]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def mertics_function(test_y, y_pred,model_name,metrics_list,comments):
    metrics_list['model_name'] = model_name
    metrics_list['accuracy_score'] = accuracy_score(test_y, y_pred)
    metrics_list['balanced_accuracy_score'] = balanced_accuracy_score(test_y, y_pred)
    metrics_list['average_precision_score'] = average_precision_score(test_y, y_pred)
    metrics_list['brier_score_loss'] = brier_score_loss(test_y, y_pred)
    metrics_list['f1_score'] = f1_score(test_y, y_pred)
    metrics_list['precision_score'] = precision_score(test_y, y_pred)
    metrics_list['recall_score']=recall_score(test_y, y_pred)
    metrics_list['comments']=comments

# LOGISTIC REGRESSION

In [297]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train_x, train_y)
y_pred = clf.predict(test_x)
mertics_function(test_y,y_pred,"Logistic Regression",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = pd.DataFrame([metrics_list])

  y = column_or_1d(y, warn=True)


# SVM

In [298]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(train_x, train_y)
y_pred = svm_clf.predict(test_x)
mertics_function(test_y,y_pred,"SVM",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# SGD

In [299]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
sgd_clf.fit(train_x, train_y)
y_pred = sgd_clf.predict(test_x)
mertics_function(test_y,y_pred,"SGD",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# DECISION TREE CLASSIFIER

In [300]:
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf = clf.fit(train_x, train_y)
y_pred = tree_clf.predict(test_x)
mertics_function(test_y,y_pred,"Decision Tree",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# RANDOM FOREST

In [301]:
from sklearn.ensemble import RandomForestClassifier
rand_clf = RandomForestClassifier(n_estimators=10)
rand_clf = clf.fit(train_x, train_y)
y_pred = rand_clf.predict(test_x)
mertics_function(test_y,y_pred,"Random Forest",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# GRADIENT BOOST

In [302]:
from sklearn.ensemble import GradientBoostingClassifier
grad_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(train_x, train_y)
y_pred = grad_clf.predict(test_x)
mertics_function(test_y,y_pred,"Gradient Boost",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


# ADA BOOST

In [303]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(n_estimators=100).fit(train_x, train_y)
y_pred = ada_clf.predict(test_x)
mertics_function(test_y,y_pred,"Ada Boost",metrics_list,"Straight data without any change")
# update the dataframe with the metrics 
metrics_pd = metrics_pd.append([metrics_list])

  y = column_or_1d(y, warn=True)


In [304]:
metrics_pd

Unnamed: 0,model_name,accuracy_score,balanced_accuracy_score,average_precision_score,brier_score_loss,f1_score,precision_score,recall_score,comments
0,Logistic Regression,0.957447,0.95362,0.905438,0.042553,0.940299,0.940299,0.940299,Straight data without any change
0,SVM,0.643617,0.5,0.356383,0.356383,0.0,0.0,0.0,Straight data without any change
0,SGD,0.388298,0.524793,0.368132,0.611702,0.538153,0.368132,1.0,Straight data without any change
0,Decision Tree,0.957447,0.95362,0.905438,0.042553,0.940299,0.940299,0.940299,Straight data without any change
0,Random Forest,0.957447,0.95362,0.905438,0.042553,0.940299,0.940299,0.940299,Straight data without any change
0,Gradient Boost,0.957447,0.963612,0.895935,0.042553,0.942857,0.90411,0.985075,Straight data without any change
0,Ada Boost,0.957447,0.956951,0.901962,0.042553,0.941176,0.927536,0.955224,Straight data without any change


# RESULT VISUALIZATION

In [305]:
test_y

Unnamed: 0,CLASS_CAT
204,0
70,1
131,1
431,0
540,0
...,...
141,1
498,1
7,1
541,0


In [306]:
test_y.reset_index(inplace = True)
test_y

Unnamed: 0,index,CLASS_CAT
0,204,0
1,70,1
2,131,1
3,431,0
4,540,0
...,...,...
183,141,1
184,498,1
185,7,1
186,541,0


In [307]:
del test_y['index']

In [308]:
test_y

Unnamed: 0,CLASS_CAT
0,0
1,1
2,1
3,0
4,0
...,...
183,1
184,1
185,1
186,0


In [309]:
test_y['y_pred'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [310]:
test_y

Unnamed: 0,CLASS_CAT,y_pred
0,0,0
1,1,1
2,1,1
3,0,0
4,0,0
...,...,...
183,1,1
184,1,1
185,1,1
186,0,0


In [311]:
test_y['index_1'] = test_y.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [312]:
test_y

Unnamed: 0,CLASS_CAT,y_pred,index_1
0,0,0,0
1,1,1,1
2,1,1,2
3,0,0,3
4,0,0,4
...,...,...,...
183,1,1,183
184,1,1,184
185,1,1,185
186,0,0,186


In [313]:
y_pred

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0], dtype=int8)

In [314]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, y_pred)

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and binary targets

In [None]:
plt.scatter(test_y['index_1'],test_y['CLASS_CAT'])