In [7]:
import numpy as np
import pandas as pd
import sys

In [19]:
import collections

In [4]:
import matplotlib
matplotlib.use('QT4Agg')

In [5]:
from matplotlib import pyplot as plt

# Import the data:

In [116]:
data_dir = r'C:\Users\Simas\Desktop\Insight\Data Challenges\Breast-Cancer-Challenge\data\breast-cancer.txt'

In [117]:
df = pd.read_csv(data_dir)

In [118]:
df.head()

Unnamed: 0,Index,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0,1241035,7,8,3,7,4,5,7,8,2,4
1,1,1107684,6,10,5,5,4,10,6,10,1,4
2,2,691628,8,6,4,10,10,1,3,5,1,4
3,3,1226612,7,5,6,3,3,8,7,4,1,4
4,4,1142706,5,10,10,10,6,10,6,5,2,4


In [119]:
len(df)

15855

## Force all the data to be in range 1 - 10, otherwise set it to 0

In [120]:
# Get rid of all misslabeled classes:
set(df['Class'])

{'#', '2', '20', '4', '40', '?', 'No idea', nan}

In [121]:
index = [i[0] or i[1] for i in zip(np.array(df['Class'] == '2'), np.array(df['Class'] == '4'))]

In [122]:
df = df[index]

In [123]:
set(df['Class'])

{'2', '4'}

In [124]:
df.head()

Unnamed: 0,Index,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0,1241035,7,8,3,7,4,5,7,8,2,4
1,1,1107684,6,10,5,5,4,10,6,10,1,4
2,2,691628,8,6,4,10,10,1,3,5,1,4
3,3,1226612,7,5,6,3,3,8,7,4,1,4
4,4,1142706,5,10,10,10,6,10,6,5,2,4


In [125]:
len(df)

15620

In [126]:
# Set class labels to 0, 1. 2->0: benign, 4->1: malignant
df['Class'] = df['Class'].apply(lambda x: 0 if x == '2' else 1)

In [127]:
df.head()

Unnamed: 0,Index,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0,1241035,7,8,3,7,4,5,7,8,2,1
1,1,1107684,6,10,5,5,4,10,6,10,1,1
2,2,691628,8,6,4,10,10,1,3,5,1,1
3,3,1226612,7,5,6,3,3,8,7,4,1,1
4,4,1142706,5,10,10,10,6,10,6,5,2,1


In [128]:
# Look at class balance:
collections.Counter(df['Class'])

Counter({1: 15164, 0: 456})

There is a huge class imbalance, on the order of 30x. Need to clean the rest of the data in a way to preserve as many benign casses as possible.

Fill in missing data with the mean of the appropriate class.

In [146]:
DF = pd.DataFrame(df)

In [147]:
df = df.apply(pd.to_numeric, errors='coerce')
df.head()

Unnamed: 0,Index,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0,1241035,7,8,3,7,4,5.0,7,8,2,1
1,1,1107684,6,10,5,5,4,10.0,6,10,1,1
2,2,691628,8,6,4,10,10,1.0,3,5,1,1
3,3,1226612,7,5,6,3,3,8.0,7,4,1,1
4,4,1142706,5,10,10,10,6,10.0,6,5,2,1


In [153]:
for i in DF.columns:
    if i == 'Index' or i == 'ID' or i == 'Class':
        continue
    print(i, set(DF[i]))

Clump Thickness {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Uniformity of Cell Size {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Uniformity of Cell Shape {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Marginal Adhesion {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Single Epithelial Cell Size {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Bare Nuclei {nan, 1.0, 2.0, 3.0, nan, 5.0, 6.0, nan, 8.0, 9.0, 10.0, 7.0, nan, 4.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}
Bland Chromatin {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Normal Nucleoli {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Mitoses {1, 2, 3, 4, 5, 6, 7, 8, 10}


We see that there are a whole bunch of nans, could just focus on bare nuclei to clean up, but run cleaning over all to be safe

In [149]:
def clean_col(df, name):
    """
    Cleans a column by setting all bad data to the mean of good data for the proper class
    
    Input: 
    str(name) column name
    """
    index = [i[0] and i[1] for i in zip(np.array(df[name] > 0), np.array(df[name] <= 10))]
    means = df[index].groupby('Class').mean()[name]
    mean_0 = means[0]
    mean_1 = means[1]
    
    class_0 = df['Class'] == 0
    index_0 = [not i[0] and i[1] for i in zip(index, class_0)]
    df[name][index_0] = mean_0
    
    index_1 = [not i[0] and not i[1] for i in zip(index, class_0)]
    df[name][index_1] = mean_1

In [152]:
for i in df.columns:
    if i == 'Index' or i == 'ID' or i == 'Class':
        continue
    clean_col(df, i)
    print(i, set(df[i]))

Clump Thickness {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Uniformity of Cell Size {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Uniformity of Cell Shape {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Marginal Adhesion {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Single Epithelial Cell Size {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Bare Nuclei {1.0, 2.0, 3.0, 1.3484162895927603, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 6.615881809787627, 4.0}
Bland Chromatin {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Normal Nucleoli {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
Mitoses {1, 2, 3, 4, 5, 6, 7, 8, 10}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Now we can fit a model, first split the data into the two classes

In [165]:
df.to_pickle(CleanedData)