In [32]:
import pandas as pd
import numpy as np

## Leitura do arquivo

In [33]:
data = pd.read_csv('breast_cancer_data.csv',sep=',')
data.head()

Unnamed: 0,1000025,5.0,1.0,1,1.1,2,1.2,3.0,1.0.1,1.3,benign,Dr. Doe
0,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
1,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
2,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
3,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
4,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith


In [34]:
# para visualizar o formato do dataframe: linhas x colunas
print('formato: ', data.shape, '\n')

formato:  (698, 12) 



### não temos nome de coluna, então precisamos fornecer esses nomes e passar como parâmetro na função read_csv

In [35]:
colunas = ['id','clump_thickness','cell_size_uniformity','cell_shape_uniformity','marginal_adhesion','single_ep_cell_size',
          'bare_nuclei','bland_chromatin','normal_nucleoli','mitoses','class','doctor_name']

data = pd.read_csv('breast_cancer_data.csv',names=colunas)
data.head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong


In [36]:
print("Quantidade de colunas: ",len(colunas),'\n')

# imprime o nome das colunas
print(data.columns)

print(len(data.columns))



Quantidade de colunas:  12 

Index(['id', 'clump_thickness', 'cell_size_uniformity',
       'cell_shape_uniformity', 'marginal_adhesion', 'single_ep_cell_size',
       'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class',
       'doctor_name'],
      dtype='object')
12


### vamos entender nossos dados

In [37]:
data.dtypes

id                         int64
clump_thickness          float64
cell_size_uniformity     float64
cell_shape_uniformity      int64
marginal_adhesion          int64
single_ep_cell_size        int64
bare_nuclei               object
bland_chromatin          float64
normal_nucleoli          float64
mitoses                    int64
class                     object
doctor_name               object
dtype: object

In [38]:
list(set(data.dtypes.tolist()))

[dtype('O'), dtype('int64'), dtype('float64')]

In [39]:
data_num = data.select_dtypes(include = ['float64', 'int64'])

data_num = data_num.corr()['clump_thickness'][2:] # para tirar a variável clump_thickness
data_num = data_num[abs(data_num) > 0.5].sort_values(ascending=False) #filtra valores de correlaçao < 0.5 e ordena os valores
print("Existem {} variáveis correlacionadas com clump_thickness:\n{}".format(len(data_num), data_num))

Existem 5 variáveis correlacionadas com clump_thickness:
cell_shape_uniformity    0.654632
cell_size_uniformity     0.644636
bland_chromatin          0.558924
normal_nucleoli          0.535773
single_ep_cell_size      0.522108
Name: clump_thickness, dtype: float64


In [40]:
data.describe()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bland_chromatin,normal_nucleoli,mitoses
count,699.0,698.0,698.0,699.0,699.0,699.0,695.0,698.0,699.0
mean,1071704.0,4.416905,3.137536,3.207439,2.793991,3.216023,3.447482,2.868195,1.589413
std,617095.7,2.817673,3.052575,2.971913,2.843163,2.2143,2.441191,3.055647,1.715078
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0
75%,1238298.0,6.0,5.0,5.0,3.5,4.0,5.0,4.0,1.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 12 columns):
id                       699 non-null int64
clump_thickness          698 non-null float64
cell_size_uniformity     698 non-null float64
cell_shape_uniformity    699 non-null int64
marginal_adhesion        699 non-null int64
single_ep_cell_size      699 non-null int64
bare_nuclei              697 non-null object
bland_chromatin          695 non-null float64
normal_nucleoli          698 non-null float64
mitoses                  699 non-null int64
class                    699 non-null object
doctor_name              699 non-null object
dtypes: float64(4), int64(5), object(3)
memory usage: 65.6+ KB


In [42]:
data.groupby(by =['class','doctor_name']).size()

class      doctor_name
benign     Dr. Doe        127
           Dr. Lee        121
           Dr. Smith      102
           Dr. Wong       108
malignant  Dr. Doe         58
           Dr. Lee         60
           Dr. Smith       74
           Dr. Wong        49
dtype: int64

In [43]:
print(data['class'].value_counts(),'\n')
# data['doctor_name'].value_counts() para médicos

# se quisermos os valores normalizados, precisamos adicionar um parâmetro
print('Dados normalizados: \n',data['class'].value_counts(normalize=True))

benign       458
malignant    241
Name: class, dtype: int64 

Dados normalizados: 
 benign       0.655222
malignant    0.344778
Name: class, dtype: float64


In [44]:
# ordenando apenas por uma coluna
data.sort_values(by='cell_size_uniformity',ascending=False).head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
106,1170419,10.0,10.0,10,8,2,10,4.0,1.0,1,malignant,Dr. Smith
44,1103608,10.0,10.0,10,4,8,1,8.0,10.0,1,malignant,Dr. Doe
669,1350423,5.0,10.0,10,8,5,5,7.0,10.0,1,malignant,Dr. Smith
104,1168736,10.0,10.0,10,10,10,1,8.0,8.0,8,malignant,Dr. Wong
266,1198641,10.0,10.0,6,3,3,10,4.0,3.0,2,malignant,Dr. Smith


In [45]:
# ordenando por múltiplas colunas
data.sort_values(by=['cell_size_uniformity','bare_nuclei'],ascending=[False,True]).head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
36,1080185,10.0,10.0,10,8,6,1,8.0,9.0,1,malignant,Dr. Doe
44,1103608,10.0,10.0,10,4,8,1,8.0,10.0,1,malignant,Dr. Doe
104,1168736,10.0,10.0,10,10,10,1,8.0,8.0,8,malignant,Dr. Wong
177,1201936,5.0,10.0,10,3,8,1,5.0,10.0,3,malignant,Dr. Lee
436,1295186,10.0,10.0,10,1,6,1,2.0,8.0,1,malignant,Dr. Doe


In [46]:
# média
data.clump_thickness.mean()

4.416905444126074

In [47]:
# benigno
print('Tamanho - benigno: ',data[data['class']=='benign']['cell_size_uniformity'].mean(),'\n')
print('Tamanho - maligno: ',data[data['class']=='malignant']['cell_size_uniformity'].mean(),'\n')
print('Proporcao Maligno X Benigno: ',data[data['class']=='malignant']['cell_size_uniformity'].mean() / data[data['class']=='benign']['cell_size_uniformity'].mean())

Tamanho - benigno:  1.3260393873085339 

Tamanho - maligno:  6.572614107883817 

Proporcao Maligno X Benigno:  4.95657532558235


In [48]:
data[(data['class']=='benign') & (data['doctor_name']=='Dr. Wong')]['mitoses'].max()

3

In [49]:
# Analisando as 10 primeiras linhas das colunas 2 a 5
data.loc[0:9,'cell_size_uniformity':'single_ep_cell_size']
#data.iloc[0:9,2:6] # usando iloc, podemos passar apenas os índices

Unnamed: 0,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size
0,1.0,1,1,2
1,4.0,4,5,7
2,1.0,1,1,2
3,8.0,8,1,3
4,1.0,1,3,2
5,10.0,10,8,7
6,,1,1,2
7,1.0,2,1,2
8,1.0,1,1,2
9,2.0,1,1,2


In [50]:
# acessando a última linha do dataframe
data[-1:]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
698,897471,4.0,8.0,8,5,4,5,10.0,4.0,1,malignant,Dr. Wong


### Lidando com valores faltantes

In [51]:
data.isna().sum()

id                       0
clump_thickness          1
cell_size_uniformity     1
cell_shape_uniformity    0
marginal_adhesion        0
single_ep_cell_size      0
bare_nuclei              2
bland_chromatin          4
normal_nucleoli          1
mitoses                  0
class                    0
doctor_name              0
dtype: int64

In [52]:
data = data.dropna(axis = 0, how = 'any')
len(data)

690

### Lidando com valores duplicados

In [53]:
data[data.duplicated()]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
258,1198641,3.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Lee


In [54]:
data = data.drop_duplicates()

In [55]:
data[data.duplicated()]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name


## Aplicando funções 

In [56]:
# encontrando o valor máximo em cada coluna
data.apply(np.max)

id                        13454352
clump_thickness                 10
cell_size_uniformity            10
cell_shape_uniformity           10
marginal_adhesion               10
single_ep_cell_size             10
bare_nuclei                      ?
bland_chromatin                 10
normal_nucleoli                 10
mitoses                         10
class                    malignant
doctor_name               Dr. Wong
dtype: object

In [57]:
data['bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [58]:
data['bare_nuclei'] = data['bare_nuclei'].replace('?','6').astype('int64')

In [59]:
#executando de novo para testar
data.apply(np.max)

id                        13454352
clump_thickness                 10
cell_size_uniformity            10
cell_shape_uniformity           10
marginal_adhesion               10
single_ep_cell_size             10
bare_nuclei                     10
bland_chromatin                 10
normal_nucleoli                 10
mitoses                         10
class                    malignant
doctor_name               Dr. Wong
dtype: object

### Funções de agregação

In [63]:
colunas = ['cell_size_uniformity','cell_shape_uniformity','mitoses']
data.groupby(['class'])[colunas].agg([np.mean, np.min, np.max])

Unnamed: 0_level_0,cell_size_uniformity,cell_size_uniformity,cell_size_uniformity,cell_shape_uniformity,cell_shape_uniformity,cell_shape_uniformity,mitoses,mitoses,mitoses
Unnamed: 0_level_1,mean,amin,amax,mean,amin,amax,mean,amin,amax
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
benign,1.330377,1.0,9.0,1.450111,1,8,1.064302,1,8
malignant,6.605042,1.0,10.0,6.592437,1,10,2.592437,1,10


### Filtrando dados

In [60]:
repeat_patients = data.groupby(by = 'id').size().sort_values(ascending =False)
repeat_patients

id
1182404    6
1276091    5
1100524    2
1293439    2
654546     2
1114570    2
1299596    2
1299924    2
1115293    2
1116116    2
1116192    2
560680     2
1320077    2
493452     2
1321942    2
466906     2
1143978    2
1105524    2
1168736    2
411453     2
695091     2
704097     2
733639     2
734111     2
1070935    2
1212422    2
769612     2
1061990    2
798429     2
1218860    2
          ..
1205579    1
1205138    1
1204898    1
1204558    1
1204242    1
1203096    1
1202812    1
1202253    1
1202125    1
1201936    1
1208301    1
1211202    1
1217952    1
1211265    1
1217717    1
1217264    1
1217051    1
1216947    1
1216694    1
1214966    1
1214556    1
1214092    1
1213784    1
1213383    1
1213375    1
1213273    1
1212251    1
1212232    1
1211594    1
61634      1
Length: 637, dtype: int64

In [61]:
filtered_patients = repeat_patients[repeat_patients > 2].to_frame().reset_index()
# ~ Apareceram até 2 X
filtered_data = data[~data.id.isin(filtered_patients.id)]

In [62]:
filtered_data

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
5,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith
7,1018561,2.0,1.0,2,1,2,1,3.0,1.0,1,benign,Dr. Smith
8,1033078,2.0,1.0,1,1,2,1,1.0,1.0,5,benign,Dr. Smith
9,1033078,4.0,2.0,1,1,2,1,2.0,1.0,1,benign,Dr. Doe
10,1035283,1.0,1.0,1,1,1,1,3.0,1.0,1,benign,Dr. Doe
