In [25]:
import pandas as pd
import numpy as np

## Breast cancer dataframe

Adding column names correspondigly to `breast-cancer-wisconsin.names` file

In [26]:
names = ["id_number", "clump_thickness", "uniformity_cell_size", "uniformity_cell_shape", 
         "marginal_adhesion", "epithelial_cell_size", "bare_nuclei", "bland_chromatin", 
         "normal_nucleoli", "mitoses", "class"]
breast_cancer_df = pd.read_csv("./data/breast-cancer-wisconsin.data", names=names)
breast_cancer_df.head()

Unnamed: 0,id_number,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [27]:
breast_cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id_number              699 non-null    int64 
 1   clump_thickness        699 non-null    int64 
 2   uniformity_cell_size   699 non-null    int64 
 3   uniformity_cell_shape  699 non-null    int64 
 4   marginal_adhesion      699 non-null    int64 
 5   epithelial_cell_size   699 non-null    int64 
 6   bare_nuclei            699 non-null    object
 7   bland_chromatin        699 non-null    int64 
 8   normal_nucleoli        699 non-null    int64 
 9   mitoses                699 non-null    int64 
 10  class                  699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


Since there's no need of `id_number` if we got dataframe indexes, then `id_number` column will be dropped

In [28]:
breast_cancer_df.drop(columns=['id_number'], inplace=True)

Replacing missing values in `bare_nuclei` from `'?'` to `NaN`

In [29]:
breast_cancer_df.replace('?', np.nan, inplace=True)
breast_cancer_df.head()

Unnamed: 0,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


Saving clean dataset to `.csv` file

In [30]:
breast_cancer_df.to_csv('./data/csv/breast_cancer.csv', index=False)

## ZOO dataset

In [31]:
names = ["animal", "hair", "feathers", "eggs", "milk", "airborne", "aquatic", "predator", "toothed", "backbone", "breathes", "venomous", "fins", "legs", "tail", "domestic", "catsize", "class"]
zoo_df = pd.read_csv("./data/zoo.data", names=names)
zoo_df.head()

Unnamed: 0,animal,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


As we don't need the name of specific animal (each record in this column is unique, we will predict the `type`) we will drop it

In [32]:
zoo_df.drop(columns='animal', inplace=True)

In [33]:
zoo_df.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


There are no missing values so we can save this dataframe to file

In [34]:
zoo_df.to_csv('./data/csv/zoo.csv', index=False)

## Iris dataset

In [42]:
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_df = pd.read_csv("./data/iris.data", names=names)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [43]:
iris_df.info

<bound method DataFrame.info of      sepal_length  sepal_width  petal_length  petal_width           class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]>

In [44]:
# Converting numerical attributes to categorical attributes (ranges)
sepal_length = pd.cut(iris_df['sepal_length'], 5).astype(str)
sepal_width = pd.cut(iris_df['sepal_width'], 5).astype(str)
petal_length = pd.cut(iris_df['petal_length'], 5).astype(str)
petal_width = pd.cut(iris_df['petal_width'], 5).astype(str)
iris_df['sepal_length'] = sepal_length
iris_df['sepal_width'] = sepal_width
iris_df['petal_length'] = petal_length
iris_df['petal_width'] = petal_width

# Storing the clean dataset in a .csv file
iris_df.to_csv('./data/csv/iris.csv', index=False)