In [1]:
import pandas as pd


In [2]:
import pandas as pd
from io import StringIO
csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''
# If you are using Python 2.7, you need
# to convert the string to unicode:
# csv_data = unicode(csv_data)
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


# 1. dealing with missing data

## 1.1 deleting training examples or features with missing values

delete examples

In [3]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


delete features

In [4]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


## 1.2 Imputing missing values - interpolation techniques.

### sklearn impute.SimpleImputer

In [5]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [6]:
from sklearn.impute import SimpleImputer
import numpy as np

In [7]:
si = SimpleImputer(missing_values=np.nan, strategy='mean')

In [9]:
imputed_data = si.fit_transform(df)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

### pandas' fillna method

In [11]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [10]:
df.mean()

A    5.333333
B    6.333333
C    7.500000
D    6.000000
dtype: float64

In [12]:
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


### knn imputer

In [13]:
from sklearn.impute import KNNImputer

In [26]:
knn_im = KNNImputer(missing_values=np.nan, n_neighbors=1)


In [27]:
knn_im.fit(df)

In [28]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [29]:
knn_im.transform(df)

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  3.,  8.],
       [10., 11., 12.,  8.]])

# 2. handling categorical data

In [30]:
import pandas as pd
df = pd.DataFrame([
['green', 'M', 10.1, 'class2'],
['red', 'L', 13.5, 'class1'],
['blue', 'XL', 15.3, 'class2']])
df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


## 2.1 Mapping ordinal features

In [60]:
size_mapping = {'XL' : 3,
                'L' : 2,
                'M' : 1}

In [61]:
df['size'] = df['size'].map(size_mapping)

In [62]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


back to the original string representation

In [37]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}
inv_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

In [38]:
df['size'] = df['size'].map(inv_size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


### Optinal: encoding ordinal features

In [81]:
df = pd.DataFrame([['green', 'M', 10.1,
'class2'],
['red', 'L', 13.5,
'class1'],
['blue', 'XL', 15.3,
'class2']])

In [83]:
df.columns = ['color', 'size', 'price', 'label']
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [84]:
df['x > M'] = df['size'].apply(
    lambda x: 1 if x in {'L', 'XL'} else 0
)
df['x > L'] = df['size'].apply(
    lambda x: 1 if x == 'XL' else 0
)

In [85]:
df

Unnamed: 0,color,size,price,label,x > M,x > L
0,green,M,10.1,class2,0,0
1,red,L,13.5,class1,1,0
2,blue,XL,15.3,class2,1,1


## 2.2 encoding class lables

In [39]:
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [40]:
df.classlabel.unique()

array(['class2', 'class1'], dtype=object)

In [45]:
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}

- np.unique sắp xếp từ bé đến lớn
- series.unique: lớn -> bé

In [46]:
class_mapping

{'class1': 0, 'class2': 1}

In [48]:
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,0
2,blue,XL,15.3,1


- back to the label

In [50]:
inv_class_mapping = {v: k for k, v in class_mapping.items()}

In [51]:
inv_class_mapping

{0: 'class1', 1: 'class2'}

In [52]:
df['classlabel'].map(inv_class_mapping)

0    class2
1    class1
2    class2
Name: classlabel, dtype: object

## 2.3 Perfoming one-hot encoding on nominal features

In [53]:
from sklearn.preprocessing import OneHotEncoder

In [75]:
ohe = OneHotEncoder(drop='first')

In [63]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [67]:
X = df[['color', 'size', 'price']].values
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [76]:
ohe.fit(X[:, 0].reshape(-1, 1))

In [77]:
ohe.transform(X[:, 0].reshape(-1, 1)).toarray()

array([[1., 0.],
       [0., 1.],
       [0., 0.]])

In [78]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [79]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first')
X = df[['color', 'size', 'price']].values
c_transf = ColumnTransformer([
    ('onehot', ohe, [0]),
    ('nothing', 'passthrough', [1, 2])
])

In [80]:
c_transf.fit_transform(X)

array([[1.0, 0.0, 1, 10.1],
       [0.0, 1.0, 2, 13.5],
       [0.0, 0.0, 3, 15.3]], dtype=object)