In [37]:
import pandas as pd
from io import StringIO
import numpy as np

In [15]:
# Small in-memory CSV used to demonstrate missing values: the second data row
# has an empty C field and the third data row has no D field at all.
csv_data = (
    'A,B,C,D\n'
    '   1,2,3,4\n'
    '   5,6,,8\n'
    '   10,11,12\n'
)

In [16]:
# Show the raw string, including its newline characters and leading spaces.
csv_data

'A,B,C,D\n   1,2,3,4\n   5,6,,8\n   10,11,12\n'

In [17]:
# StringIO wraps the csv_data string in a file-like object, so read_csv can
# parse it into a DataFrame as if it were a regular CSV file on disk.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [18]:
# Boolean mask with the same shape as df: True marks a missing cell.
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [19]:
# Summing the boolean mask per column gives the number of missing values in each column.
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [20]:
# The DataFrame's underlying NumPy array; missing cells appear as nan.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [21]:
# Drop every row that contains at least one NaN (axis='index' is the same as axis=0).
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0


In [22]:
# Drop every column that contains at least one NaN (axis='columns' is the same as axis=1).
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


In [23]:
# Drop only the rows in which every column is NaN.
# No row here is completely empty, so the result equals the original frame.
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [25]:
# Keep only the rows that have at least 4 non-NaN values
# (thresh counts the values that are present, not the missing ones).
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0


In [26]:
# Drop only the rows that have a NaN in one of the listed columns (here: 'C').
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
2,10,11,12.0,


In [29]:
# sklearn.preprocessing.Imputer was removed from scikit-learn; its replacement
# is sklearn.impute.SimpleImputer, which always imputes column-wise.
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
# The missing-value marker is the float np.nan, not the string 'NaN'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)                 # learn the column means
imputed_data = imr.transform(df.values)  # fill the NaNs with those means
imputed_data



array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [30]:
# sklearn.preprocessing.Imputer (and its axis argument) was removed from
# scikit-learn. SimpleImputer only works column-wise, so row means are obtained
# by imputing the transposed array and transposing the result back.
from sklearn.impute import SimpleImputer

imr2 = SimpleImputer(missing_values=np.nan, strategy='mean')
# After transposing, each original row is a column, so the learned
# statistics are the row means of df.
imr2 = imr2.fit(df.values.T)
imputed_data2 = imr2.transform(df.values.T).T
imputed_data2



array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  6.33333333,  8.        ],
       [10.        , 11.        , 12.        , 11.        ]])

You can also impute missing values with the *median* or *most_frequent* strategy.

The imputer belongs to the **transformer** classes in scikit-learn.
**fit** learns the parameters (for example, the column means) from the training data, and
**transform** applies those learned parameters to transform data.
Any data array that is to be transformed must have the same number of features as the data array that was used to fit the model.

# nominal and ordinal features

In [32]:
# Toy data set: one row per item, with the column names given directly
# to the constructor.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [33]:
# Ordinal feature: the sizes have a natural order (M < L < XL), so the
# integer codes are assigned by hand to preserve that order.
size_mapping = dict(XL=3, L=2, M=1)

# Replace each size label with its integer code.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [34]:
# Reverse lookup (integer code -> size label) built by swapping the keys and
# values of size_mapping.
inv_size_mapping = {
    v: k for k, v in size_mapping.items()
}
# Map the integer codes back to the original labels. The original line passed
# the undefined name `inv_size_mappingsize_mapping`, which raises a NameError.
df['size'] = df['size'].map(inv_size_mapping)

In [35]:
# Check that 'size' holds the original string labels again.
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [39]:
# Class labels are nominal (no order), so any consistent numbering works:
# each distinct label gets its position in the sorted output of np.unique.
class_mapping = dict(
    (label, code) for code, label in enumerate(np.unique(df['classlabel']))
)
class_mapping

{'class1': 0, 'class2': 1}

In [40]:
# Replace the string class labels with their integer codes from class_mapping.
df['classlabel'] = df['classlabel'].map(class_mapping)

In [41]:
# 'classlabel' now holds the integer codes.
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0


In [43]:
# Reverse lookup (integer code -> class label): pair every value of
# class_mapping with its key.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

# Translate the integer codes back into the original string labels.
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [44]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers with scikit-learn.
# fit() learns the distinct labels and transform() converts them to codes;
# fit_transform() is the one-call shortcut for these two steps.
class_le = LabelEncoder()
class_le.fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([0, 1, 0])

In [45]:
class_le.inverse_transform(y) # inverse_transform() maps the integer codes back to the original string labels

array(['class1', 'class2', 'class1'], dtype=object)

In [46]:
# Feature matrix as a NumPy array: all rows, the three feature columns.
X = df.loc[:, ['color', 'size', 'price']].values
X

array([['green', 'M', 10.1],
       ['red', 'L', 13.5],
       ['blue', 'XL', 15.3]], dtype=object)

In [47]:
# Encode the color column (index 0) as integers, overwriting it in place.
# fit() returns the encoder itself, so transform() can be chained onto it.
color_le = LabelEncoder()
X[:, 0] = color_le.fit(X[:, 0]).transform(X[:, 0])
X

array([[1, 'M', 10.1],
       [2, 'L', 13.5],
       [0, 'XL', 15.3]], dtype=object)

In [55]:
# Encode the size column (index 1) as integers, overwriting it in place.
# NOTE(review): the resulting codes are L=0, M=1, XL=2, which does not follow
# the M < L < XL order used by size_mapping earlier; confirm that this is
# acceptable for an ordinal feature.
size_le = LabelEncoder()
X[:,1] = size_le.fit_transform(X[:,1])
X

array([[1, 1, 10.1],
       [2, 0, 13.5],
       [0, 2, 15.3]], dtype=object)

In [56]:
# One-hot encoding: instead of integer codes (which imply an order), create
# one dummy column per color value.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` argument was removed from OneHotEncoder, so the
# column selection is done by a ColumnTransformer instead: column 0 is one-hot
# encoded and the remaining columns are passed through unchanged.
ohe = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# X is an object array, so cast the combined result to float for display.
ohe.fit_transform(X).astype(float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  0. , 13.5],
       [ 1. ,  0. ,  0. ,  2. , 15.3]])

In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `categorical_features` was removed from OneHotEncoder, so column 0 is
# selected through a ColumnTransformer and the other columns are passed
# through. sparse_threshold=0 makes the transformer always return a dense
# array, so no .toarray() call is needed.
ohe = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# X is an object array, so cast the combined result to float for display.
ohe.fit_transform(X).astype(float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  0. , 13.5],
       [ 1. ,  0. ,  0. ,  2. , 15.3]])

In [58]:
# pandas alternative to one-hot encoding: get_dummies expands only the string
# columns of a DataFrame into dummy columns and leaves the other columns as is.
pd.get_dummies(data=df[['price', 'color', 'size']])

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [59]:
# The dummy columns of one feature are collinear (they always sum to 1), so
# one of them can be dropped without losing information: the dropped category
# is implied when all remaining dummies are 0.
pd.get_dummies(data=df[['price', 'color', 'size']], drop_first=True)

Unnamed: 0,price,color_green,color_red,size_M,size_XL
0,10.1,1,0,1,0
1,13.5,0,1,0,0
2,15.3,0,0,0,1


In [60]:
# The same "drop the first dummy column" effect with scikit-learn.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `categorical_features` was removed from OneHotEncoder, so column 0 is
# selected through a ColumnTransformer; sparse_threshold=0 forces a dense result.
ohe = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# The one-hot columns come first in the output, so slicing off column 0
# removes the first dummy column.
ohe.fit_transform(X).astype(float)[:, 1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  0. , 13.5],
       [ 0. ,  0. ,  2. , 15.3]])