In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy dataset with deliberately implausible entries (age 1000, month 21) and
# the literal string 'unknown', used to practise missing-value handling.
df = pd.DataFrame(
    data=[
        [42, 'male', 12, 'reading', 'class2'],
        [35, 'unknown', 3, 'cooking', 'class1'],
        [1000, 'female', 7, 'cycling', 'class3'],
        [1000, 'unknown', 21, 'unknown', 'unknown'],
    ],
    columns=['age', 'gender', 'month_birth', 'hobby', 'target'],
)

In [3]:
# Show the raw frame as entered, placeholder values included.
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42,male,12,reading,class2
1,35,unknown,3,cooking,class1
2,1000,female,7,cycling,class3
3,1000,unknown,21,unknown,unknown


In [4]:
df['age'].unique() # list the distinct values of the column (duplicates removed)

array([  42,   35, 1000])

In [5]:
# Distinct gender labels present in the data.
df['gender'].unique()

array(['male', 'unknown', 'female'], dtype=object)

In [6]:
# Distinct birth months; values above 12 are treated as missing further down.
df['month_birth'].unique()

array([12,  3,  7, 21])

In [7]:
# Distinct hobby labels present in the data.
df['hobby'].unique()

array(['reading', 'cooking', 'cycling', 'unknown'], dtype=object)

In [8]:
# Distinct target labels present in the data.
df['target'].unique()

array(['class2', 'class1', 'class3', 'unknown'], dtype=object)

In [9]:
# Turn placeholder / impossible values into real missing values (NaN).
#
# Series.mask(cond) returns a NEW Series with NaN wherever cond is True and
# upcasts the dtype as needed (int64 -> float64 for 'age' and 'month_birth').
# Writing np.nan into an int64 column through `.loc[...] = np.nan` is an
# implicit upcast, which pandas >= 2.1 deprecates (FutureWarning) and later
# versions reject, so the columns are rebuilt instead of patched in place.
df = df.assign(
    age=df['age'].mask(df['age'] > 150),                          # ages above 150 are not plausible
    gender=df['gender'].mask(df['gender'] == 'unknown'),
    month_birth=df['month_birth'].mask(df['month_birth'] > 12),   # a month number cannot exceed 12
    hobby=df['hobby'].mask(df['hobby'] == 'unknown'),
    target=df['target'].mask(df['target'] == 'unknown'),
)

In [10]:
# Show the frame after the placeholder values were replaced by NaN.
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3
3,,,,,


In [11]:
# Count the missing values in each column.
df.isna().sum()

age            2
gender         2
month_birth    1
hobby          1
target         1
dtype: int64

In [12]:
# Keep only the rows that contain no missing value at all.
df2 = df.dropna(axis='index', how='any')

In [13]:
# Rows that survived the row-wise dropna.
df2

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2


In [14]:
# Drop every column that contains at least one missing value.
# All five columns do here, so only the index is left.
df3 = df.dropna(axis='columns')
df3

0
1
2
3


In [15]:
# how='all' removes a row only when every one of its values is missing.
df4 = df.dropna(axis='index', how='all')
df4

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [16]:
df5 = df.dropna(thresh=2) # keep only rows with at least 2 non-missing values (thresh counts present values, not missing ones)

In [17]:
# Rows that met the thresh=2 requirement.
df5

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [20]:
# Replacement to use, column by column, wherever a value is missing.
alter_values = dict(age=0, gender='U', month_birth=0, hobby='U', target='class4')
df7 = df.fillna(alter_values)

In [21]:
# Frame with every missing value filled from alter_values.
df7

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [22]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy. A plain assignment would only give the same
# DataFrame a second name, so later writes to df8['target'] would silently
# change df7 as well.
df8 = df7.copy()

# Encode the string class labels as integer codes
# (for this data: 'class1' -> 0, 'class2' -> 1, 'class3' -> 2, 'class4' -> 3).
class_label = LabelEncoder()
data_value = df8['target'].to_numpy()
y_new = class_label.fit_transform(data_value)
y_new

array([1, 0, 2, 3])

In [25]:
# Replace the string labels in 'target' with their integer codes.
df8['target'] = y_new
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,1
1,35.0,U,3.0,cooking,0
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [26]:
# Map the integer codes back to the original string labels.
y_ori = class_label.inverse_transform(y_new)
y_ori

array(['class2', 'class1', 'class3', 'class4'], dtype=object)

In [27]:
# Put the decoded string labels back into the 'target' column.
df8['target'] = y_ori
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [28]:
# One-hot encode the target labels.
# Copy instead of aliasing, so the astype() write below does not modify df8.
df9 = df8.copy()
df9['target'] = df9['target'].astype(str)

# The dtype is pinned because pandas >= 2.0 returns bool indicator columns by
# default; uint8 keeps the numeric 0/1 matrix that earlier versions produced.
df10 = pd.get_dummies(df9['target'], dtype=np.uint8)

# Bare expression instead of print() so the notebook renders the frame richly.
df10

   class1  class2  class3  class4
0       0       1       0       0
1       1       0       0       0
2       0       0       1       0
3       0       0       0       1
