## Missing Values

In [1]:
import pandas as pd
import numpy as np

In [8]:
raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70]}

df = pd.DataFrame(raw_data,columns=['first_name','last_name','age','sex','preTestScore','postTestScore'])
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
df.isnull().sum()/len(df)

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
dtype: float64

In [10]:
df_no_missing = df.dropna()
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [11]:
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [17]:
df_cleaned = df.dropna(how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [18]:
df['location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [19]:
df.dropna(axis=1,how='all')

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [20]:
df.dropna(axis=0,thresh=1)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [25]:
df.dropna(thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [26]:
df.fillna(0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


In [27]:
df["preTestScore"].mean()

3.0

In [28]:
df["postTestScore"].median()

62.0

In [29]:
df["preTestScore"]

0    4.0
1    NaN
2    NaN
3    2.0
4    3.0
Name: preTestScore, dtype: float64

In [32]:
df["preTestScore"].fillna(df["preTestScore"].mean(),inplace=True)

In [33]:
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [35]:
df.groupby("sex")['postTestScore'].transform('mean')

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [37]:
df["postTestScore"].fillna(df.groupby('sex')['postTestScore'].transform("mean"),inplace=True)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


### Category data

#### One-Hot encoding

In [58]:
edges = pd.DataFrame({'source': [0, 1, 2],
                   'target': [2, 2, 3],
                       'weight': [3, 4, 5],
                       'color': ['red', 'blue', 'blue']})

edges

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


In [59]:
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [60]:
pd.get_dummies(edges["color"])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [61]:
pd.get_dummies(edges[['color']])

Unnamed: 0,color_blue,color_red
0,0,1
1,1,0
2,1,0


In [62]:
edges

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


In [63]:
weight_dict = {3:"M", 4:"L", 5:"XL"}
edges["weight_sign"]=edges['weight'].map(weight_dict)
edges

Unnamed: 0,source,target,weight,color,weight_sign
0,0,2,3,red,M
1,1,2,4,blue,L
2,2,3,5,blue,XL


In [64]:
weight_sign = pd.get_dummies(edges['weight_sign'])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [65]:
pd.concat([edges,weight_sign],axis=1)

Unnamed: 0,source,target,weight,color,weight_sign,L,M,XL
0,0,2,3,red,M,0,1,0
1,1,2,4,blue,L,1,0,0
2,2,3,5,blue,XL,0,0,1


In [68]:
edges = pd.get_dummies(edges)
edges.as_matrix()

  


array([[0, 2, 3, 0, 1, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1]], dtype=int64)

## Data Binning

In [71]:
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}

df = pd.DataFrame(raw_data)
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [72]:
bins=[0,25,50,75,100]
group_names=['Low','Okay','Good','Great']
categories = pd.cut(df['postTestScore'],bins,labels=group_names)
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

In [73]:
df['categories']=pd.cut(df['postTestScore'],bins,labels=group_names)
pd.value_counts(df['categories'])

Good     8
Great    2
Low      2
Okay     0
Name: categories, dtype: int64

In [75]:
pd.get_dummies(df)

Unnamed: 0,preTestScore,postTestScore,regiment_Dragoons,regiment_Nighthawks,regiment_Scouts,company_1st,company_2nd,name_Ali,name_Cooze,name_Jacobson,...,name_Milner,name_Piger,name_Riani,name_Ryaner,name_Sloan,name_Sone,categories_Low,categories_Okay,categories_Good,categories_Great
0,4,25,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,24,94,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,31,57,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2,62,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,3,70,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,4,25,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,24,94,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7,31,57,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8,2,62,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
9,3,70,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


### Label encoding by sklearn

In [78]:
raw_example=df.as_matrix()
raw_example

  """Entry point for launching an IPython kernel.


array([['Nighthawks', '1st', 'Miller', 4, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', 31, 57, 'Good'],
       ['Nighthawks', '2nd', 'Milner', 2, 62, 'Good'],
       ['Dragoons', '1st', 'Cooze', 3, 70, 'Good'],
       ['Dragoons', '1st', 'Jacon', 4, 25, 'Low'],
       ['Dragoons', '2nd', 'Ryaner', 24, 94, 'Great'],
       ['Dragoons', '2nd', 'Sone', 31, 57, 'Good'],
       ['Scouts', '1st', 'Sloan', 2, 62, 'Good'],
       ['Scouts', '1st', 'Piger', 3, 70, 'Good'],
       ['Scouts', '2nd', 'Riani', 2, 62, 'Good'],
       ['Scouts', '2nd', 'Ali', 3, 70, 'Good']], dtype=object)

In [80]:
data = raw_example.copy()
data

array([['Nighthawks', '1st', 'Miller', 4, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', 31, 57, 'Good'],
       ['Nighthawks', '2nd', 'Milner', 2, 62, 'Good'],
       ['Dragoons', '1st', 'Cooze', 3, 70, 'Good'],
       ['Dragoons', '1st', 'Jacon', 4, 25, 'Low'],
       ['Dragoons', '2nd', 'Ryaner', 24, 94, 'Great'],
       ['Dragoons', '2nd', 'Sone', 31, 57, 'Good'],
       ['Scouts', '1st', 'Sloan', 2, 62, 'Good'],
       ['Scouts', '1st', 'Piger', 3, 70, 'Good'],
       ['Scouts', '2nd', 'Riani', 2, 62, 'Good'],
       ['Scouts', '2nd', 'Ali', 3, 70, 'Good']], dtype=object)

In [82]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [83]:
raw_example[:,0]

array(['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons',
       'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts',
       'Scouts'], dtype=object)

In [84]:
le.fit(raw_example[:,0])

LabelEncoder()

In [85]:
le.classes_

array(['Dragoons', 'Nighthawks', 'Scouts'], dtype=object)

In [86]:
le.transform(raw_example[:,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])

In [88]:
data[:,0]=le.transform(raw_example[:,0])
data

array([[1, '1st', 'Miller', 4, 25, 'Low'],
       [1, '1st', 'Jacobson', 24, 94, 'Great'],
       [1, '2nd', 'Ali', 31, 57, 'Good'],
       [1, '2nd', 'Milner', 2, 62, 'Good'],
       [0, '1st', 'Cooze', 3, 70, 'Good'],
       [0, '1st', 'Jacon', 4, 25, 'Low'],
       [0, '2nd', 'Ryaner', 24, 94, 'Great'],
       [0, '2nd', 'Sone', 31, 57, 'Good'],
       [2, '1st', 'Sloan', 2, 62, 'Good'],
       [2, '1st', 'Piger', 3, 70, 'Good'],
       [2, '2nd', 'Riani', 2, 62, 'Good'],
       [2, '2nd', 'Ali', 3, 70, 'Good']], dtype=object)

In [92]:
label_column=[0,1,2,5]
label_enconder_list = []

for column_index in label_column:
    le = preprocessing.LabelEncoder()
    le.fit(raw_example[:,column_index])
    data[:,column_index]=le.transform(raw_example[:,column_index])
    label_enconder_list.append(le)
    del le
data

array([[1, 0, 4, 4, 25, 2],
       [1, 0, 2, 24, 94, 1],
       [1, 1, 0, 31, 57, 0],
       [1, 1, 5, 2, 62, 0],
       [0, 0, 1, 3, 70, 0],
       [0, 0, 3, 4, 25, 2],
       [0, 1, 8, 24, 94, 1],
       [0, 1, 10, 31, 57, 0],
       [2, 0, 9, 2, 62, 0],
       [2, 0, 6, 3, 70, 0],
       [2, 1, 7, 2, 62, 0],
       [2, 1, 0, 3, 70, 0]], dtype=object)

In [93]:
label_enconder_list[0].transform(raw_example[:10,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2])

In [94]:
one_hot_enc = preprocessing.OneHotEncoder()
data[:,0].reshape(-1,1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [95]:
one_hot_enc.fit(data[:,0].reshape(-1,1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [96]:
one_hot_enc.n_values_



array([3])

In [97]:
one_hot_enc.active_features_



array([0, 1, 2], dtype=int64)

In [98]:
data[:,0].reshape(-1,1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [99]:
onehotlabels = one_hot_enc.transform(data[:,0].reshape(-1,1)).toarray()
onehotlabels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

## Feature scaling

In [100]:
df = pd.DataFrame({'A':[14.00,90.20,90.95,96.27,91.21],'B':[103.02,107.26,110.35,114.23,114.68], 'C':['big','small','big','small','small']})
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


In [101]:
df["A"]

0    14.00
1    90.20
2    90.95
3    96.27
4    91.21
Name: A, dtype: float64

In [102]:
df["A"]-df["A"].min()

0     0.00
1    76.20
2    76.95
3    82.27
4    77.21
Name: A, dtype: float64

In [103]:
(df["A"]-df["A"].min())/(df["A"].max()-df["A"].min())

0    0.000000
1    0.926219
2    0.935335
3    1.000000
4    0.938495
Name: A, dtype: float64

### Min-Max Normalization

In [110]:
df["A"]=(df["A"]-df["A"].min())/(df["A"].max()-df["A"].min())*(5-1)+1
df

Unnamed: 0,A,B,C
0,1.0,103.02,big
1,4.704874,107.26,small
2,4.741339,110.35,big
3,5.0,114.23,small
4,4.753981,114.68,small


In [112]:
from sklearn import preprocessing

df=pd.io.parsers.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
     header=None,usecols=[0,1,2])
df.columns=['Class label','Alcohol',"Malic acid"]
df

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.20,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59
...,...,...,...
173,3,13.71,5.65
174,3,13.40,3.91
175,3,13.27,4.28
176,3,13.17,2.59


In [113]:
std_scaler = preprocessing.StandardScaler().fit(df[['Alcohol','Malic acid']])

In [114]:
df_std = std_scaler.transform(df[["Alcohol","Malic acid"]])
df_std

array([[ 1.51861254, -0.5622498 ],
       [ 0.24628963, -0.49941338],
       [ 0.19687903,  0.02123125],
       [ 1.69154964, -0.34681064],
       [ 0.29570023,  0.22769377],
       [ 1.48155459, -0.51736664],
       [ 1.71625494, -0.4186237 ],
       [ 1.3086175 , -0.16727801],
       [ 2.25977152, -0.62508622],
       [ 1.0615645 , -0.88540853],
       [ 1.3580281 , -0.15830138],
       [ 1.38273339, -0.76871232],
       [ 0.92568536, -0.54429654],
       [ 2.16095032, -0.54429654],
       [ 1.70390229, -0.4186237 ],
       [ 0.77745356, -0.47248348],
       [ 1.60508109, -0.37374054],
       [ 1.02450655, -0.68792264],
       [ 1.46920194, -0.66996938],
       [ 0.78980621,  0.68550197],
       [ 1.3086175 , -0.63406285],
       [-0.08723191,  1.31386618],
       [ 0.87627476, -0.42760033],
       [-0.18605311, -0.66099274],
       [ 0.61686912, -0.47248348],
       [ 0.06099988, -0.25704433],
       [ 0.48098997, -0.50839001],
       [ 0.36981612, -0.55327317],
       [ 1.07391715,

In [116]:
minmax_scaler=preprocessing.MinMaxScaler().fit(df[["Alcohol","Malic acid"]])
minmax_scaler.transform(df[["Alcohol","Malic acid"]])

array([[0.84210526, 0.1916996 ],
       [0.57105263, 0.2055336 ],
       [0.56052632, 0.3201581 ],
       [0.87894737, 0.23913043],
       [0.58157895, 0.36561265],
       [0.83421053, 0.20158103],
       [0.88421053, 0.22332016],
       [0.79736842, 0.27865613],
       [1.        , 0.17786561],
       [0.74473684, 0.12055336],
       [0.80789474, 0.28063241],
       [0.81315789, 0.14624506],
       [0.71578947, 0.19565217],
       [0.97894737, 0.19565217],
       [0.88157895, 0.22332016],
       [0.68421053, 0.21146245],
       [0.86052632, 0.23320158],
       [0.73684211, 0.16403162],
       [0.83157895, 0.16798419],
       [0.68684211, 0.46640316],
       [0.79736842, 0.17588933],
       [0.5       , 0.60474308],
       [0.70526316, 0.22134387],
       [0.47894737, 0.16996047],
       [0.65      , 0.21146245],
       [0.53157895, 0.25889328],
       [0.62105263, 0.20355731],
       [0.59736842, 0.19367589],
       [0.74736842, 0.22924901],
       [0.78684211, 0.18577075],
       [0.