In [2]:
import pandas as pd
import numpy as np

In [21]:
# Toy edge list, one row per edge: (source, target, weight, color).
# 'color' is the only string column; it is the one the encoding examples act on.
edges = pd.DataFrame(
    [
        (0, 2, 3, 'red'),
        (1, 2, 4, 'blue'),
        (2, 3, 5, 'blue'),
    ],
    columns=['source', 'target', 'weight', 'color'],
)

edges

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


In [22]:
# Check the column dtypes: the three numeric columns are int64, 'color' holds strings (object).
edges.dtypes

source     int64
target     int64
weight     int64
color     object
dtype: object

In [23]:
# Selecting with a single label returns the column as a Series.
edges['color']

0     red
1    blue
2    blue
Name: color, dtype: object

** pd.get_dummies()
* one hot encoding
* 하나만 True로 바꿔준다
* 문자열을 나눠서 하나만 1로 바꿔준다.
* 범주형(문자열) 데이터를 숫자값으로 바꾸기 위해 사용한다.

In [24]:
# One-hot encode the whole frame: only the string column 'color' is expanded
# (into color_blue / color_red); the numeric columns pass through unchanged.
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [25]:
# Passing a Series: the indicator columns are named after the bare values (blue, red).
pd.get_dummies(edges['color'])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [26]:
# Passing a one-column DataFrame: the indicator columns get the column name as
# a prefix (color_blue, color_red).
pd.get_dummies(edges[['color']])

Unnamed: 0,color_blue,color_red
0,0,1
1,1,0
2,1,0


In [27]:
# Lookup table from numeric weight to a size label.
weight_dict = {
    3: "M",
    4: "L",
    5: "XL",
}
# Add the label as a new column on `edges` (the frame is modified in place).
edges["weight_sign"] = edges["weight"].map(weight_dict)

edges

Unnamed: 0,source,target,weight,color,weight_sign
0,0,2,3,red,M
1,1,2,4,blue,L
2,2,3,5,blue,XL


In [28]:
# One-hot encode the size labels: one indicator column per label (L, M, XL).
weight_sign = pd.get_dummies(edges['weight_sign'])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [29]:
# Put the indicator columns next to the original frame (axis=1 joins column-wise).
pd.concat([edges, weight_sign], axis = 1)

Unnamed: 0,source,target,weight,color,weight_sign,L,M,XL
0,0,2,3,red,M,0,1,0
1,1,2,4,blue,L,1,0,0
2,2,3,5,blue,XL,0,0,1


In [30]:
# Encode every string column at once and return the result as a NumPy array.
pd.get_dummies(edges).values
# Column order: source / target / weight / blue / red / L / M / XL

array([[0, 2, 3, 0, 1, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1]], dtype=int64)

In [None]:
# Example from - https://chrisalbon.com/python/pandas_binning_data.html

In [31]:
# Sample data: 12 soldiers in 3 regiments x 2 companies, each with a pre- and
# post-test score. No backslash continuations are needed inside brackets.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'name': [
        'Miller', 'Jacobson', 'Ali', 'Milner',
        'Cooze', 'Jacon', 'Ryaner', 'Sone',
        'Sloan', 'Piger', 'Riani', 'Ali',
    ],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 75, 62, 70],
}

# Pass the column order explicitly so the frame layout does not depend on dict ordering.
df = pd.DataFrame(
    raw_data,
    columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'],
)

df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,75


In [32]:
bins = [0, 25, 50, 75, 100] # Bin edges: 0 to 25, 25 to 50, 50 to 75, 75 to 100
group_names = ['Low', 'Okay', 'Good', 'Great']
categories = pd.cut(df['postTestScore'], bins, labels=group_names)
# pd.cut places each score into the bin it falls in and replaces it with that bin's label:
# 0 ~ 25 => Low / 25 ~ 50 => Okay / 50 ~ 75 => Good / 75 ~ 100 => Great
# A score sitting exactly on an edge goes to the lower bin (25 => Low, 75 => Good in the output).
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

In [33]:
# Store each row's binned post-test label as a new column on df.
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
# Count rows per label. The Series method replaces the top-level
# pd.value_counts(), which is deprecated in recent pandas releases.
# Labels with no rows (here 'Okay') are still listed, with a count of 0.
df['categories'].value_counts()

Good     8
Great    2
Low      2
Okay     0
Name: categories, dtype: int64

In [34]:
# One-hot encode every non-numeric column of df (regiment, company, name,
# categories); the two score columns pass through unchanged.
pd.get_dummies(df)

Unnamed: 0,preTestScore,postTestScore,regiment_Dragoons,regiment_Nighthawks,regiment_Scouts,company_1st,company_2nd,name_Ali,name_Cooze,name_Jacobson,...,name_Milner,name_Piger,name_Riani,name_Ryaner,name_Sloan,name_Sone,categories_Low,categories_Okay,categories_Good,categories_Great
0,4,25,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,24,94,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,31,57,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2,62,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,3,70,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,4,25,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,24,94,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7,31,57,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8,2,62,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
9,3,75,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


### 0.0.1 using scikit-learn preprocessing

In [45]:
# Convert the frame to a NumPy object array (one row per soldier).
# DataFrame.as_matrix() is deprecated and no longer exists in pandas >= 1.0;
# `.values` returns the same array and works on old and new pandas alike.
raw_example = df.values
raw_example[:10]

  """Entry point for launching an IPython kernel.


array([['Nighthawks', '1st', 'Miller', 4, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', 31, 57, 'Good'],
       ['Nighthawks', '2nd', 'Milner', 2, 62, 'Good'],
       ['Dragoons', '1st', 'Cooze', 3, 70, 'Good'],
       ['Dragoons', '1st', 'Jacon', 4, 25, 'Low'],
       ['Dragoons', '2nd', 'Ryaner', 24, 94, 'Great'],
       ['Dragoons', '2nd', 'Sone', 31, 57, 'Good'],
       ['Scouts', '1st', 'Sloan', 2, 62, 'Good'],
       ['Scouts', '1st', 'Piger', 3, 75, 'Good']], dtype=object)

In [46]:
# Work on a copy so that raw_example keeps the original string values.
data = raw_example.copy()

In [47]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder() 
# Import the preprocessing module, then create a LabelEncoder.

* 1. fit: 라벨 인코딩을 해줄 데이터가 무엇인지 인코더에 알려준다 (학습).
* 2. transform: 학습한 매핑으로 실제 데이터를 숫자 라벨로 바꿔준다.

In [48]:
# First column of the matrix: the regiment name of every row.
raw_example[:, 0]

array(['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons',
       'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts',
       'Scouts'], dtype=object)

In [49]:
le.fit(raw_example[:, 0])
# After fitting, le holds what it learned from this column of raw_example.

LabelEncoder()

In [50]:
# The distinct labels learned by fit, in sorted order; a label's position in
# this array is the integer code transform() assigns to it.
le.classes_

array(['Dragoons', 'Nighthawks', 'Scouts'], dtype=object)

In [51]:
# Show the full matrix before any column is encoded.
print(raw_example)

[['Nighthawks' '1st' 'Miller' 4 25 'Low']
 ['Nighthawks' '1st' 'Jacobson' 24 94 'Great']
 ['Nighthawks' '2nd' 'Ali' 31 57 'Good']
 ['Nighthawks' '2nd' 'Milner' 2 62 'Good']
 ['Dragoons' '1st' 'Cooze' 3 70 'Good']
 ['Dragoons' '1st' 'Jacon' 4 25 'Low']
 ['Dragoons' '2nd' 'Ryaner' 24 94 'Great']
 ['Dragoons' '2nd' 'Sone' 31 57 'Good']
 ['Scouts' '1st' 'Sloan' 2 62 'Good']
 ['Scouts' '1st' 'Piger' 3 75 'Good']
 ['Scouts' '2nd' 'Riani' 2 62 'Good']
 ['Scouts' '2nd' 'Ali' 3 70 'Good']]


In [52]:
data[:, 0] = le.transform(raw_example[:, 0])
print(data)
# Write the encoded value back into column 0 of every row.

[[1 '1st' 'Miller' 4 25 'Low']
 [1 '1st' 'Jacobson' 24 94 'Great']
 [1 '2nd' 'Ali' 31 57 'Good']
 [1 '2nd' 'Milner' 2 62 'Good']
 [0 '1st' 'Cooze' 3 70 'Good']
 [0 '1st' 'Jacon' 4 25 'Low']
 [0 '2nd' 'Ryaner' 24 94 'Great']
 [0 '2nd' 'Sone' 31 57 'Good']
 [2 '1st' 'Sloan' 2 62 'Good']
 [2 '1st' 'Piger' 3 75 'Good']
 [2 '2nd' 'Riani' 2 62 'Good']
 [2 '2nd' 'Ali' 3 70 'Good']]


In [53]:
# Same assignment as the previous cell (re-running it gives the same result);
# this time only the first three rows are displayed.
data[:, 0] = le.transform(raw_example[:, 0])
data[:3]

array([[1, '1st', 'Miller', 4, 25, 'Low'],
       [1, '1st', 'Jacobson', 24, 94, 'Great'],
       [1, '2nd', 'Ali', 31, 57, 'Good']], dtype=object)

In [54]:
# raw_example is untouched: only the copy `data` was modified.
raw_example

array([['Nighthawks', '1st', 'Miller', 4, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', 31, 57, 'Good'],
       ['Nighthawks', '2nd', 'Milner', 2, 62, 'Good'],
       ['Dragoons', '1st', 'Cooze', 3, 70, 'Good'],
       ['Dragoons', '1st', 'Jacon', 4, 25, 'Low'],
       ['Dragoons', '2nd', 'Ryaner', 24, 94, 'Great'],
       ['Dragoons', '2nd', 'Sone', 31, 57, 'Good'],
       ['Scouts', '1st', 'Sloan', 2, 62, 'Good'],
       ['Scouts', '1st', 'Piger', 3, 75, 'Good'],
       ['Scouts', '2nd', 'Riani', 2, 62, 'Good'],
       ['Scouts', '2nd', 'Ali', 3, 70, 'Good']], dtype=object)

In [29]:
# Label-encode every string column of the matrix (0 = regiment, 1 = company,
# 2 = name) and keep each fitted encoder, in column order, so the same mapping
# can be reused later.
label_column = [0, 1, 2]
label_encoder_list = []
for column_index in label_column:
    # Use a dedicated loop variable: rebinding and then `del`-ing the
    # notebook-level `le` would leave earlier cells that use `le` failing
    # with NameError when re-run.
    column_encoder = preprocessing.LabelEncoder()

    # fit_transform learns the labels and encodes the column in one call;
    # the codes are read from raw_example and written into the copy `data`.
    data[:, column_index] = column_encoder.fit_transform(raw_example[:, column_index])

    label_encoder_list.append(column_encoder)

data[:12]

array([[1, 0, 4, 4, 25],
       [1, 0, 2, 24, 94],
       [1, 1, 0, 31, 57],
       [1, 1, 5, 2, 62],
       [0, 0, 1, 3, 70],
       [0, 0, 3, 4, 25],
       [0, 1, 8, 24, 94],
       [0, 1, 10, 31, 57],
       [2, 0, 9, 2, 62],
       [2, 0, 6, 3, 75],
       [2, 1, 7, 2, 62],
       [2, 1, 0, 3, 70]], dtype=object)

In [30]:
# A stored encoder can be reused: index 0 is the encoder fitted on column 0,
# applied here to the first ten rows of that column.
label_encoder_list[0].transform(raw_example[:10, 0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2])

In [31]:
one_hot_enc = preprocessing.OneHotEncoder()
# reshape(-1, 1) turns the 1-D column into a 2-D array with a single column
# (one row per sample).
data[:, 0].reshape(-1, 1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [32]:
# Fit the one-hot encoder on the label-encoded first column, passed as a
# single-column 2-D array.
one_hot_enc.fit(data[:, 0].reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [33]:
# Number of distinct categories per encoded input column.
# `n_values_` was deprecated in scikit-learn 0.20 and is absent from later
# releases, so derive the counts from the fitted `categories_` attribute.
np.array([len(column_categories) for column_categories in one_hot_enc.categories_])



array([3])

In [34]:
# Category values the encoder learned for its single input column.
# `active_features_` was deprecated in scikit-learn 0.20 and is absent from
# later releases; `categories_` is the supported attribute. The column was
# label-encoded to 0, 1, 2, so these values match the feature indices the
# old attribute reported.
one_hot_enc.categories_[0]



array([0, 1, 2], dtype=int64)

In [35]:
# The same single-column 2-D view of the encoded first column that was fitted earlier.
data[:, 0].reshape(-1, 1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [36]:
# transform() returns a sparse matrix (the encoder was created with sparse=True);
# toarray() converts it to a dense array with one indicator column per category.
onehotlabels = one_hot_enc.transform(data[:,0].reshape(-1, 1)).toarray()
onehotlabels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])