In [1]:
# Strategies for dealing with missing data
# - Drop any sample (row) that has missing data
# - Choose a minimum number of missing values and drop the samples that reach it
# - Drop a feature (column) entirely when it holds almost no data
# - Fill the missing entries with the mode or the mean

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Toy roster: row 1 is completely empty and row 2 lacks both test scores.
raw_data = {
    'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    'age': [42, np.nan, 36, 24, 73],
    'sex': ['m', np.nan, 'f', 'm', 'f'],
    'preTestScore': [4, np.nan, np.nan, 2, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70],
}
# The dict's insertion order is already the desired column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [5]:
# Count the NaN entries in each column
df.isna().sum()

first_name       1
last_name        1
age              1
sex              1
preTestScore     2
postTestScore    2
dtype: int64

In [6]:
# Keep only the rows that have no missing value at all
df_no_missing = df.dropna(axis='index', how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [7]:
# Drop a row only when every one of its values is missing
df_cleaned = df.dropna(axis='index', how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [8]:
# Add a 'location' column made up entirely of NaN
df.loc[:, 'location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [9]:
# thresh counts non-NaN values: keep only the columns holding at least 3 of them
df.dropna(axis='columns', thresh=3)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [10]:
# Drop the columns in which every value is missing
df.dropna(axis='columns', how='all')

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [11]:
# Drop the rows that hold fewer than 5 non-NaN values
df.dropna(axis='index', thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [13]:
# Filling in missing data

In [15]:
# Put 0 wherever a value is missing (returns a new frame; df itself is untouched)
df.fillna(value=0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


In [17]:
# Fill the gaps in preTestScore with that column's mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['preTestScore']: an in-place call on a
# selected column is chained assignment, which warns on recent pandas and
# leaves df unchanged when Copy-on-Write is active.
df['preTestScore'] = df['preTestScore'].fillna(df['preTestScore'].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [19]:
# Fill the gaps in postTestScore with the mean of the row's own 'sex' group.
# transform("mean") yields a Series aligned to df's index, so fillna can take
# the group mean row by row. Rows whose 'sex' is NaN belong to no group and
# therefore stay NaN.
# The result is assigned back to the column rather than using inplace=True on
# df['postTestScore'], which is chained assignment and does not update df when
# Copy-on-Write is active.
df['postTestScore'] = df['postTestScore'].fillna(
    df.groupby("sex")["postTestScore"].transform("mean")
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [20]:
# Keep the rows where both 'age' and 'sex' are present
df.dropna(subset=['age', 'sex'])

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [21]:
# One-Hot Encoding
import pandas as pd
import numpy as np

In [22]:
# Small edge list: three numeric columns plus one text ('color') column
edges = pd.DataFrame(
    {
        'source': [0, 1, 2],
        'target': [2, 2, 3],
        'weight': [3, 4, 5],
        'color': ['red', 'blue', 'blue'],
    }
)

In [23]:
# Integer column: already numeric
edges.loc[:, 'source']

0    0
1    1
2    2
Name: source, dtype: int64

In [24]:
# Text column: the one that needs encoding
edges.loc[:, 'color']

0     red
1    blue
2    blue
Name: color, dtype: object

In [25]:
# One-hot encode the text columns; numeric columns pass through unchanged
pd.get_dummies(data=edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [26]:
# Lookup table from numeric weight to a size label
weight_dict = {
    3: 'M',
    4: 'L',
    5: 'XL',
}
# Store the label for each row's weight in a new 'weight_sign' column
edges['weight_sign'] = edges['weight'].map(weight_dict)
edges

Unnamed: 0,source,target,weight,color,weight_sign
0,0,2,3,red,M
1,1,2,4,blue,L
2,2,3,5,blue,XL


In [28]:
# One-hot encode the remaining text columns ('color', 'weight_sign').
# dtype=int keeps the indicator columns as 0/1 integers so the array below is
# an integer array; pandas versions whose dummies default to bool would
# otherwise give an object array when mixed with the integer columns.
edges = pd.get_dummies(edges, dtype=int)
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
edges.to_numpy()

  


array([[0, 2, 3, 0, 1, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1]], dtype=int64)

In [29]:
# Example from - https://chrisalbon.com/python/pandas_binning_data.html

# Three regiments of four soldiers each; every regiment has two companies of two.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'name': [
        'Miller', 'Jacobson', 'Ali', 'Milner',
        'Cooze', 'Jacon', 'Ryaner', 'Sone',
        'Sloan', 'Piger', 'Riani', 'Ali',
    ],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
}
# The dict's insertion order is already the desired column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [30]:
# Data binning
# Bin edges 0, 25, 50, 75, 100 give the intervals 0-25, 25-50, 50-75 and 75-100
bins = list(range(0, 101, 25))
# One label per interval, lowest to highest
group_names = ['Low', 'Okay', 'Good', 'Great']
categories = pd.cut(x=df['postTestScore'], bins=bins, labels=group_names)

In [32]:
# Store the bin label of each postTestScore as a new column
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
# Count the rows per label. The top-level pd.value_counts() is deprecated,
# so the Series method is used instead.
df['categories'].value_counts()

Good     8
Great    2
Low      2
Okay     0
Name: categories, dtype: int64

In [33]:
# Feature scaling
# min-max: rescale so that the minimum and maximum map to 0 and 1
# standardization: rescale the variable to zero mean and unit variance
# from sklearn import preprocessing
# std_scale = preprocessing.StandardScaler().fit(df[['Alcohol','Malic acid']])
# df_std = std_scale.transform(df[['Alcohol','Malic acid']])
# df_std[:5]
# Every preprocessing step follows the fit > transform sequence
# fit is the step that derives the rule from the data
# transform is the step that applies that rule