In [1]:
import pandas as pd
from pandas import Series
from pandas import DataFrame

import numpy as np

### Built-in functions

In [2]:
# Read the wages dataset from the CSV file in the working directory.
df = pd.read_csv("./wages.csv")
# Peek at the first two records, transposed so that each field is shown on its own row.
df.iloc[:2].transpose()

Unnamed: 0,0,1
earn,79571.299011,96396.988643
height,73.89,66.23
sex,male,female
race,white,white
ed,16,16
age,49,62


In [4]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

earn      float64
height    float64
sex        object
race       object
ed          int64
age         int64
dtype: object

# describe

- `Numeric type` 데이터의 요약 정보를 보여줌

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,earn,height,ed,age
count,1379.0,1379.0,1379.0,1379.0
mean,32446.292622,66.59264,13.354605,45.328499
std,31257.070006,3.818108,2.438741,15.789715
min,-98.580489,57.34,3.0,22.0
25%,10538.790721,63.72,12.0,33.0
50%,26877.870178,66.05,13.0,42.0
75%,44506.215336,69.315,15.0,55.0
max,317949.127955,77.21,18.0,95.0


# unique

- series data의 유일한 값들을 배열(numpy ndarray)로 반환

In [6]:
# Distinct race labels found in the column.
df["race"].unique()

array(['white', 'other', 'hispanic', 'black'], dtype=object)

In [7]:
# Pair every distinct race label with an integer code (its position among the
# distinct labels) and preview the column with labels swapped for codes.
# The result is only displayed; df itself is not modified here.
key = df["race"].unique()
value = range(key.size)
df["race"].replace(to_replace=key, value=value)

0       0
1       0
2       0
3       1
4       0
       ..
1374    0
1375    0
1376    0
1377    0
1378    0
Name: race, Length: 1379, dtype: int64

In [9]:
# Lookup table from integer code to race label, with labels in alphabetical order.
{code: label for code, label in enumerate(sorted(df["race"].unique()))}

{0: 'black', 1: 'hispanic', 2: 'other', 3: 'white'}

In [10]:
# Extract the label strings and their integer codes as two parallel lists,
# so that string labels can be converted to index values afterwards.
# The distinct labels are computed once; each code is the label's position in that list.
key = df["race"].unique().astype(str).tolist()
value = list(range(len(key)))

value, key

([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])

In [None]:
# Convert the race labels to their integer codes and store the result explicitly.
# Assigning back is used instead of df["race"].replace(..., inplace=True): that chained
# in-place call acts on an intermediate Series, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
df["race"] = df["race"].replace(to_replace=key, value=value)

In [None]:
# Inspect the race column.
df["race"]

In [None]:
# Extract the sex labels and their integer codes as two parallel lists.
# The distinct labels are computed once; each code is the label's position in that list.
key = df["sex"].unique().astype(str).tolist()
value = list(range(len(key)))

value, key

In [None]:
# Preview the first rows with the sex labels replaced by their integer codes.
# The encoding is applied to a temporary copy (DataFrame.assign) instead of calling
# df["sex"].replace(..., inplace=True): the chained in-place call acts on an
# intermediate Series, and df["sex"] has to keep its "male"/"female" labels because
# later cells (the sex_code mapping and the value_counts calls) read them.
df.assign(sex=df["sex"].replace(to_replace=key, value=value)).head(5)

In [11]:
# Display the DataFrame (long frames are rendered in truncated form).
df

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43
...,...,...,...,...,...,...
1374,30173.380363,71.68,male,white,12,33
1375,24853.519514,61.31,female,white,18,86
1376,13710.671312,63.64,female,white,12,37
1377,95426.014410,71.65,male,white,12,54


# sum

- 기본적인 column 또는 row 값의 연산을 지원
- sub, mean, min, max, count, median, mad, var 등 (단, `mad`는 pandas 2.0부터 제거되어 사용할 수 없음)

In [13]:
# Analyses often work on several columns at once, so the column names are kept
# in a list and used to select all of them in one step.
# NOTE: the variable name is misspelled ("numueric"), but a later cell refers to it by this name.
numueric_cols = ["earn", "height", "ed", "age"]
df.loc[:, numueric_cols]

Unnamed: 0,earn,height,ed,age
0,79571.299011,73.89,16,49
1,96396.988643,66.23,16,62
2,48710.666947,63.77,16,33
3,80478.096153,63.22,16,95
4,82089.345498,63.08,17,43
...,...,...,...,...
1374,30173.380363,71.68,12,33
1375,24853.519514,61.31,18,86
1376,13710.671312,63.64,12,37
1377,95426.014410,71.65,12,54


In [15]:
# Row-wise total across the selected numeric columns.
df.loc[:, numueric_cols].sum(axis="columns")

0       79710.189011
1       96541.218643
2       48823.436947
3       80652.316153
4       82212.425498
            ...     
1374    30290.060363
1375    25018.829514
1376    13823.311312
1377    95563.664410
1378     9686.681857
Length: 1379, dtype: float64

In [37]:
# Sum down each column (one result per column).
df.sum(axis="index")

earn                                            44743437.5254
height                                               91831.25
sex         malefemalefemalefemalefemalefemalefemalemalema...
race        whitewhitewhiteotherwhitewhitewhitewhitehispan...
ed                                                      18416
age                                                     62508
sex_code                                                  520
dtype: object

In [16]:
# Sum across each row (one result per row).
# numeric_only=True restricts the sum to numeric columns: df also holds string
# columns such as sex, and adding strings to floats within a row raises a
# TypeError on pandas >= 2.0, where non-numeric columns are no longer dropped silently.
df.sum(axis=1, numeric_only=True)

0       79710.189011
1       96541.218643
2       48823.436947
3       80652.316153
4       82212.425498
            ...     
1374    30290.060363
1375    25018.829514
1376    13823.311312
1377    95563.664410
1378     9686.681857
Length: 1379, dtype: float64

# isnull

- 각 값이 NaN (null)인지 여부를 원본과 같은 모양의 boolean 값(True/False)으로 반환함

In [17]:
# Boolean frame of the same shape as df: True where a value is missing, False otherwise.
df.isnull()

Unnamed: 0,earn,height,sex,race,ed,age
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1374,False,False,False,False,False,False
1375,False,False,False,False,False,False
1376,False,False,False,False,False,False
1377,False,False,False,False,False,False


In [18]:
# Fraction of missing values per column: count the missing-value flags in each
# column and divide by the number of rows, which shows how much of the data is empty.
df.isnull().sum().div(df.shape[0])

earn      0.0
height    0.0
sex       0.0
race      0.0
ed        0.0
age       0.0
dtype: float64

In [19]:
# Control how many rows pandas renders at once when displaying a frame.
pd.set_option("display.max_rows", 100)

In [20]:
# Display the DataFrame again to see the effect of the display option set above.
df

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43
...,...,...,...,...,...,...
1374,30173.380363,71.68,male,white,12,33
1375,24853.519514,61.31,female,white,18,86
1376,13710.671312,63.64,female,white,12,37
1377,95426.014410,71.65,male,white,12,54


# sort_values

- column 값을 기준으로 데이터를 sorting

In [21]:
# Order rows by age, breaking ties by earn; ascending=True sorts from smallest to largest.
df.sort_values(by=["age", "earn"], ascending=True)

Unnamed: 0,earn,height,sex,race,ed,age
1038,-56.321979,67.81,male,hispanic,10,22
800,-27.876819,72.29,male,white,12,22
963,-25.655260,68.90,male,white,12,22
1105,988.565070,64.71,female,white,12,22
801,1000.221504,64.09,female,white,12,22
...,...,...,...,...,...,...
993,32809.632677,59.61,female,other,16,92
102,39751.194030,67.14,male,white,12,93
331,39169.750135,64.79,female,white,12,95
809,42963.362005,72.94,male,white,12,95


In [22]:
# Sort by age in descending order and keep the first ten rows.
df.sort_values(by="age", ascending=False).iloc[:10]

Unnamed: 0,earn,height,sex,race,ed,age
3,80478.096153,63.22,female,other,16,95
331,39169.750135,64.79,female,white,12,95
809,42963.362005,72.94,male,white,12,95
102,39751.19403,67.14,male,white,12,93
993,32809.632677,59.61,female,other,16,92
1017,8942.806716,62.97,female,white,10,91
1192,39757.94721,64.79,male,white,16,90
952,8162.682672,58.09,female,white,5,89
827,55712.348432,70.13,male,white,9,88
1068,10861.092284,64.03,female,white,13,87


# Correlation & Covariance

- 상관계수와 공분산을 구하는 함수
- corr, cov, corrwith

In [24]:
# Correlation between age and income:
# df["age"] is the base column and df["earn"] is the column it is compared against.
df["age"].corr(df["earn"])

0.07400349177836056

In [26]:
# Boolean index: True for rows whose age is below 45 and above 15.
df["age"].lt(45) & df["age"].gt(15)

0       False
1       False
2        True
3       False
4        True
        ...  
1374     True
1375    False
1376     True
1377    False
1378     True
Name: age, Length: 1379, dtype: bool

In [27]:
# Correlation between age and income restricted to rows with 15 < age < 45.
# Only the age Series is filtered; corr pairs it with earn by index label.
df.loc[df["age"].lt(45) & df["age"].gt(15), "age"].corr(df["earn"])

0.31411788725189044

In [44]:
# Covariance between age and income.
df["age"].cov(df["earn"])

36523.6992104089

In [28]:
# Label-encode sex into a numeric column: male -> 1, female -> 0.
# Series.map is used instead of replace so the result gets a numeric dtype directly
# (recent pandas versions deprecate replace's implicit object -> int downcast).
# Any label missing from the mapping becomes NaN instead of passing through as a string.
df["sex_code"] = df["sex"].map({"male": 1, "female": 0})

In [30]:
# Show the pairwise correlations between all numeric columns at once.
# Because sex was encoded as the numeric sex_code column, it appears in the result.
# numeric_only=True skips non-numeric columns such as sex; without it,
# pandas >= 2.0 raises when it tries to convert string columns to float.
df.corr(numeric_only=True)

Unnamed: 0,earn,height,ed,age,sex_code
earn,1.0,0.2916,0.350374,0.074003,0.337328
height,0.2916,1.0,0.114047,-0.133727,0.703672
ed,0.350374,0.114047,1.0,-0.129802,0.061747
age,0.074003,-0.133727,-0.129802,1.0,-0.070036
sex_code,0.337328,0.703672,0.061747,-0.070036,1.0


In [31]:
# Correlation of every numeric column with earn.
# numeric_only=True skips non-numeric columns such as sex, which would otherwise
# make corrwith fail on pandas >= 2.0.
df.corrwith(df["earn"], numeric_only=True)

earn        1.000000
height      0.291600
ed          0.350374
age         0.074003
sex_code    0.337328
dtype: float64

In [33]:
# Number of occurrences of each distinct value (useful for object-typed columns).
df["sex"].value_counts(sort=True)

female    859
male      520
Name: sex, dtype: int64

In [35]:
# Dividing the counts by the number of rows gives each value's share of the data.
df["sex"].value_counts(sort=True).div(len(df))

female    0.622915
male      0.377085
Name: sex, dtype: float64