In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
# Build an integer Series holding 0 through 9, then preview its first five rows.
s1 = pd.Series(data=np.arange(0, 10))
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [3]:
# Series.map with a callable is applied to each element: square every value
# and preview the first five results.
s1.map(lambda value: value ** 2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [6]:
# Series.map with a dict looks each value up as a key and substitutes the
# mapped value; values that have no matching key come back as NaN.
z = {
    1: 'A',
    2: 'B',
    3: 'C',
}
s1.map(z)

0    NaN
1      A
2      B
3      C
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [7]:
# Series.map with another Series uses it as a lookup table: each value of s1
# is matched against the index labels of s2 and replaced by the s2 entry found
# there (here 0 -> 10, 1 -> 11, ...).
s2 = pd.Series(data=np.arange(10, 20))
s1.map(s2)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

### replace function

In [None]:
# Recode the "sex" column with an {old: new} dict and preview the result.
# Nothing is assigned back, so df itself is left as it was.
# NOTE(review): no visible cell above creates a df with a "sex" column —
# confirm where it is loaded before running this cell.
df["sex"].replace({"male": 0, "female": 1}).head()

In [None]:
# Same recoding written with parallel lists (old values, new values), stored
# back into the frame. The result is assigned to the column instead of calling
# replace(..., inplace=True) on `df.sex`: the in-place form mutates a Series
# that was pulled out of the frame, which pandas treats as chained assignment,
# and with Copy-on-Write enabled the parent DataFrame is not updated at all.
# NOTE(review): assumes df already has a "sex" column holding "male"/"female" —
# confirm where it is loaded.
df["sex"] = df["sex"].replace(["male", "female"], [0, 1])

In [None]:
def change_sex(x):
    """Encode a sex label as an integer.

    Returns 0 when ``x`` equals "male" and 1 for every other value.
    """
    if x == "male":
        return 0
    return 1

# Apply change_sex to every element of the "sex" column; the mapped Series is
# displayed, not stored.
df["sex"].map(change_sex)

In [11]:
# A 5x5 demo frame filled row by row with the integers 0..24.
# Note that this rebinds the name df.
df = pd.DataFrame(data=np.arange(25).reshape((5, 5)))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


### apply
* map이 원소 하나하나에 함수를 적용하는 것과 달리, apply는 Series 전체(DataFrame의 각 column)에 함수를 적용
* 함수의 입력으로 Series가 통째로 전달되므로 `max()`, `min()` 같은 Series 단위 연산을 함수 안에서 사용 가능

In [13]:
# Load the whitespace-delimited housing data. The file is read without a
# header row (header=None), so the column names are assigned afterwards.
# The separator is written as a raw string: in a plain literal "\s" is an
# invalid escape sequence, which newer Python versions warn about.
df = pd.read_csv("./housing.data", header=None, sep=r"\s+")
df.columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]

In [15]:
# Keep only the three columns used in the apply examples and preview them.
df_info = df.loc[:, ["INDUS", "TAX", "AGE"]]
df_info.head()

Unnamed: 0,INDUS,TAX,AGE
0,2.31,296.0,65.2
1,7.07,242.0,78.9
2,7.07,242.0,61.1
3,2.18,222.0,45.8
4,2.18,222.0,54.2


In [16]:
def f(x):
    """Return the range of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
# DataFrame.apply hands each column to f as a Series; f returns one number per
# column, so the result is a Series indexed by column name.
df_info.apply(f)

INDUS     27.28
TAX      524.00
AGE       97.10
dtype: float64

In [17]:
# Built-in reduction: the total of each column.
df_info.sum()

INDUS      5635.21
TAX      206568.00
AGE       34698.90
dtype: float64

In [18]:
# Built-in reduction: the mean of each column.
df_info.mean()

INDUS     11.136779
TAX      408.237154
AGE       68.574901
dtype: float64

In [19]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled "min"/"max"."""
    lowest, highest = x.min(), x.max()
    return Series(data=[lowest, highest], index=["min", "max"])
# Because f returns a Series, apply assembles the per-column results into a
# DataFrame whose rows are f's labels ("min", "max").
df_info.apply(f)

Unnamed: 0,INDUS,TAX,AGE
min,0.46,187.0,2.9
max,27.74,711.0,100.0


In [20]:
def f(x):
    """Return the arithmetic negation of ``x``."""
    return -x
# Negate every cell individually and preview the first five rows.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated, so use the new name when this pandas version provides it
# and fall back to applymap on older versions.
(df_info.map(f) if hasattr(DataFrame, "map") else df_info.applymap(f)).head(5)

Unnamed: 0,INDUS,TAX,AGE
0,-2.31,-296.0,-65.2
1,-7.07,-242.0,-78.9
2,-7.07,-242.0,-61.1
3,-2.18,-222.0,-45.8
4,-2.18,-222.0,-54.2


In [21]:
def f(x):
    """Return the arithmetic negation of ``x``."""
    return -x
# On a single column (a Series), apply calls f on each element; preview the
# first five negated AGE values.
df_info["AGE"].apply(f).head(5)

0   -65.2
1   -78.9
2   -61.1
3   -45.8
4   -54.2
Name: AGE, dtype: float64

### pandas built-in functions

In [22]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [25]:
# Return the distinct values of the Series. The result is a NumPy array (not a
# Python list), with values in order of first appearance.
df.AGE.unique()

array([ 65.2,  78.9,  61.1,  45.8,  54.2,  58.7,  66.6,  96.1, 100. ,
        85.9,  94.3,  82.9,  39. ,  61.8,  84.5,  56.5,  29.3,  81.7,
        36.6,  69.5,  98.1,  89.2,  91.7,  94.1,  85.7,  90.3,  88.8,
        94.4,  87.3,  82. ,  95. ,  96.9,  68.2,  61.4,  41.5,  30.2,
        21.8,  15.8,   2.9,   6.6,   6.5,  40. ,  33.8,  33.3,  85.5,
        95.3,  62. ,  45.7,  63. ,  21.1,  21.4,  47.6,  21.9,  35.7,
        40.5,  29.2,  47.2,  66.2,  93.4,  67.8,  43.4,  59.5,  17.8,
        31.1,  36.8,  33. ,  17.5,   7.8,   6.2,   6. ,  45. ,  74.5,
        53.7,  33.5,  70.4,  32.2,  46.7,  48. ,  56.1,  45.1,  56.8,
        86.3,  63.1,  66.1,  73.9,  53.6,  28.9,  77.3,  57.8,  69.6,
        76. ,  36.9,  62.5,  79.9,  71.3,  85.4,  87.4,  90. ,  96.7,
        91.9,  85.2,  97.1,  91.2,  54.4,  81.6,  92.9,  95.4,  84.2,
        88.2,  72.5,  82.6,  73.1,  69.7,  84.1,  97. ,  95.8,  88.4,
        95.6,  96. ,  98.8,  94.7,  98.9,  97.7,  97.9,  98.4,  98.2,
        93.5,  93.6,

In [26]:
# Number the unique AGE values with a dict of {position: value}.
# NOTE(review): wrapping the dict in np.array() yields a 0-d object array that
# merely holds the dict (the output renders as array({...})); drop the wrapper
# if a plain dict is what is wanted.
np.array(dict(enumerate(df["AGE"].unique())))

array({0: 65.2, 1: 78.9, 2: 61.1, 3: 45.8, 4: 54.2, 5: 58.7, 6: 66.6, 7: 96.1, 8: 100.0, 9: 85.9, 10: 94.3, 11: 82.9, 12: 39.0, 13: 61.8, 14: 84.5, 15: 56.5, 16: 29.3, 17: 81.7, 18: 36.6, 19: 69.5, 20: 98.1, 21: 89.2, 22: 91.7, 23: 94.1, 24: 85.7, 25: 90.3, 26: 88.8, 27: 94.4, 28: 87.3, 29: 82.0, 30: 95.0, 31: 96.9, 32: 68.2, 33: 61.4, 34: 41.5, 35: 30.2, 36: 21.8, 37: 15.8, 38: 2.9, 39: 6.6, 40: 6.5, 41: 40.0, 42: 33.8, 43: 33.3, 44: 85.5, 45: 95.3, 46: 62.0, 47: 45.7, 48: 63.0, 49: 21.1, 50: 21.4, 51: 47.6, 52: 21.9, 53: 35.7, 54: 40.5, 55: 29.2, 56: 47.2, 57: 66.2, 58: 93.4, 59: 67.8, 60: 43.4, 61: 59.5, 62: 17.8, 63: 31.1, 64: 36.8, 65: 33.0, 66: 17.5, 67: 7.8, 68: 6.2, 69: 6.0, 70: 45.0, 71: 74.5, 72: 53.7, 73: 33.5, 74: 70.4, 75: 32.2, 76: 46.7, 77: 48.0, 78: 56.1, 79: 45.1, 80: 56.8, 81: 86.3, 82: 63.1, 83: 66.1, 84: 73.9, 85: 53.6, 86: 28.9, 87: 77.3, 88: 57.8, 89: 69.6, 90: 76.0, 91: 36.9, 92: 62.5, 93: 79.9, 94: 71.3, 95: 85.4, 96: 87.4, 97: 90.0, 98: 96.7, 99: 91.9, 100: 85.