In [2]:
import pandas as pd
import numpy as np

In [2]:
import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""

In [3]:
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()

In [5]:
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[]

In [6]:
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]
data

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [7]:
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)" # 자료가 여러건 일때 하나씩 들어가게 됨
con.executemany(stmt, data)
con.commit()

In [8]:
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [9]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [10]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [11]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [12]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [13]:
float_data = pd.Series([1, 2, None], dtype='float64')
float_data
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [14]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [19]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [20]:
data[4] = np.nan

In [21]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [26]:
data.dropna()
data.dropna(axis=0)
data.dropna(axis=1) # columns
data.dropna(axis='columns')
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [27]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,0.698814,1.221532,1.464723
1,-0.661032,-1.410669,0.311626
2,-0.059065,-0.455844,-0.983683
3,0.45815,-1.414456,0.405722
4,0.5008,0.401725,-0.107593
5,-0.144579,0.562442,1.580392
6,-3.11694,1.218833,0.547644


In [29]:
df.iloc[:, 1]
df.iloc[:, 1]=np.nan

In [30]:
df

Unnamed: 0,0,1,2
0,0.698814,,1.464723
1,-0.661032,,0.311626
2,-0.059065,,-0.983683
3,0.45815,,0.405722
4,0.5008,,-0.107593
5,-0.144579,,1.580392
6,-3.11694,,0.547644


In [31]:
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.698814,,
1,-0.661032,,
2,-0.059065,,-0.983683
3,0.45815,,0.405722
4,0.5008,,-0.107593
5,-0.144579,,1.580392
6,-3.11694,,0.547644


In [34]:
df.dropna(thresh=2) # Nan이 2개 이상이면 제거

Unnamed: 0,0,1,2
2,-0.059065,,-0.983683
3,0.45815,,0.405722
4,0.5008,,-0.107593
5,-0.144579,,1.580392
6,-3.11694,,0.547644


In [35]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.698814,0.0,0.0
1,-0.661032,0.0,0.0
2,-0.059065,0.0,-0.983683
3,0.45815,0.0,0.405722
4,0.5008,0.0,-0.107593
5,-0.144579,0.0,1.580392
6,-3.11694,0.0,0.547644


In [37]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.698814,0.5,0.0
1,-0.661032,0.5,0.0
2,-0.059065,0.5,-0.983683
3,0.45815,0.5,0.405722
4,0.5008,0.5,-0.107593
5,-0.144579,0.5,1.580392
6,-3.11694,0.5,0.547644


In [38]:
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.758734,-1.887171,-0.319854
1,1.233197,-0.1448,-0.483029
2,-1.445837,,-0.53739
3,-0.629263,,-1.274791
4,0.75399,,
5,-0.201751,,


In [39]:
df.fillna(method="ffill") # 이전에 발생한 값으로 대체

Unnamed: 0,0,1,2
0,0.758734,-1.887171,-0.319854
1,1.233197,-0.1448,-0.483029
2,-1.445837,-0.1448,-0.53739
3,-0.629263,-0.1448,-1.274791
4,0.75399,-0.1448,-1.274791
5,-0.201751,-0.1448,-1.274791


In [40]:
# df.fillna(method="bfill") # 밑에 있는걸로 대체

Unnamed: 0,0,1,2
0,0.758734,-1.887171,-0.319854
1,1.233197,-0.1448,-0.483029
2,-1.445837,,-0.53739
3,-0.629263,,-1.274791
4,0.75399,,
5,-0.201751,,


In [41]:
df.fillna(method="ffill", limit=2)

Unnamed: 0,0,1,2
0,0.758734,-1.887171,-0.319854
1,1.233197,-0.1448,-0.483029
2,-1.445837,-0.1448,-0.53739
3,-0.629263,-0.1448,-1.274791
4,0.75399,,-1.274791
5,-0.201751,,-1.274791


In [43]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [44]:
data.mean()

3.8333333333333335

In [47]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [48]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [49]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [50]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [51]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [52]:
data.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [53]:
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [54]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [56]:
data.drop_duplicates(subset=["k1", "k2"]) # data.drop_duplicates(["k1","k2"]) 

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [57]:
data.drop_duplicates(["k1","k2"], keep='last') # keep='first'는 반대

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [58]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [59]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                              "pastrami", "corned beef", "bacon",
                              "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [65]:
data['animal'] = data['food'].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [66]:
def get_animal(x):
    return meat_to_animal[x]
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [67]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [69]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [70]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [71]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [72]:
data.replace({-999: np.nan, -1000: 0}) # 딕셔너리 형식으로도 동일

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [73]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [78]:
'test'.upper()
data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

In [84]:
def trans(x):
    return x[:4].upper() # + x[4:]
data.index = data.index.map(trans)

In [85]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [87]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [88]:
# 연속형 : 연속된 숫자로 구성된 데이터
# 범주형 : 클래스로 나뉘어 지는 데이터

In [90]:
"""
변수 : 연속변수, 범주형 변수
연속형 변수 : 연속적인 값(나이, 점수, 몸무게, ...)
범주형 변수 : 이산적인 값(서로 다른 것으로 구분할 수 있는 변수,
                          성별, 혈액형, 학점(a,b,c 같은) ...)
"""

'\n변수 : 연속변수, 범주형 변수\n연속형 변수 : 연속적인 값(나이, 점수, 몸무게, ...)\n범주형 변수 : 이산적인 값(서로 다른 것으로 구분할 수 있는 변수,\n                          성별, 혈액형, 학점(a,b,c 같은) ...)\n'

In [91]:
bins = [18, 25, 35, 60, 100]

In [93]:
age_categories = pd.cut(ages, bins)
age_categories # ( ] : 개구간(초과), 폐구간 (이하) (18, 25] 18초과 25이하 interval[right]면 오른쪽이 포함

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [98]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [99]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [100]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [101]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [102]:
pd.cut(ages, bins)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [103]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
group_names

['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [104]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [105]:
data = np.random.uniform(size=20)
data

array([0.51381832, 0.74702532, 0.55295473, 0.59574323, 0.57431039,
       0.76764562, 0.54218521, 0.04446264, 0.87666132, 0.85866182,
       0.41699067, 0.70030655, 0.24218728, 0.33477279, 0.87571081,
       0.25623733, 0.04310376, 0.38959704, 0.05368302, 0.76767158])

In [107]:
data_cate = pd.cut(data, 4)
data_cate

[(0.46, 0.668], (0.668, 0.877], (0.46, 0.668], (0.46, 0.668], (0.46, 0.668], ..., (0.251, 0.46], (0.0423, 0.251], (0.251, 0.46], (0.0423, 0.251], (0.668, 0.877]]
Length: 20
Categories (4, interval[float64, right]): [(0.0423, 0.251] < (0.251, 0.46] < (0.46, 0.668] < (0.668, 0.877]]

In [108]:
data_cate.codes

array([2, 3, 2, 2, 2, 3, 2, 0, 3, 3, 1, 3, 0, 1, 3, 1, 0, 1, 0, 3],
      dtype=int8)

In [110]:
# cut은 구간 경계선을 지정하여 나눔
# qcut은 각 구간에 속한 데이터의 개수가 동일하게 지정하여 나눔
pd.value_counts(data_cate.codes)

3    7
2    5
0    4
1    4
dtype: int64

In [112]:
data_cateq=pd.qcut(data,4)
data_cateq

[(0.315, 0.548], (0.548, 0.752], (0.548, 0.752], (0.548, 0.752], (0.548, 0.752], ..., (0.0421, 0.315], (0.0421, 0.315], (0.315, 0.548], (0.0421, 0.315], (0.752, 0.877]]
Length: 20
Categories (4, interval[float64, right]): [(0.0421, 0.315] < (0.315, 0.548] < (0.548, 0.752] < (0.752, 0.877]]

In [114]:
pd.value_counts(data_cateq.codes)

1    5
2    5
3    5
0    5
dtype: int64

In [115]:
data_cateq=pd.qcut(data,4, labels=['q1', 'q2', 'q3', 'q4'])
data_cateq

['q2', 'q3', 'q3', 'q3', 'q3', ..., 'q1', 'q1', 'q2', 'q1', 'q4']
Length: 20
Categories (4, object): ['q1' < 'q2' < 'q3' < 'q4']

In [117]:
# pd.value_counts(data_cateq.codes)
pd.value_counts(data_cateq)

q1    5
q2    5
q3    5
q4    5
dtype: int64

In [118]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.027061,-0.01103,-0.037769,0.025088
std,0.977816,1.007019,0.992458,1.035058
min,-3.554478,-2.895638,-3.28002,-3.335491
25%,-0.666112,-0.6826,-0.704263,-0.636967
50%,-0.064819,-0.027517,-0.032044,0.018608
75%,0.616444,0.640009,0.574898,0.721516
max,3.331509,4.259046,4.315506,3.190668


In [121]:
data[data[2].abs()>3]

Unnamed: 0,0,1,2,3
207,-0.250071,2.193934,-3.28002,-0.1305
491,-0.591486,-0.451518,4.315506,-1.052104
907,1.135878,1.232679,-3.115891,-0.132759


In [124]:
np.sign(data)

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,1.0,1.0,-1.0,1.0
2,1.0,-1.0,1.0,-1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,-1.0,1.0,1.0
...,...,...,...,...
995,-1.0,-1.0,1.0,-1.0
996,-1.0,1.0,1.0,1.0
997,-1.0,1.0,1.0,1.0
998,-1.0,1.0,-1.0,1.0


In [125]:
np.sign(data)*3

Unnamed: 0,0,1,2,3
0,-3.0,3.0,-3.0,-3.0
1,3.0,3.0,-3.0,3.0
2,3.0,-3.0,3.0,-3.0
3,3.0,-3.0,3.0,-3.0
4,3.0,-3.0,3.0,3.0
...,...,...,...,...
995,-3.0,-3.0,3.0,-3.0
996,-3.0,3.0,3.0,3.0
997,-3.0,3.0,3.0,3.0
998,-3.0,3.0,-3.0,3.0


In [127]:
data[data.abs()>3] = np.sign(data)*3
# data에 담긴 값의 절대값이 3보다 큰 경우에는 모두 3또는 -3으로 대체

In [128]:
data

Unnamed: 0,0,1,2,3
0,-0.431819,0.640109,-1.015492,-0.102069
1,0.205754,1.259136,-0.340301,0.316923
2,0.034530,-2.122306,0.543841,-1.895645
3,0.235190,-0.427374,1.298713,-0.657946
4,0.037414,-0.630636,1.057395,2.265833
...,...,...,...,...
995,-0.453239,-1.172489,0.697012,-0.711524
996,-0.491602,0.467300,0.210278,0.403156
997,-0.693923,1.949160,1.459147,0.086566
998,-0.277157,0.653810,-1.086419,0.009549


In [129]:
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [132]:
sampler = np.random.permutation(5)
sampler

array([3, 4, 0, 2, 1])

In [133]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13


In [138]:
df.take(sampler)
# take는 데이터를 행 또는 열 단위로 추출할 때 빠르게 수행, 멀티 인덱스는 지원하지 않는다.
df.take(sampler, axis=0)
# df.take(sampler, axis=1)

Unnamed: 0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13


In [139]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [140]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20


In [141]:
choices = pd.Series([5, 7, -1, 6, 4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [142]:
choices.sample(n=3)

2   -1
4    4
3    6
dtype: int64

In [143]:
choices.sample(n=5) # 비복원추출

2   -1
0    5
3    6
1    7
4    4
dtype: int64

In [145]:
choices.sample(n=10, replace=True) # 복원추출

0    5
2   -1
3    6
2   -1
1    7
2   -1
2   -1
3    6
1    7
1    7
dtype: int64

In [156]:
# 이번주 로또번호 5셋 출력
lotto = pd.Series(np.arange(1,46))
for _ in range(5):
    print(lotto.sample(n=6).values)

[ 8 38 35 25 11 39]
[15 36 43 23 17 35]
[26  6 41 18  5 24]
[ 9  4 12 24 32 35]
[24 21 12 14  4 38]


In [158]:
df = pd.DataFrame(np.arange(30).reshape(5, 6))
for i in range(5):
    choices = pd.Series(np.arange(1, 46))
    sample = choices.sample(n=6)
    df.iloc[i, :] = sample
df

Unnamed: 0,0,1,2,3,4,5
0,45,11,5,22,40,7
1,19,30,42,41,14,29
2,24,34,40,3,32,27
3,22,23,10,24,20,18
4,35,10,8,32,26,39


In [159]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                   "data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [160]:
# 범주형 문자 데이터 -> 수치 변환(범주)
df['key']

0    b
1    b
2    a
3    c
4    a
5    b
Name: key, dtype: object

In [163]:
# get_dummies : 범주형 문자 데이터 -> 수치변환(원 핫 인코딩)
pd.get_dummies(df['key'])
pd.get_dummies(df['key'], dtype='int')

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [164]:
pd.get_dummies(df['key'], dtype='int', prefix='key')

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [165]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [166]:
dummies=pd.get_dummies(df['key'], dtype='int', prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [174]:
df[['data1']].join(dummies)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [175]:
dummies.join(df[['data1']])

Unnamed: 0,key_a,key_b,key_c,data1
0,0,1,0,0
1,0,1,0,1
2,1,0,0,2
3,0,0,1,3
4,1,0,0,4
5,0,1,0,5


In [183]:
mnames = ["movie_id", "title", "genres"]
# pd.read_csv('data/datasets/movielens/movies.dat', names=mnames, sep='::', engine='python', header=None)
movies = pd.read_table('data/datasets/movielens/movies.dat', names=mnames, sep='::', engine='python', header=None)

# read_csv : 기본 컴마로 구분되도록 정의
# read_table : 텍스트 파일 읽을 때 사용, sep 반드시 있어야 함

In [184]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [197]:
# movies['genres'].split(sep='|') 에러
movies['genres'].str.split('|') # 시리즈 내에 저장된 문자열 -> 문자열 함수
# def ch(x):
#     return x.split('|')
# movies.loc[:, 'genres'].apply(ch)


0        [Animation, Children's, Comedy]
1       [Adventure, Children's, Fantasy]
2                      [Comedy, Romance]
3                        [Comedy, Drama]
4                               [Comedy]
                      ...               
3878                            [Comedy]
3879                             [Drama]
3880                             [Drama]
3881                             [Drama]
3882                   [Drama, Thriller]
Name: genres, Length: 3883, dtype: object

In [199]:
'Animation'.lower()

'animation'

In [203]:
movies['genres'].str.lower() # 소문자로 변환

0        animation|children's|comedy
1       adventure|children's|fantasy
2                     comedy|romance
3                       comedy|drama
4                             comedy
                    ...             
3878                          comedy
3879                           drama
3880                           drama
3881                           drama
3882                  drama|thriller
Name: genres, Length: 3883, dtype: object

In [207]:
#movies['genres'].str.split("|")
movies['genres'].str.get_dummies("|")

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [208]:
dummies = movies['genres'].str.get_dummies("|")
dummies

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [209]:
dummies.iloc[:10, :6]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


In [210]:
dummies.add_prefix("Genre_")

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [211]:
movies.join(dummies.add_prefix("Genre_"))

Unnamed: 0,movie_id,title,genres,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Crime,Genre_Documentary,...,Genre_Fantasy,Genre_Film-Noir,Genre_Horror,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [212]:
np.random.seed(12345)
values = np.random.uniform(size=10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [213]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [214]:
pd.cut(values, bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [216]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [219]:
s = pd.Series([1, 2, 3, None])
s
# s.dtype

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [218]:
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [220]:
df = pd.DataFrame({"A": [1, 2, None, 4],
                   "B": ["one", "two", "three", None],
                   "C": [False, None, False, True]})
df
# None : 숫자들과 섞여잇으면 NaN, 문자나 bool과 함께면 None

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [221]:
df.dtypes

A    float64
B     object
C     object
dtype: object

In [222]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      object 
 2   C       3 non-null      object 
dtypes: float64(1), object(2)
memory usage: 224.0+ bytes


In [229]:
df['A'] = df['A'].astype('Int64')
df['C'].astype('boolean')

0    False
1     <NA>
2    False
3     True
Name: C, dtype: boolean

In [230]:
val = "a,b,  guido"
val.split(",")

['a', 'b', '  guido']

In [231]:
pieces = [x.strip() for x in val.split(",")]
pieces

['a', 'b', 'guido']

In [232]:
first, second, third = pieces

In [233]:
first + "::" + second + "::" + third

'a::b::guido'

In [234]:
"::".join(pieces)

'a::b::guido'

In [238]:
df
df.count()

A    3
B    3
C    3
dtype: int64

In [287]:
import seaborn as sns
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [288]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [289]:
# 1.타이타닉호 승객 데이터의 데이터 개수를 각 열마다 구해본다.
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [290]:
# 2.sort_values 메서드를 사용하여 타이타닉호 승객에 대해 성별(sex) 인원수, 나이별(age) 인원수, 선실별(class) 인원수, 사망/생존 인원수를 출력하시오.
titanic.sex.value_counts()
titanic.age.value_counts()
titanic['class'].value_counts()
titanic.alive.value_counts()

no     549
yes    342
Name: alive, dtype: int64

In [291]:
# 3.타이타닉호 승객의 평균 나이를 구하라.
titanic['age'].mean()
# round(titanic.age.mean(), 1)

29.69911764705882

In [292]:
# 4.타이타닉호 승객중 여성 승객의 평균 나이를 구하라.
# fage = titanic[titanic['sex']=='female']
# fage['age'].mean()
round(titanic[titanic.sex='female']['age'].mean(),1)

27.915708812260537

In [267]:
# 5.타이타닉호 승객중 1등실 선실의 여성 승객의 평균 나이를 구하라.
fage2 = fage[fage['class']=='First']
fage2['age'].mean()

round(titanic[(titanic.sex=="female") & (titanic.pclass==1)]['age'].mean(),1)

34.61176470588235

In [293]:
# 6.타이타닉호의 승객에 대해 나이와 성별에 의한 카테고리 열인 category1 열을 만들어라. category1 카테고리는 다음과 같이 정의된다.
# - 20살이 넘으면 성별을 그대로 사용한다.
# - 20살 미만이면 성별에 관계없이 “child”라고 한다.
def d(x):
    if x['age'] < 20:
        return 'child'
    else:
        return x['sex']
titanic['category1'] = titanic.apply(d, axis=1)

In [None]:
titanic.apply(lambda x: x.sex if x.age >= 20 else 'child', axis=1)

In [294]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,category1
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,male
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,female
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,female
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,female
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,male
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,child
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,female
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,male


In [295]:
# 7.타이타닉호의 승객 중 나이를 명시하지 않은 고객은 나이를 명시한 고객의 평균 나이 값이 되도록 titanic 데이터프레임을 고쳐라.
titanic['age'] = titanic.age.fillna(titanic.age.mean())
titanic
# titanic.fillna({'age':titanic.age.mean()}, inplace=True)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,category1
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,,Southampton,no,False,male
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,female
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,female
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,female
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,,Southampton,no,True,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,,Southampton,no,True,male
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,child
888,0,3,female,29.699118,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,female
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,male


In [None]:
# titanic.age.isnull().sum()
# len(titanic.age)
titanic.age[titanic.age.isnull()] = titanic.age.mean()

In [3]:
col  = ['col1','col2','col3','col4','col5']
row  = ['row1','row2','row3','row4','row5']
na = np.nan
data = [[na, 2,na, 4,na],
        [ 6, 7,na, 9,na],
        [11,na,na,14,15],
        [na,17,na,na,20],
        [na,22,na,na,25]]
df = pd.DataFrame(data,row,col)
df

Unnamed: 0,col1,col2,col3,col4,col5
row1,,2.0,,4.0,
row2,6.0,7.0,,9.0,
row3,11.0,,,14.0,15.0
row4,,17.0,,,20.0
row5,,22.0,,,25.0


In [6]:
# df.fillna('a')
df['col1'].fillna('a')
df.fillna({'col1':'a', 'col3':'b'})

Unnamed: 0,col1,col2,col3,col4,col5
row1,a,2.0,b,4.0,
row2,6.0,7.0,b,9.0,
row3,11.0,,b,14.0,15.0
row4,a,17.0,b,,20.0
row5,a,22.0,b,,25.0


In [298]:
# 8. 타이타닉호의 승객에 대해 나이와 성별에 의한 카테고리 열인 category2 열을 만들어라. category2 카테고리는 다음과 같이 정의된다.
# - 성별을 나타내는 문자열 male 또는 female로 시작한다.
# - 성별을 나타내는 문자열 뒤에 나이를 나타내는 문자열이 온다.
# *예를 들어 27살 남성은 male27 값이 된다.
def d(x):
    return str(x.sex) + str(x['age'])
titanic.apply(d, axis=1)

0                     male22.0
1                   female38.0
2                   female26.0
3                   female35.0
4                     male35.0
                ...           
886                   male27.0
887                 female19.0
888    female29.69911764705882
889                   male26.0
890                   male32.0
Length: 891, dtype: object

In [None]:
titanic['category2'] = titanic.sex + titanic.age.astype('int').astype('str')

In [301]:
# 9.타이타닉호 승객을 ‘미성년자’, ‘청년’, ‘중년’, ‘장년’, ‘노년’ 나이 그룹으로 나눈다.
bins = [1, 20, 30, 50, 70, 100]
labels = ["미성년자", "청년", "중년", "장년", "노년"]
# 그리고 각 나이 그룹의 승객 비율을 구한다. 비율의 전체 합은 1이 되어야 한다.
age_categories = pd.cut(titanic.age, bins, labels=labels)
titanic['age_group'] = age_categories

In [302]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,category1,age_group
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,,Southampton,no,False,male,청년
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,female,중년
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,female,청년
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,female,중년
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,,Southampton,no,True,male,중년
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,,Southampton,no,True,male,청년
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,child,미성년자
888,0,3,female,29.699118,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,female,청년
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,male,청년


In [None]:
bins = [1, 20, 30, 50, 70, 100]
labels = ["미성년자", "청년", "중년", "장년", "노년"]
titanic['연령대'] = pd.cut(age, bins, labels=labels
titanic.연령대.value_counts()

In [None]:
pd.DataFrame(titanic.연령대.value_counts())

In [305]:
# 10. 타이타닉호의 승객에 대해 나이와 성별에 의한 카테고리 열인 category3 열을 만들어라. category3 카테고리는 다음과 같이 정의된다.
# - 20살 미만이면 성별에 관계없이 “미성년자”라고 한다.
# - 20살 이상이면 나이에 따라 “청년”, “중년”, “장년”, “노년”을 구분하고 그 뒤에 성별을 나타내는 “남성”, “여성”을 붙인다.
def d(x):
    if x.age < 20:
        return x.age_group
    else:
        return x.age_group + x.sex
titanic['category3'] = titanic.apply(d, axis=1)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,category1,age_group,category3
0,0,3,male,22.000000,1,0,7.2500,S,Third,man,True,,Southampton,no,False,male,청년,청년male
1,1,1,female,38.000000,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,female,중년,중년female
2,1,3,female,26.000000,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,female,청년,청년female
3,1,1,female,35.000000,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,female,중년,중년female
4,0,3,male,35.000000,0,0,8.0500,S,Third,man,True,,Southampton,no,True,male,중년,중년male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,Second,man,True,,Southampton,no,True,male,청년,청년male
887,1,1,female,19.000000,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,child,미성년자,미성년자
888,0,3,female,29.699118,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,female,청년,청년female
889,1,1,male,26.000000,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,male,청년,청년male


In [None]:
bins = [0,20,30,50,70,100]
labels = ["미성년자", "청년", "중년", "장년", "노년"]

In [None]:
titanic['age_label'] = pd.cut(titanic.age, bins, labels).astype(str)
titanic

In [None]:
titanic['성별']=titanic.apply(lambda x:'남성' if x.sex=='male' else '여성',axis=1)

In [None]:
titanic['category3_1']=titanic.apply(lambda x: '미성년자' if x.age <20 else x.age_label+x.성별,axis=1)