# pandas에는 대표적으로 두 가지 데이터 타입이 존재합니다.

1. Series
2. DataFrame

# Series 데이터 타입

In [2]:
import pandas as pd

In [8]:
# Build a list containing only the odd numbers below 10.
# list(range(...)) materialises the range directly; no comprehension is needed.
odd = list(range(1, 10, 2))
odd

[1, 3, 5, 7, 9]

In [9]:
# pd.Series(data) 시리즈 데이터로 변환
odd = pd.Series(odd)
print(type(odd))
odd

<class 'pandas.core.series.Series'>


0    1
1    3
2    5
3    7
4    9
dtype: int64

In [12]:
# Series 데이터 mean, describe 등 기본 통계치
odd.describe()

count    5.000000
mean     5.000000
std      3.162278
min      1.000000
25%      3.000000
50%      5.000000
75%      7.000000
max      9.000000
dtype: float64

In [14]:
odd.mean()

5.0

# DataFrame 데이터 타입

In [15]:
# DataFrame 생성, 2차원 리스트, 딕셔너리를 이용
number = [
    [1,2,3],
    [4,5,6],
    [7,8,9]
]

In [16]:
number = pd.DataFrame(number)
print(type(number))
number

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


# Series 와 DataFrame의 관계

In [24]:
type(number)

pandas.core.frame.DataFrame

In [25]:
type(number[0])

pandas.core.series.Series

DataFrame은 여러 개의 Series로 이루어져 있다. (각 컬럼이 하나의 Series이다.)

# Pandas Data Type

### NaN(Not a Number)

In [20]:
import numpy as np
import pandas as pd

# Build the Series directly; np.nan marks the one missing value.
odd = pd.Series([1, np.nan, 5, 7, 90])

# np.nan is a float, so the Series dtype comes out as float64.
odd.dtypes

dtype('float64')

In [21]:
type(np.nan)

float

NaN을 -1 같은 값으로 대체해 저장하는 경우도 있는데, 데이터 저장은 용이할 수 있지만 분석에서는 평균 등 통계치가 왜곡되는 문제가 생길 수 있다.

In [22]:
np.nan == np.nan # nan끼리는 비교할수가없다.

False

nan의 경우 별도처리를 해야할 경우가 있다.

In [23]:
val = np.nan

print(pd.isnull(val))

print(pd.notnull(val))

True
False


# DataFrame을 생성하는 방법

## 생성하기

In [26]:
# rank가 2인 리스트를 데이터프레임으로 변형할 수 있지만
# 컬럼 이름을 바로 수정할 수 없고, 불편하기 때문에 자주 쓰이지는 않는다.

order = [
    ["2017-01-02", 300, "confirmed"],
    ["2017-01-03", 500, "confirmed"],
    ["2017-01-04", 700, "canceled"]
]

order = pd.DataFrame(order)

order

Unnamed: 0,0,1,2
0,2017-01-02,300,confirmed
1,2017-01-03,500,confirmed
2,2017-01-04,700,canceled


In [27]:
# Column labels are supplied up front so the frame does not fall back to 0, 1, 2.
columns = ["date", "price", "state"]

# Each inner list is one row: order date, price and order state.
order = pd.DataFrame(
    [
        ["2017-01-02", 300, "confirmed"],
        ["2017-01-03", 500, "confirmed"],
        ["2017-01-04", 700, "canceled"],
    ],
    columns=columns,
)

order

Unnamed: 0,date,price,state
0,2017-01-02,300,confirmed
1,2017-01-03,500,confirmed
2,2017-01-04,700,canceled


In [28]:
# A dict can also be turned into a DataFrame: each key becomes a column label
# and each list holds that column's values.

order = {
    # NOTE(review): "data" looks like a typo for "date" (the list-based cell
    # above uses "date" for the same values) — confirm before renaming, since
    # the recorded output below shows this key as the column label.
    "data" : ["2017-01-02", "2017-01-03", "2017-01-04"],
    "price" : [300, 500, 700],
    "state" : ["confirmed", "confirmed", "canceled"]
}

order = pd.DataFrame(order)

order

Unnamed: 0,data,price,state
0,2017-01-02,300,confirmed
1,2017-01-03,500,confirmed
2,2017-01-04,700,canceled


# DataFrame의 기본

In [30]:
# pandas 는 주소에 있는 데이터를 바로 불러오는 기능도 제공
# pd.read_csv(), train데이터 불러오기
train = pd.read_csv("train.csv")
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [33]:
# 위 데이터를 보면 , id는 유니크하므로 이게 직관적으로 행(인덱스)을 구분하기 쉽다.
# pd.read_csv(data, index_col=?) 인덱스 컬럼 설정하기

train = pd.read_csv("train.csv", index_col = "PassengerId")
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [34]:
# 똑같은 결과지만 DF 객체에 set_index 메서드를 활용해도 됨
train = pd.read_csv("train.csv")

train = train.set_index("PassengerId")

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
train.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [35]:
# DF관련 속성 및 메서드 index, columns, values, head()
train.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [37]:
train.values

array([[0, 3, 'Braund, Mr. Owen Harris', ..., 7.25, nan, 'S'],
       [1, 1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', ...,
        71.2833, 'C85', 'C'],
       [1, 3, 'Heikkinen, Miss. Laina', ..., 7.925, nan, 'S'],
       ...,
       [0, 3, 'Johnston, Miss. Catherine Helen "Carrie"', ..., 23.45,
        nan, 'S'],
       [1, 1, 'Behr, Mr. Karl Howell', ..., 30.0, 'C148', 'C'],
       [0, 3, 'Dooley, Mr. Patrick', ..., 7.75, nan, 'Q']], dtype=object)

In [42]:
# 컬럼 이름도 바꿀 수 있다.
# df.columns = ?
# df.rename(columns_mapping, axis)


col = ['생존', '등급', '이름', '성별', '나이', '형제', '부모', '티켓', '운임', '객실', '승선지']

train.columns = col
train.head()

Unnamed: 0_level_0,생존,등급,이름,성별,나이,형제,부모,티켓,운임,객실,승선지
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
train = train.rename(columns={"생존":"survived"})
train.head()

Unnamed: 0_level_0,survived,등급,이름,성별,나이,형제,부모,티켓,운임,객실,승선지
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Pandas의 기본연산

In [46]:
train["Survived"]

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [48]:
train['Fare'].mean()

32.2042079685746

In [49]:
train['Fare'].min()

0.0

In [50]:
train['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [None]:
# Preview the first rows of the Pclass column. The DataFrame in this notebook
# is bound to `train`; `data` is not defined anywhere in the visible notebook.
train['Pclass'].head()

In [53]:
# 여기서 Pclass의 종류가 몇개나 되는지 알고싶으면 unique() 함수를 쓰자.
train["Pclass"].unique()

array([3, 1, 2], dtype=int64)

In [54]:
# 그럼 Pclass의 카테고리 갯수를 세어보고 싶다.
# 생소하나 요긴한 value_counts()
train["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [55]:
# 그럼 전체에서 차지하는 비율은 얼마나 될까를 따져보는 것
# value_counts(normalize)
train["Pclass"].value_counts(normalize=True)

3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64

# Pandas 기본연산2

In [57]:
# Relabel the numeric cabin classes in a single pass using one mapping
# (3 -> economy, 2 -> business, 1 -> first). replace() returns a new Series,
# so the result is assigned back to the column.
train["Pclass"] = train["Pclass"].replace(
    {3: "이코노미", 2: "비지니스", 1: "퍼스트"}
)

# Show the distinct labels now present in the column.
train["Pclass"].unique()

array(['이코노미', '퍼스트', '비지니스'], dtype=object)

In [59]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# replace()는 원본을 직접 바꾸지 않고, 변형된 새 객체를 반환한다.
# 그래서 위에서처럼 결과를 train["Pclass"]에 다시 대입해 주어야 기존 데이터에 반영된다.

In [None]:
# Preview the first ten rows. The DataFrame in this notebook is bound to
# `train`; `data` is not defined anywhere in the visible notebook.
train.head(10)

# Column 접근

### 행렬 검색하기 (행렬 슬라이싱)

In [60]:
# 일반 슬라이싱 방법처럼, [] 안에 컬럼명을 넣는다.
# 단, 복수의 컬럼을 슬라이싱 할 시 []안에 리스트형태로 전달
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [63]:
train[["Survived", "Pclass"]]

Unnamed: 0,Survived,Pclass
0,0,이코노미
1,1,퍼스트
2,1,이코노미
3,1,퍼스트
4,0,이코노미
...,...,...
886,0,비지니스
887,1,퍼스트
888,0,이코노미
889,1,퍼스트


# Row 접근

In [64]:
# df.loc[index]
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 행렬을 동시에 접근하는 방법

In [66]:
# df.loc[index, col]

train.loc[1:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [69]:
train.loc[1, ["Name", "Age"]]

Name    Cumings, Mrs. John Bradley (Florence Briggs Th...
Age                                                    38
Name: 1, dtype: object

# Pandas 마스크

## 마스크

마스크처럼 필요한 부분만 가져오고 나머지는 가린다.

```python
data[데이터 조건]
```

In [None]:
# 퍼스트 클래스인 데이터만

In [71]:
train[train["Pclass"] == "퍼스트"]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,퍼스트,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,퍼스트,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
23,24,1,퍼스트,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,퍼스트,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,퍼스트,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,퍼스트,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,퍼스트,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [73]:
# fare가 100불 이상인 데이터만
train[train["Fare"]>=100]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
27,28,0,퍼스트,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
31,32,1,퍼스트,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
88,89,1,퍼스트,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S
118,119,0,퍼스트,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C
195,196,1,퍼스트,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C
215,216,1,퍼스트,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C
258,259,1,퍼스트,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
268,269,1,퍼스트,"Graham, Mrs. William Thompson (Edith Junkins)",female,58.0,0,1,PC 17582,153.4625,C125,S
269,270,1,퍼스트,"Bissette, Miss. Amelia",female,35.0,0,0,PC 17760,135.6333,C99,S
297,298,0,퍼스트,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S


# Pandas 마스크 응용

In [74]:
# Preview the first rows of `train`. This cell previously called
# `data.head()` and failed with NameError because no `data` variable exists.
train.head()

In [75]:
normal = ['이코노미', '비지니스']

normal

['이코노미', '비지니스']

isin()함수를 이용해서 인덱싱(마스크) 할 수 있다.

In [78]:
# isin(normal)
train["Pclass"].isin(normal)


0       True
1      False
2       True
3      False
4       True
       ...  
886     True
887    False
888     True
889    False
890     True
Name: Pclass, Length: 891, dtype: bool

In [77]:
train["Pclass"].isin(normal)
train[train["Pclass"].isin(normal)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,이코노미,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,이코노미,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,이코노미,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,이코노미,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,비지니스,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,이코노미,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


비어있는 null 값을 찾아내보자.

In [80]:
# isnull(), notnull()
train[train['Cabin'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,이코노미,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,이코노미,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,이코노미,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,이코노미,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,비지니스,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,이코노미,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [81]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Pandas Indexing  다중조건 (and, &)

예를들어, 남성이면서 65세 이상인 사람을 찾는 경우

In [82]:
# pandas에서는 파이썬의 and 키워드를 조건 결합에 쓸 수 없고, 대신 & 연산자를 써야 한다. (각 조건은 괄호로 묶는다.)

train['Sex'] == "male"

train["Age"] >= 65

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [83]:
train[(train['Sex'] == "male") & (train["Age"] >= 65)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
33,34,0,비지니스,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S
54,55,0,퍼스트,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C
96,97,0,퍼스트,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,이코노미,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
280,281,0,이코노미,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q
456,457,0,퍼스트,"Millet, Mr. Francis Davis",male,65.0,0,0,13509,26.55,E38,S
493,494,0,퍼스트,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
630,631,1,퍼스트,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
672,673,0,비지니스,"Mitchell, Mr. Henry Michael",male,70.0,0,0,C.A. 24580,10.5,,S
745,746,0,퍼스트,"Crosby, Capt. Edward Gifford",male,70.0,1,1,WE/P 5735,71.0,B22,S


위 방법은 잘 먹히지만, 코드가 길어진다는 단점이 있다.

이런 단점을 보완하기 위해 조건을 변수로 지정해서 가독성을 높인다.

In [85]:
male = train['Sex'] =='male'
elderly = train['Age'] >= 65

In [86]:
train[male&elderly]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
33,34,0,비지니스,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S
54,55,0,퍼스트,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C
96,97,0,퍼스트,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,이코노미,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
280,281,0,이코노미,"Duane, Mr. Frank",male,65.0,0,0,336439,7.75,,Q
456,457,0,퍼스트,"Millet, Mr. Francis Davis",male,65.0,0,0,13509,26.55,E38,S
493,494,0,퍼스트,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
630,631,1,퍼스트,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
672,673,0,비지니스,"Mitchell, Mr. Henry Michael",male,70.0,0,0,C.A. 24580,10.5,,S
745,746,0,퍼스트,"Crosby, Capt. Edward Gifford",male,70.0,1,1,WE/P 5735,71.0,B22,S


# Pandas Indexing 다중조건2 (or, |)

In [87]:
train[male | elderly]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,이코노미,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,퍼스트,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,이코노미,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,비지니스,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,이코노미,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,비지니스,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,퍼스트,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Row를 인덱싱한 다음 컬럼에 접근하기

In [88]:
# 인덱싱 한 그 객체도 여전히 데이터 프레임이다. 바로 컬럼에 접근 가능

train[male & elderly]['Fare']

33     10.5000
54     61.9792
96     34.6542
116     7.7500
280     7.7500
456    26.5500
493    49.5042
630    30.0000
672    10.5000
745    71.0000
851     7.7750
Name: Fare, dtype: float64

위와 같이 [ ]를 연달아 쓰는 방식(chained indexing)은 pandas에서 권장되지 않는다. loc 안에서 행 조건과 컬럼을 콤마로 구분해 한 번에 접근하자.

In [89]:
train.loc[(male & elderly), 'Fare']

33     10.5000
54     61.9792
96     34.6542
116     7.7500
280     7.7500
456    26.5500
493    49.5042
630    30.0000
672    10.5000
745    71.0000
851     7.7750
Name: Fare, dtype: float64

# 컬럼 추가, 수정하기

컬럼을 추가하는 문법은 수정하는 문법과 일치한다.

In [91]:
# 모든 내용이 같은 경우
train['출신지'] = '미국'
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,미국
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,미국
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,미국
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,미국
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,미국


In [92]:
# 내용을 수정하는 경우

train['출신지'] = '캐나다'
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,캐나다
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,캐나다
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,캐나다
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,캐나다
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,캐나다


Pandas는 행의 개수와 길이만 맞춰주면 리스트도 알아서 컬럼으로 넣어준다.

In [93]:
train.shape

(891, 13)

In [94]:
# Number the rows 0..n-1. The length is taken from the frame itself rather
# than hard-coding 891, so the assignment still matches if the row count changes.
train['order'] = list(range(len(train)))
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지,order
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,캐나다,0
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,캐나다,1
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,캐나다,2
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,캐나다,3
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,캐나다,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,비지니스,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,캐나다,886
887,888,1,퍼스트,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,캐나다,887
888,889,0,이코노미,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,캐나다,888
889,890,1,퍼스트,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,캐나다,889


# 컬럼 추가, 수정2

In [113]:
# Fare가 $200 이상, Pclass가 퍼스트급인 사람은 VIP로 하자.

rich = train["Fare"] >= 200
first = train["Pclass"] == "퍼스트"

train.loc[rich&first, "VIP"] = True

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지,order,VIP
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,캐나다,0,
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,캐나다,1,
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,캐나다,2,
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,캐나다,3,
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,캐나다,4,


# 직접해보기

VIP인 사람만 추출하세요.

In [116]:
# write your code here!
train[train["VIP"] == True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지,order,VIP
27,28,0,퍼스트,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S,캐나다,27,True
88,89,1,퍼스트,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S,캐나다,88,True
118,119,0,퍼스트,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C,캐나다,118,True
258,259,1,퍼스트,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C,캐나다,258,True
299,300,1,퍼스트,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5208,B58 B60,C,캐나다,299,True
311,312,1,퍼스트,"Ryerson, Miss. Emily Borie",female,18.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C,캐나다,311,True
341,342,1,퍼스트,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S,캐나다,341,True
377,378,0,퍼스트,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C,캐나다,377,True
380,381,1,퍼스트,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,C,캐나다,380,True
438,439,0,퍼스트,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S,캐나다,438,True


# 직접해보기
VIP인 사람들의 이름을 추출하세요.

In [120]:
# write your code here!
train.loc[train["VIP"]==True, "Name"]

27                        Fortune, Mr. Charles Alexander
88                            Fortune, Miss. Mabel Helen
118                             Baxter, Mr. Quigg Edmond
258                                     Ward, Miss. Anna
299      Baxter, Mrs. James (Helene DeLaudeniere Chaput)
311                           Ryerson, Miss. Emily Borie
341                       Fortune, Miss. Alice Elizabeth
377                            Widener, Mr. Harry Elkins
380                                Bidois, Miss. Rosalie
438                                    Fortune, Mr. Mark
527                                   Farthing, Mr. John
557                                  Robbins, Mr. Victor
679                   Cardeza, Mr. Thomas Drake Martinez
689                    Madill, Miss. Georgette Alexandra
700    Astor, Mrs. John Jacob (Madeleine Talmadge Force)
716                        Endres, Miss. Caroline Louise
730                        Allen, Miss. Elisabeth Walton
737                            

## VIP 컬럼의 값을 bool값이 아니라, VIP, Non-VIP로 맵핑하자.

# 컬럼 삭제

df.drop(col, axis)

In [122]:
train = train.drop("VIP", axis = 1)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지,order
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,캐나다,0
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,캐나다,1
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,캐나다,2
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,캐나다,3
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,캐나다,4


In [None]:
# Pandas는 데이터 보호를 위해, drop을 호출하는 것만으로는 원본 데이터를 바꾸지 않는다.
# drop은 새 DataFrame을 반환하므로, 위에서처럼 결과를 다시 대입해야 삭제가 반영된다.


# Pandas apply, 파이썬 함수를 적용할 수 있게 하는 함수

이것을 이용해서, 운임(Fare)이 높은 퍼스트 클래스 승객을 VIP로 표시해보자.

In [None]:
def is_vip(row):
    """Return True when a passenger row qualifies as VIP.

    A passenger is VIP when the fare is at least 200 and the cabin class is
    "퍼스트" (first class). The fare bound is inclusive so that this agrees
    with the mask-based VIP rule used earlier in the notebook
    (``train["Fare"] >= 200``).

    Args:
        row: A mapping-like row (for example, one row passed in by
            ``DataFrame.apply(..., axis=1)``) with "Fare" and "Pclass" entries.

    Returns:
        bool: True if both conditions hold, otherwise False.
    """
    # bool() keeps the result a plain Python bool even when the fare
    # comparison yields a NumPy boolean.
    return bool(row["Fare"] >= 200 and row["Pclass"] == "퍼스트")

train["VIP"] = train.apply(is_vip, axis=1)

In [123]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,출신지,order
0,1,0,이코노미,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,캐나다,0
1,2,1,퍼스트,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,캐나다,1
2,3,1,이코노미,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,캐나다,2
3,4,1,퍼스트,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,캐나다,3
4,5,0,이코노미,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,캐나다,4


### 복수조건이 들어갈 때 apply 구현 방법

퍼스트에 탄 여성만을 VIP로 바꾸어보자.