# 데이터 Filtering ( Query )

## 1. 일반적인 방법

 DataFrame [ ] 안에 조건식을 넣는다.

In [8]:
import pandas as pd
# Show at most 4 rows when pandas prints a DataFrame/Series, to keep outputs short.
pd.options.display.max_rows = 4

# Load the Titanic training CSV from the local ./data directory.
train = pd.read_csv("./data/titanic_train.csv")

In [9]:
train['Survived']==0 # boolean mask, one value per row; train.Survived==0 means the same thing

0       True
1      False
       ...  
889    False
890     True
Name: Survived, Length: 891, dtype: bool

In [10]:
# Pass the boolean mask to .loc to keep only the rows where it is True.
train.loc[ train['Survived']==0 ]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


#### 다중 조건 ( & 또는 | 를 사용 )

In [11]:
## Each condition must be wrapped in its own ( ): & binds tighter than > and ==
train.loc[(train.Age>30) & (train.Survived==0)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q


## 2. eval 또는 query를 활용

참고 : https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html

In [1]:
import pandas as pd
# Show at most 4 rows when pandas prints a DataFrame/Series, to keep outputs short.
pd.options.display.max_rows = 4

# Load the Titanic training CSV from the local ./data directory.
train = pd.read_csv("./data/titanic_train.csv")

### eval과 query를 사용하는 이유 - memory 효율적 처리

다중 조건을 사용하는 경우, 일반적 방법은 각각의 결과를 계산 후 마지막 논리 연산을 수행한다.

반면, eval과 query는 row마다 한 번에 모든 조건을 검사해 메모리를 효율적으로 사용한다고 한다.

조건식이 복잡하거나 데이터가 클수록 효과적이다.

일반적으로, 10,000 row 이상 데이터에 사용을 추천함

#### eval의 performance를 높이기 위해 **numexpr** 설치를 추천 (설치 시 자동 적용)

참고 : [Enhancing performance - eval](https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval)

### eval

In [7]:
# pd.eval evaluates the expression string; `train` is referenced by name inside it.
pd.eval('train.Survived == 0 and train.Pclass == 3')
# Same as train.eval('Survived == 0 and Pclass == 3').

0       True
1      False
       ...  
889    False
890     True
Length: 891, dtype: bool

eval과 일반적 방법 차이

In [None]:
# Plain boolean indexing: every comparison needs its own parentheses, because
# & binds tighter than ==. Without them, `0 & (train.Pclass == 3)` is evaluated
# first and the Pclass condition is silently lost.
train[(train.Survived == 0) & (train.Pclass == 3)]
# eval: the same filter written as an expression string
train[train.eval('Survived == 0 and Pclass == 3')]

### query

기본적으로 `df.query(expr)`는 `df[df.eval(expr)]`와 같은 결과를 반환한다.

In [12]:
# Filter rows with a query expression string; column names are used directly.
train.query('Survived == 0')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


#### 변수 값을 query 안에 넣어서 활용

In [13]:
survived = 1
# Prefix a Python variable with @ to reference it inside the query string.
train.query('Survived == @survived')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### 다중 조건 - and나 or 도 사용 가능 / ( ) 없이 사용 가능

In [14]:
# Inside query, `and` combines conditions and no per-condition parentheses are needed.
train.query('Age>30 and Survived==0')
# Alternative spelling with &:
# train.query('Age>30 & Survived==0')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q


#### 추가 기능

#### - in, not in

In [29]:
# Values to match against the Embarked column.
check_list = ['S','Q']
train.query('Embarked in @check_list') # same rows as train[train['Embarked'].isin(['S','Q'])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.250,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.100,C123,S
...,...,...,...,...,...,...,...,...,...,...,...,...
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.450,,S
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.750,,Q


#### - index 자체를 조건으로 활용

In [30]:
train.query('index > 100') # `index` refers to the DataFrame index: rows labelled 101 and above

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
101,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
102,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.7500,,Q


multi index의 경우도 가능

In [15]:
# set_index returns a new DataFrame and leaves `train` untouched, so there is
# no need to slice `train` first and then mutate that slice with inplace=True.
multi_index_train = train.set_index(['Sex', 'Pclass'])

In [16]:
# Sex and Pclass are index levels here; query accepts their names like column names.
multi_index_train.query('Sex=="male" and Pclass==3')

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
male,3,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S
male,3,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S
male,...,...,...,...,...,...,...,...,...,...,...
male,3,885,0,"Sutehall, Mr. Henry Jr",25.0,0,0,SOTON/OQ 392076,7.05,,S
male,3,891,0,"Dooley, Mr. Patrick",32.0,0,0,370376,7.75,,Q


## 3. numpy 최적화

### .values 사용

조건식에 .values만 추가해도 ndarray형태가 되어 빠른 연산이 가능하다.

In [6]:
# Baseline timing (IPython %timeit magic): mask built from pandas Series comparisons.
%timeit train.loc[(train.Age>30) & (train.Survived==0)]

1.14 ms ± 5.94 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# Same filter, but the comparisons run on the underlying NumPy arrays (.values).
%timeit train.loc[(train.Age.values>30) & (train.Survived.values==0)]
# A warning message is emitted because of the NaN values.

  """Entry point for launching an IPython kernel.


403 µs ± 537 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### .isin

In [25]:
import numpy as np
# NumPy array of Embarked values to test membership against.
check_list = np.array(['S','Q'])

In [24]:
# Timing for the query-string `in` membership filter.
%timeit train.query('Embarked in @check_list')

1.53 ms ± 5.95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
# Timing for the same membership filter using np.isin on the column's NumPy array.
%timeit train[np.isin(train['Embarked'].values, check_list)]

476 µs ± 5.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
