# pandas 데이터 파악과 조작

**분석할 데이터를 수집(확보)하면 데이터의 특징을 파악하고 다루기 쉽게 변형하는 작업을 수행해야 한다**

# #2. 데이터 조작(가공)

- 데이터 개수 세기 : count(), value_counts()
- 데이터 정렬 : sort_values(), sort_index()
- 데이터 집계 : 합계(sum()), 평균(mean()), 최대(max()), 최소(min())
- 데이터 삭제 : drop(axis=0/1)
- 결측치 처리 : dropna(axis=0/1, subset, inplace)
- 데이터 변경 : 
    - 자료형 변경 : astype()
    - 수치형 데이터를 범주형 데이터로 변경 : 
        - 구간을 지정하여 범주화 : cut(data, bins, labels)
        - 동일한 개수를 갖도록 범주화 : qcut(data, bins_num, labels)
- 행/열에 동일한 함수 적용 : apply()
- 열 재구성 : DataFrame.str.split(), Series.str.get()
- 필터링 : isin()

---

In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

In [3]:
import warnings
warnings.filterwarnings('ignore')

# 8. 열 재구성

### 열 순서 변경

- 데이터프레임의 열 순서 변경
- 형식 : df[재구성한 열이름리스트]

In [4]:
import seaborn as sns

titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
df = titanic.loc[0:4, 'survived':'age']
df

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [9]:
col = df.columns
col = list(col)
col

['survived', 'pclass', 'sex', 'age']

In [10]:
col_sorted = sorted(col)
col_sorted

['age', 'pclass', 'sex', 'survived']

In [11]:
df[col_sorted]

Unnamed: 0,age,pclass,sex,survived
0,22.0,3,male,0
1,38.0,1,female,1
2,26.0,3,female,1
3,35.0,1,female,1
4,35.0,3,male,0


In [12]:
df[reversed(col)]

Unnamed: 0,age,sex,pclass,survived
0,22.0,male,3,0
1,38.0,female,1,1
2,26.0,female,3,1
3,35.0,female,1,1
4,35.0,male,3,0


### 열 분리

In [13]:
# 넷플렉스 주가 데이터
# https://www.kaggle.com/datasets/pritsheta/netflix-stock-data-from-2002-to-2021?resource=download

df = pd.read_csv('data/NFLX.csv')
df.head()
df.info()
df.shape

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2002-05-23,1.156429,1.242857,1.145714,1.196429,1.196429,104790000
1,2002-05-24,1.214286,1.225,1.197143,1.21,1.21,11104800
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400
3,2002-05-29,1.164286,1.164286,1.085714,1.103571,1.103571,6757800
4,2002-05-30,1.107857,1.107857,1.071429,1.071429,1.071429,10154200


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874 entries, 0 to 4873
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4874 non-null   object 
 1   Open       4874 non-null   float64
 2   High       4874 non-null   float64
 3   Low        4874 non-null   float64
 4   Close      4874 non-null   float64
 5   Adj Close  4874 non-null   float64
 6   Volume     4874 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 266.7+ KB


(4874, 7)

#### Date열의 연-월-일을 ['year','month','day']로 분리

In [14]:
dates = df.Date.str.split('-')
dates

0       [2002, 05, 23]
1       [2002, 05, 24]
2       [2002, 05, 28]
3       [2002, 05, 29]
4       [2002, 05, 30]
             ...      
4869    [2021, 09, 24]
4870    [2021, 09, 27]
4871    [2021, 09, 28]
4872    [2021, 09, 29]
4873    [2021, 09, 30]
Name: Date, Length: 4874, dtype: object

#### 시리즈의 문자열 리스트 인덱싱 : Series.str.get(인덱스)

In [15]:
dates.str.get(0)

0       2002
1       2002
2       2002
3       2002
4       2002
        ... 
4869    2021
4870    2021
4871    2021
4872    2021
4873    2021
Name: Date, Length: 4874, dtype: object

#### ['year','month','day']로 분리된 시리즈문자열 요소를 데이터프레임 열로 추가

In [19]:
df['year'] = dates.str.get(0)
df['month'] = dates.str.get(1)
df['day'] = dates.str.get(2)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,year,month,day
0,2002-05-23,1.156429,1.242857,1.145714,1.196429,1.196429,104790000,2002,5,23
1,2002-05-24,1.214286,1.225,1.197143,1.21,1.21,11104800,2002,5,24
2,2002-05-28,1.213571,1.232143,1.157143,1.157143,1.157143,6609400,2002,5,28
3,2002-05-29,1.164286,1.164286,1.085714,1.103571,1.103571,6757800,2002,5,29
4,2002-05-30,1.107857,1.107857,1.071429,1.071429,1.071429,10154200,2002,5,30


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874 entries, 0 to 4873
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4874 non-null   object 
 1   Open       4874 non-null   float64
 2   High       4874 non-null   float64
 3   Low        4874 non-null   float64
 4   Close      4874 non-null   float64
 5   Adj Close  4874 non-null   float64
 6   Volume     4874 non-null   int64  
 7   year       4874 non-null   object 
 8   month      4874 non-null   object 
 9   day        4874 non-null   object 
dtypes: float64(5), int64(1), object(4)
memory usage: 380.9+ KB


## 9. 필터링

### 불린 인덱싱(boolean indexing)

In [20]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


#### 나이가 10대(10~19세) 승객 추출

In [29]:
mask = (titanic.age >=10) & (titanic.age <=19)
titanic[mask].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True
22,1,3,female,15.0,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
38,0,3,female,18.0,2,0,18.0,S,Third,woman,False,,Southampton,no,False


#### 나이가 10세 미만이고 여성인 승객 추출

In [31]:
mask2 = (titanic.age < 10) & (titanic.sex == 'female')
titanic[mask2].head()
titanic.loc[mask2].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
24,0,3,female,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
58,1,2,female,5.0,1,2,27.75,S,Second,child,False,,Southampton,yes,False
119,0,3,female,2.0,4,2,31.275,S,Third,child,False,,Southampton,no,False


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
24,0,3,female,8.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
58,1,2,female,5.0,1,2,27.75,S,Second,child,False,,Southampton,yes,False
119,0,3,female,2.0,4,2,31.275,S,Third,child,False,,Southampton,no,False


#### 나이가 10세 미만 또는 60세 이상인 승객의 age, sex, alone 열 추출

In [33]:
mask3 = [(titanic.age < 10) | (titanic.age >= 60)]
col = ['age', 'sex', 'alone']
titanic.loc[mask, col].head()

Unnamed: 0,age,sex,alone
9,14.0,female,False
14,14.0,female,True
22,15.0,female,True
27,19.0,male,False
38,18.0,female,False


### isin() 메서드 활용

- isin() 메서드를 적용하면 특정 값을 가진 행들을 추출함
- 형식 : df.isin(추출값 리스트)

In [34]:
# IPython 디스플레이 설정 변경 : 출력할 최대 열의 개수
pd.set_option('display.max_columns', 10)

In [35]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,...,True,,Southampton,no,False
1,1,1,female,38.0,1,...,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,...,False,,Southampton,yes,True
3,1,1,female,35.0,1,...,False,C,Southampton,yes,False
4,0,3,male,35.0,0,...,True,,Southampton,no,True


#### 함께 탑승한 형제 또는 배우자 수가 3,4,5인 승객만 추출

- 불린 인덱싱 사용

In [36]:
mask1 = titanic.sibsp == 3
mask2 = titanic.sibsp == 4
mask3 = titanic.sibsp == 5
titanic[mask1 | mask2 | mask3]

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False
59,0,3,male,11.0,5,...,False,,Southampton,no,False
63,0,3,male,4.0,3,...,False,,Southampton,no,False
68,1,3,female,17.0,4,...,False,,Southampton,yes,False
71,0,3,female,16.0,5,...,False,,Southampton,no,False
85,1,3,female,33.0,3,...,False,,Southampton,yes,False


- isin() 메서드 사용

In [38]:
titanic[titanic.sibsp.isin([3,4,5])]

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False
59,0,3,male,11.0,5,...,False,,Southampton,no,False
63,0,3,male,4.0,3,...,False,,Southampton,no,False
68,1,3,female,17.0,4,...,False,,Southampton,yes,False
71,0,3,female,16.0,5,...,False,,Southampton,no,False
85,1,3,female,33.0,3,...,False,,Southampton,yes,False


-----------------------------------------