# [DataFrame Sorting]
  * 종류 : 인덱스 기반 정렬, 값 기반 정렬

In [9]:
# 1. Load the pandas module
import pandas as pd

# 2. Prepare the data: path of the movies CSV file
dataFile = "/content/movies.csv"

# 3. Read the CSV into a DataFrame; the bare name on the last line displays it
movieDF = pd.read_csv(dataFile)
movieDF

Unnamed: 0,Rank,Title,Studio,Gross,Year
0,1,Avengers: Endgame,Buena Vista,"$2,796.30",2019
1,2,Avatar,Fox,"$2,789.70",2009
2,3,Titanic,Paramount,"$2,187.50",1997
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018
...,...,...,...,...,...
777,778,Yogi Bear,Warner Brothers,$201.60,2010
778,779,Garfield: The Movie,Fox,$200.80,2004
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
780,781,The Hunt for Red October,Paramount,$200.50,1990


In [10]:
# Summarize the overall structure of the data: entries, columns, non-null
# counts and dtypes
movieDF.info()

# - The original note flagged the Year column as having a type problem.
#   NOTE(review): in the recorded output Year loads as int64; confirm what
#   was meant. Gross is the column stored as object, because its values are
#   strings such as "$2,796.30".

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rank    782 non-null    int64 
 1   Title   782 non-null    object
 2   Studio  782 non-null    object
 3   Gross   782 non-null    object
 4   Year    782 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 30.7+ KB


In [23]:
# .describe() : per-column summary statistics (here only the numeric columns
# Rank and Year are reported)
movieDF.describe()

# Note: count is a whole number, but it is shown as a float (782.0) because
# the other statistics in the same column are floats.

Unnamed: 0,Rank,Year
count,782.0,782.0
mean,391.5,2006.620205
std,225.888247,10.026227
min,1.0,1939.0
25%,196.25,2001.0
50%,391.5,2009.0
75%,586.75,2014.0
max,782.0,2019.0


In [25]:
# include='all' : also summarize the non-numeric columns, which adds the
# unique / top / freq rows to the result
movieDF.describe(include='all')
# - unique : number of distinct values in the column
# - top : the most frequent value
# - freq : how many times the top value occurs

Unnamed: 0,Rank,Title,Studio,Gross,Year
count,782.0,782,782,782,782.0
unique,,773,37,701,
top,,Beauty and the Beast,Warner Brothers,$225.90,
freq,,2,132,3,
mean,391.5,,,,2006.620205
std,225.888247,,,,10.026227
min,1.0,,,,1939.0
25%,196.25,,,,2001.0
50%,391.5,,,,2009.0
75%,586.75,,,,2014.0


### ㅇ 수치 칼럼의 최소, 최대, 평균 데이터 추출

In [16]:
# Keep the describe() result, then pick the min / max / mean rows by label
movieDes = movieDF.describe()            # the instructor named this twoDF
movieDes.loc[['min', 'max', 'mean']]

Unnamed: 0,Rank,Year
min,1.0,1939.0
max,782.0,2019.0
mean,391.5,2006.620205


### ㅇ 행 열의 인덱스를 바꿔보자 - Year가 행으로

In [26]:
# Build the transposed view by hand from the Rank and Year data: one row per
# column (Rank, Year) and one column per original row label.
# - Each values array is passed directly as a row. Wrapping every array in its
#   own list ([[...], [...]]) produces 2 rows x 1 column, which is what raised
#   "ValueError: 782 columns passed, passed data had 1 columns".
# - The row index names only the two rows being built; movieDF.columns holds
#   five labels and would not match two rows.
pd.DataFrame([movieDF.Rank.values, movieDF.Year.values],
             columns=movieDF.index,
             index=['Rank', 'Year'])

### ㅇ 하지만 행과 열을 쉽게 바꾸는 방법이 있음 (.T, .transpose())

In [20]:
# .T : swaps rows and columns (way 1); accessed as an attribute, without
# call parentheses
movieDes.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rank,782.0,391.5,225.888247,1.0,196.25,391.5,586.75,782.0
Year,782.0,2006.620205,10.026227,1939.0,2001.0,2009.0,2014.0,2019.0


In [21]:
# .transpose() : swaps rows and columns (way 2); the recorded output is the
# same as for .T
movieDes.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rank,782.0,391.5,225.888247,1.0,196.25,391.5,586.75,782.0
Year,782.0,2006.620205,10.026227,1939.0,2001.0,2009.0,2014.0,2019.0


ㅇ describe(include='all'), info() 결과를 읽는 법도 설명

### (5) 데이터 정렬

In [29]:
# Sort by index labels => DF.sort_index()
movieDF.sort_index(ascending=False, axis=0) # descending order by row label
# - axis = 'columns' or 'index' : sort by column labels or by row labels
#   (axis=0 means 'index')
# - inplace=False : the result is not stored back into movieDF
# - ascending=True : ascending order

Unnamed: 0,Rank,Title,Studio,Gross,Year
781,782,Valkyrie,MGM,$200.30,2008
780,781,The Hunt for Red October,Paramount,$200.50,1990
779,780,Cats & Dogs,Warner Brothers,$200.70,2001
778,779,Garfield: The Movie,Fox,$200.80,2004
777,778,Yogi Bear,Warner Brothers,$201.60,2010
...,...,...,...,...,...
4,5,Avengers: Infinity War,Buena Vista,"$2,048.40",2018
3,4,Star Wars: The Force Awakens,Buena Vista,"$2,068.20",2015
2,3,Titanic,Paramount,"$2,187.50",1997
1,2,Avatar,Fox,"$2,789.70",2009


In [37]:
# Sort by column labels => descending order
# (columns come out as Year, Title, Studio, Rank, Gross)
movieDF.sort_index(axis='columns', ascending=False)

Unnamed: 0,Year,Title,Studio,Rank,Gross
0,2019,Avengers: Endgame,Buena Vista,1,"$2,796.30"
1,2009,Avatar,Fox,2,"$2,789.70"
2,1997,Titanic,Paramount,3,"$2,187.50"
3,2015,Star Wars: The Force Awakens,Buena Vista,4,"$2,068.20"
4,2018,Avengers: Infinity War,Buena Vista,5,"$2,048.40"
...,...,...,...,...,...
777,2010,Yogi Bear,Warner Brothers,778,$201.60
778,2004,Garfield: The Movie,Fox,779,$200.80
779,2001,Cats & Dogs,Warner Brothers,780,$200.70
780,1990,The Hunt for Red October,Paramount,781,$200.50


In [40]:
# Sort rows by the values in the Year column (oldest year first)
movieDF.sort_values(by='Year')

# - by : str or list of str
#        Name or list of names to sort by.
#        => the key(s) used as the sort criterion


Unnamed: 0,Rank,Title,Studio,Gross,Year
287,288,Gone with the Wind,MGM,$402.40,1939
539,540,Bambi,RKO,$267.40,1942
707,708,101 Dalmatians,Buena Vista,$215.90,1961
754,755,The Jungle Book,Buena Vista,$205.80,1967
603,604,The Godfather,Paramount,$245.10,1972
...,...,...,...,...,...
685,686,Men in Black International,Sony,$220.80,2019
457,458,John Wick: Chapter 3 - Parabellum,Lionsgate,$304.70,2019
262,263,Pokemon Detective Pikachu,Warner Brothers,$427.50,2019
602,603,Dark Phoenix,Fox,$245.10,2019


In [41]:
# Sort by Year, breaking ties with Title
movieDF.sort_values(by=['Year', 'Title'])

# With two keys, the first key is compared first; the second key is consulted
# only when the first one is equal.

Unnamed: 0,Rank,Title,Studio,Gross,Year
287,288,Gone with the Wind,MGM,$402.40,1939
539,540,Bambi,RKO,$267.40,1942
707,708,101 Dalmatians,Buena Vista,$215.90,1961
754,755,The Jungle Book,Buena Vista,$205.80,1967
603,604,The Godfather,Paramount,$245.10,1972
...,...,...,...,...,...
339,340,Shazam!,Warner Brothers,$364.10,2019
669,670,The Secret Life of Pets 2,Universal,$225.90,2019
113,114,The Wandering Earth,China Film Corporation,$699.80,2019
197,198,Toy Story 4,Buena Vista,$519.80,2019


### [실습]

In [42]:
import numpy as np
from math import nan                         # nan : Not a number ; 빈칸 의미

# Practice data: col1 holds one missing value (nan) and col4 mixes upper- and
# lower-case letters, so both na_position and key can be tried on it.
data = dict(
    col1=['A', 'A', 'B', nan, 'D', 'C'],
    col2=[2, 1, 9, 8, 7, 4],
    col3=[0, 1, 9, 4, 2, 3],
    col4=['a', 'B', 'c', 'D', 'e', 'F'],
)

dataDF = pd.DataFrame(data)
dataDF

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


 - col1 컬럼을 기준으로 오름차순 정렬

In [43]:
# NaN : missing value (empty cell); where it is placed in the sorted result
# - controlled by the na_position parameter
# - na_position = 'first' or 'last'
dataDF.sort_values(by='col1', na_position='first')   # the missing value comes first

Unnamed: 0,col1,col2,col3,col4
3,,8,4,D
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
5,C,4,3,F
4,D,7,2,e


In [45]:
# key parameter:
#  - key : key function applied to the column before sorting; usually a
#          lambda (e.g. lambda col: col.str.upper())

dataDF.sort_values(by='col4')  # without key, upper-case letters sort before lower-case ones

Unnamed: 0,col1,col2,col3,col4
1,A,1,1,B
3,,8,4,D
5,C,4,3,F
0,A,2,0,a
2,B,9,9,c
4,D,7,2,e


In [46]:
# Use key= with a lambda that upper-cases the column so the sort ignores case
dataDF.sort_values(by='col4', key= lambda col:col.str.upper())

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


In [None]:
# Fin.