In [209]:
import pandas as pd

* csv 파일에서 특정 열만 지정하여 데이터프레임 생성

In [210]:
# Load only the three columns used in this walkthrough instead of the whole file.
movies = pd.read_csv(
    'data/movies.csv',
    usecols=['Film', 'Genre', 'Year'],
)
movies.head()

Unnamed: 0,Film,Genre,Year
0,Zack and Miri Make a Porno,Romance,2008
1,Youth in Revolt,Comedy,2010
2,You Will Meet a Tall Dark Stranger,Comedy,2010
3,When in Rome,Comedy,2010
4,What Happens in Vegas,Comedy,2008


In [211]:
# Load every column of the same CSV; later cells read the rating and gross columns from this frame.
movies_all = pd.read_csv('data/movies.csv')
movies_all.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


# 브로드캐스팅을 활용하여 특정 열 생성 및 초기화

* Audience score, Rotten Tomatoes는 영화 평가 지표임  
has_seen이라는 열을 만들어 두 지표의 평균값을 구하기 위해 열 생성 및 초기화 시도

In [212]:
movies['has_seen'] = 0 # create a new column; the scalar 0 is broadcast to every row
movies.head()

Unnamed: 0,Film,Genre,Year,has_seen
0,Zack and Miri Make a Porno,Romance,2008,0
1,Youth in Revolt,Comedy,2010,0
2,You Will Meet a Tall Dark Stranger,Comedy,2010,0
3,When in Rome,Comedy,2010,0
4,What Happens in Vegas,Comedy,2008,0


In [213]:
# List the column labels of the full frame.
movies_all.columns

Index(['Film', 'Genre', 'Lead Studio', 'Audience score %', 'Profitability',
       'Rotten Tomatoes %', 'Worldwide Gross', 'Year'],
      dtype='object')

In [214]:
# Overwrite has_seen with the floor-divided mean of the two rating columns.
audience_score = movies_all['Audience score %']
critic_score = movies_all['Rotten Tomatoes %']
movies['has_seen'] = (audience_score + critic_score) // 2
movies.head()

Unnamed: 0,Film,Genre,Year,has_seen
0,Zack and Miri Make a Porno,Romance,2008,67
1,Youth in Revolt,Comedy,2010,60
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39
3,When in Rome,Comedy,2010,29
4,What Happens in Vegas,Comedy,2008,50


# 타입변경

In [215]:
# Print per-column non-null counts and dtypes to see which columns were parsed as numbers.
movies_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Film               77 non-null     object 
 1   Genre              77 non-null     object 
 2   Lead Studio        77 non-null     object 
 3   Audience score %   77 non-null     int64  
 4   Profitability      77 non-null     float64
 5   Rotten Tomatoes %  77 non-null     int64  
 6   Worldwide Gross    77 non-null     object 
 7   Year               77 non-null     int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 4.9+ KB


In [216]:
# Show the first rows of the full frame again for reference.
movies_all.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


* 숫자로 변환가능한 문자열은 판다스에서 자동으로 숫자로 변환해 준다.
* csv 파일은 텍스트 파일이기 때문에 '$'와 같은 서식이 들어간 숫자 데이터를 문자열로 인식한다. 예) Worldwide Gross
* 서식이 있는 숫자 데이터는 서식을 제거하고 별도로 형변환을 해주어야 한다. => 이러한 작업을 전처리(Preprocessing)라고 한다.
* 열의 타입이 문자열인 경우, 열.str 속성의 문자열 함수를 사용할 수 있다.

# 타입변경을 위한 기본 전처리

* 전처리(Preprocessing): 데이터 분석을 하기 위해서 원본 데이터를 분석 가능한 데이터로 조작하는 작업

In [217]:
# Inspect the raw 'Worldwide Gross' column.
movies_all['Worldwide Gross']

0      $41.94 
1      $19.62 
2      $26.66 
3      $43.04 
4     $219.37 
        ...   
72     $29.37 
73     $30.68 
74      $8.97 
75    $160.31 
76     $60.72 
Name: Worldwide Gross, Length: 77, dtype: object

* 데이터프레임['타입이 문자열인 열 이름'].str.replace([원본문자열],[바꿀문자열])
    * 문자열을 포함한 열의 모든 행데이터값에 대해서 replace 수행
    * 변경된 값을 반환한다. 원본이 바뀌는 것은 X

In [218]:
# Strip the dollar sign from every value and preview the first three results.
# regex=False makes '$' a literal character; as a regular expression '$' is the
# end-of-string anchor, and the default of `regex` differs between pandas versions.
movies_all['Worldwide Gross'].str.replace('$', '', regex=False).head(3)

0    41.94 
1    19.62 
2    26.66 
Name: Worldwide Gross, dtype: object

In [219]:
# Display the original column again to compare it with the replaced values.
movies_all['Worldwide Gross'].head(3)

0    $41.94 
1    $19.62 
2    $26.66 
Name: Worldwide Gross, dtype: object

In [220]:
# Keep the dollar-stripped values in a new Series.
# regex=False treats '$' literally rather than as the regex end-of-string anchor,
# so the result does not depend on the pandas version's default.
wg = movies_all['Worldwide Gross'].str.replace('$', '', regex=False)
wg.head(3)  # the values still have a string type at this point

0    41.94 
1    19.62 
2    26.66 
Name: Worldwide Gross, dtype: object

In [221]:
# Running the line below raises a TypeError.
# wg * 1392.20 # the values look like floats but are actually strings, so numeric arithmetic is not possible.

* astype: 형변환 함수

In [222]:
# Convert the cleaned strings to float64; astype returns a new Series.
wg.astype('float64')

0      41.94
1      19.62
2      26.66
3      43.04
4     219.37
       ...  
72     29.37
73     30.68
74      8.97
75    160.31
76     60.72
Name: Worldwide Gross, Length: 77, dtype: float64

* 메소드 파이프라인  
  앞의 메소드의 수행 결과를 연속적으로 사용하는 프로그래밍 방식

In [223]:
# Method chaining: strip the dollar sign, then cast to float64 in one expression.
# regex=False keeps '$' a literal character regardless of the pandas version.
movies_all['Worldwide Gross'].str.replace('$', '', regex=False).astype('float64')

0      41.94
1      19.62
2      26.66
3      43.04
4     219.37
       ...  
72     29.37
73     30.68
74      8.97
75    160.31
76     60.72
Name: Worldwide Gross, Length: 77, dtype: float64

In [224]:
# Multiplier applied to the dollar amounts.
# NOTE(review): presumably a USD -> KRW exchange rate, since the result is later
# stored in the '매출(한화)' column — confirm the source and date of this rate.
USD_TO_KRW_RATE = 1392.20

# Strip the literal '$' (regex=False avoids the regex end-of-string meaning),
# cast to float64, then scale every value by the rate.
converted_gross = (
    movies_all['Worldwide Gross']
    .str.replace('$', '', regex=False)
    .astype('float64')
    * USD_TO_KRW_RATE
)

# 전처리한 결과를 특정 위치에 삽입

* insert() : 데이터프레임.insert(삽입열의 인덱스, 열이름, 값) => 함수 수행후 데이터 프레임의 값을 갱신한다.

* 열의 인덱스 확인

In [225]:
# Check the current column order before choosing an insert position.
movies.columns

Index(['Film', 'Genre', 'Year', 'has_seen'], dtype='object')

* get_loc([열이름])  
특정열의 인덱스를 반환한다.

In [226]:
movies.columns.get_loc('has_seen') # get_loc(column_name) returns the integer position of that column

3

In [227]:
# Insert the converted gross as a new column at position 4.
# insert() modifies the frame in place and raises ValueError when the column
# already exists, which is what happens if this cell is run a second time.
# Only that error is caught, so unrelated failures (for example an undefined
# variable) still surface instead of being reported as a duplicate column.
try:
    movies.insert(4, '매출(한화)', converted_gross)
except ValueError:
    print('동일한 열이 있으므로 삽입할 수 없습니다.')

In [228]:
# Display the frame to check the '매출(한화)' column.
movies

Unnamed: 0,Film,Genre,Year,has_seen,매출(한화)
0,Zack and Miri Make a Porno,Romance,2008,67,58388.868
1,Youth in Revolt,Comedy,2010,60,27314.964
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39,37116.052
3,When in Rome,Comedy,2010,29,59920.288
4,What Happens in Vegas,Comedy,2008,50,305406.914
...,...,...,...,...,...
72,Across the Universe,romance,2007,69,40888.914
73,A Serious Man,Drama,2009,76,42712.696
74,A Dangerous Method,Drama,2011,84,12488.034
75,27 Dresses,Comedy,2008,55,223183.582


* 열 이름으로 열 삭제

In [229]:
# List the column labels before dropping one by name.
movies.columns

Index(['Film', 'Genre', 'Year', 'has_seen', '매출(한화)'], dtype='object')

In [230]:
# Preview the frame without the '매출(한화)' column; the result is only displayed, not assigned.
(
    movies
    .drop(columns=['매출(한화)'])
    .head(3)
)

Unnamed: 0,Film,Genre,Year,has_seen
0,Zack and Miri Make a Porno,Romance,2008,67
1,Youth in Revolt,Comedy,2010,60
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39


In [231]:
# Remove the column from `movies` itself (inplace=True).
# drop() raises KeyError when the label is missing, e.g. when this cell is run
# twice; only that error is caught so other failures are not masked.
try:
    movies.drop(columns=['매출(한화)'], inplace=True)
except KeyError:
    print("해당열이 존재하지 않아 삭제가 불가능합니다.")
movies.head(3)

Unnamed: 0,Film,Genre,Year,has_seen
0,Zack and Miri Make a Porno,Romance,2008,67
1,Youth in Revolt,Comedy,2010,60
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39


* 특정 열 다음에 신규열 추가하기  
    * Film 열 바로 다음에 '매출(한화)' 열 추가  
    insert, get_loc 함수를 조합해서 할 수 있다.

In [232]:
# Place the '매출(한화)' column immediately after 'Film'.
# insert() raises ValueError if the column already exists, so the membership
# check keeps this cell safe to run more than once.
if '매출(한화)' not in movies.columns:
    film_position = movies.columns.get_loc('Film')
    movies.insert(film_position + 1, '매출(한화)', converted_gross)

In [233]:
# Check where the new column ended up.
movies.head()

Unnamed: 0,Film,매출(한화),Genre,Year,has_seen
0,Zack and Miri Make a Porno,58388.868,Romance,2008,67
1,Youth in Revolt,27314.964,Comedy,2010,60
2,You Will Meet a Tall Dark Stranger,37116.052,Comedy,2010,39
3,When in Rome,59920.288,Comedy,2010,29
4,What Happens in Vegas,305406.914,Comedy,2008,50


* 문제 풀어봐요  
'매출(한화)' 열을 movies 데이터 프레임 제일 마지막 열에 추가하세요.

In [234]:
# Reset for the exercise by removing the '매출(한화)' column from `movies` in place.
# errors='ignore' keeps the cell re-runnable: without it, drop() raises KeyError
# when the column has already been removed.
movies.drop(columns=['매출(한화)'], inplace=True, errors='ignore')
movies

Unnamed: 0,Film,Genre,Year,has_seen
0,Zack and Miri Make a Porno,Romance,2008,67
1,Youth in Revolt,Comedy,2010,60
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39
3,When in Rome,Comedy,2010,29
4,What Happens in Vegas,Comedy,2008,50
...,...,...,...,...
72,Across the Universe,romance,2007,69
73,A Serious Man,Drama,2009,76
74,A Dangerous Method,Drama,2011,84
75,27 Dresses,Comedy,2008,55


In [245]:
# Append '매출(한화)' as the last column.
# len(movies.columns) is always the position just past the final column, so this
# does not rely on 'has_seen' happening to be last.
# insert() raises ValueError when the column already exists; only that error is
# caught so unrelated failures are not reported as a duplicate column.
try:
    movies.insert(len(movies.columns), '매출(한화)', converted_gross)
except ValueError:
    print('해당열이 이미 존재합니다.')
movies.head(3)

해당열이 이미 존재합니다.


Unnamed: 0,Film,Genre,Year,has_seen,매출(한화)
0,Zack and Miri Make a Porno,Romance,2008,67,58388.868
1,Youth in Revolt,Comedy,2010,60,27314.964
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39,37116.052


# 특정 원소 변경

### 조회

* 특정 인덱스, 특정 열의 데이터 조회

In [246]:
# Read a single cell: row label 74, column 'has_seen'.
movies.loc[74, 'has_seen']

np.int64(84)

### 변경

In [247]:
# Overwrite that single cell in place.
movies.loc[74, 'has_seen'] = 98

In [248]:
# Read the cell back to confirm the assignment.
movies.loc[74, 'has_seen']

np.int64(98)

In [249]:
# Show the first three rows of the frame.
movies.head(3)

Unnamed: 0,Film,Genre,Year,has_seen,매출(한화)
0,Zack and Miri Make a Porno,Romance,2008,67,58388.868
1,Youth in Revolt,Comedy,2010,60,27314.964
2,You Will Meet a Tall Dark Stranger,Comedy,2010,39,37116.052


# 데이터 삭제

## 데이터 프레임에서 삭제

In [250]:
# Load a fresh copy of the CSV to use for the deletion examples.
movies_del = pd.read_csv('data/movies.csv')
movies_del.head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


## 열 삭제

* 삭제방법1 (axis 인자 활용)

In [254]:
# Column removal via a positional label plus axis=1; the result is displayed only, not assigned.
movies_del.drop('Rotten Tomatoes %', axis=1).head(1)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008


In [252]:
# Display movies_del itself to compare with the drop() result.
movies_del.head(1)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008


* 삭제방법2 (columns 인자 활용)

In [253]:
# Same column removal expressed with the columns= keyword instead of axis=1.
movies_del.drop(columns = ['Rotten Tomatoes %']).head(1)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008


* 삭제본 만드는 법: copy본 만들기

In [265]:
# Bind the result of drop() to a new name.
columns_to_remove = ['Rotten Tomatoes %']
movies_del_copy = movies_del.drop(columns=columns_to_remove)
movies_del_copy.head(1)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008


* 삭제본 만드는 법: 자기 자신 변경

In [273]:
# Remove the column from movies_del itself (inplace=True).
# drop() raises KeyError when the label is missing, which happens if this cell
# is run again after the column is gone; only that error is caught so other
# failures are not hidden behind the "column does not exist" message.
try:
    movies_del.drop(columns=['Rotten Tomatoes %'], inplace=True)
except KeyError:
    print("해당열이 존재하지 않아 삭제할 수 없습니다.")
movies_del.head(1)

해당열이 존재하지 않아 삭제할 수 없습니다.


Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008


### 행삭제

* 행인덱스 2,3,4의 멀티행 삭제

* 방법1: index인자 활용

In [279]:
# Remove the rows labelled 2, 3 and 4 using the index= keyword; the result is displayed only.
movies_del.drop(index = [2,3,4]).head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
5,Water For Elephants,Drama,20th Century Fox,72,3.081421,$117.09,2011
6,WALL-E,Animation,Disney,89,2.896019,$521.28,2008
7,Waitress,Romance,Independent,67,11.089742,$22.18,2007


* 방법2: axis 인자 활용

In [277]:
# Same row removal expressed with positional labels plus axis=0.
movies_del.drop([2,3,4], axis=0).head()

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
5,Water For Elephants,Drama,20th Century Fox,72,3.081421,$117.09,2011
6,WALL-E,Animation,Disney,89,2.896019,$521.28,2008
7,Waitress,Romance,Independent,67,11.089742,$22.18,2007


* 슬라이스 적용  
행인덱스 10~19행 데이터 삭제 (슬라이스 10:20은 끝 위치 20을 포함하지 않는다)

In [278]:
# Remove a contiguous run of rows by slicing the frame's own index.
# The labels are taken from movies_del.index (not from another frame's index), so
# they always exist in the frame being modified. The slice 10:20 covers positions
# 10 through 19. axis is omitted because index= already identifies the row axis.
movies_del.drop(index=movies_del.index[10:20]).head(20)

Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,$219.37,2008
5,Water For Elephants,Drama,20th Century Fox,72,3.081421,$117.09,2011
6,WALL-E,Animation,Disney,89,2.896019,$521.28,2008
7,Waitress,Romance,Independent,67,11.089742,$22.18,2007
8,Waiting For Forever,Romance,Independent,53,0.005,$0.03,2011
9,Valentine's Day,Comedy,Warner Bros.,54,4.184038,$217.57,2010
