# Import needed library

In [55]:
import pandas as pd 
import numpy as np 
import tmdbv3api as tmdb 

# Data Gathering
- Dữ liệu thu thập tới từ list film của Anh, Mỹ, Nhật, Hàn, Trung quốc, Việt Nam, Nga, Pháp, Đức
- Tên phim theo các năm được lấy từ wikipedia, dữ liệu từ APi tmdb
- Thời gian từ 2020s - 1990s 

In [56]:
from tmdbv3api import TMDb, Movie, Search
from os import getenv
tmdb = TMDb()

tmdb.api_key = getenv('TMDB_API_KEY')
search = Search()

## Korean 

- Dữ liệu dạng bảng từ 2007 - 2019 
- Dữ liệu từ 2007 - 2008: Cột en-title trùng với kor-title 

In [57]:
from typing import List
pre_link = 'https://en.wikipedia.org/wiki/List_of_South_Korean_films_of_'



def title_handle(lst: List): 
    res = ''
    for i in lst[:-1]: 
        res += i  + ' '

    return res
"""
- Hàm lấy dữ liệu film từ wiki : 
- Hàm trả về en-title + năm của phim 
- Đầu vào là năm  
"""

def get_films_on_wiki(year: int): 
    link = pre_link + str(year)

    page_tables = pd.read_html(link, header= None)

    sub_df = page_tables[2]
    
    # xử lý title của film
    if year == 2007 or year == 2008: 
        sub_df['English titles'] = sub_df['English/Korean Title'].str.split(r'([a-zA-Z]+)')
        sub_df['English title'] = sub_df['English titles'].apply(lambda x: title_handle(x))

    res = sub_df[['English title', 'Released']].copy()
    res.dropna(inplace=True)
    return res


    

In [62]:
dct = {
    'january' : 1, 
    'february' : 2,
    'march' : 3, 
    'april' :  4,
    'may' : 5, 
    'june' : 6, 
    'july' : 7, 
    'august' : 8, 
    'september' : 9, 
    'october' : 10, 
    'november' : 11, 
    'december' : 12
}
# thay đổi year 
def change_year_type(x,  year: int): 
    num_month = dct[x[1]]
    return str(year)  + '-' + str(num_month) + '-' + x[0]

# thay đổi cột released trong bảng 
def change_release_year(df: pd.DataFrame, year: int): 
    df['Released'] = df['Released'].str.lower()

    if year < 2017:  df[['Day', 'Month', 'Ex']] = df['Released'].str.extract(r'(\d+)\s+(\w+)(\[.*\])*')
    else : 
        df[['Month', 'Day', 'Ex']] = df['Released'].str.extract(r'(\w+)\s+(\d+)(\[.*\])*')

    df['Day'] = df['Day'].astype('str')
    df['Released'] = df[['Day', 'Month']].apply(lambda x: change_year_type(x, year), axis = 1)
    df['Released'] = pd.to_datetime(df['Released'])
    df.drop(['Day', 'Month', 'Ex'], axis = 1, inplace = True)
    
    return df

# thêm thuộc tính id vào trong table 
def get_tmdb_id(title: str, release_date): 
    t  = release_date.year
    movie = search.movies(title, release_year=t)

    for j in movie['results']: 
        if j['release_date'] == release_date.strftime('%Y-%m-%d'): 
            return j['id']
        
    return None



In [70]:
data_set = pd.DataFrame(columns=['English title', 'Released', 'TMDB_id'])


for i in range(2007, 2020): 
    print(f"Processing year {i}")
    raw_data = get_films_on_wiki(i)
    data_clean = change_release_year(raw_data, i)
    data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1) 
    data_set = pd.concat([data_set, data_clean])

data_set.head()


Processing year 2007


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)
  data_set = pd.concat([data_set, data_clean])


Processing year 2008


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2009


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2010


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2011


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2012


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2013


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2014


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2015


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2016


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2017


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2018


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Processing year 2019


  num_month = dct[x[1]]
  return str(year)  + '-' + str(num_month) + '-' + x[0]
  data_clean['TMDB_id'] = data_clean[['English title', 'Released']].apply(lambda x: get_tmdb_id(x[0],  x[1]), axis = 1)


Unnamed: 0,English title,Released,TMDB_id
0,Attack on the Pin - Up Boys,2007-07-26,75100.0
1,The Bank Attack,2007-11-14,482571.0
2,A Battle of Wits,2007-01-10,
3,Beautiful Sunday,2007-03-29,55753.0
4,Before the Summer Passes Away,2007-01-25,467278.0


In [72]:
data_set

Unnamed: 0,English title,Released,TMDB_id
0,Attack on the Pin - Up Boys,2007-07-26,75100.0
1,The Bank Attack,2007-11-14,482571.0
2,A Battle of Wits,2007-01-10,
3,Beautiful Sunday,2007-03-29,55753.0
4,Before the Summer Passes Away,2007-01-25,467278.0
...,...,...,...
63,Bring Me Home,2019-11-27,507773.0
64,Start-Up,2019-12-18,581530.0
65,Ashfall,2019-12-19,581387.0
66,The Haunted House: The Sky Goblin VS Jormungandr,2019-12-19,654747.0


In [71]:
check =  data_set.dropna(axis = 0)
check

Unnamed: 0,English title,Released,TMDB_id
0,Attack on the Pin - Up Boys,2007-07-26,75100.0
1,The Bank Attack,2007-11-14,482571.0
3,Beautiful Sunday,2007-03-29,55753.0
4,Before the Summer Passes Away,2007-01-25,467278.0
5,Beyond the Years,2007-04-12,155049.0
...,...,...,...
63,Bring Me Home,2019-11-27,507773.0
64,Start-Up,2019-12-18,581530.0
65,Ashfall,2019-12-19,581387.0
66,The Haunted House: The Sky Goblin VS Jormungandr,2019-12-19,654747.0


In [73]:
check['TMDB_id'] = check['TMDB_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  check['TMDB_id'] = check['TMDB_id'].astype(int)


In [74]:
check.head()

Unnamed: 0,English title,Released,TMDB_id
0,Attack on the Pin - Up Boys,2007-07-26,75100
1,The Bank Attack,2007-11-14,482571
3,Beautiful Sunday,2007-03-29,55753
4,Before the Summer Passes Away,2007-01-25,467278
5,Beyond the Years,2007-04-12,155049


In [76]:
check.reset_index(inplace=True)

In [77]:
check.to_csv('raw_koren_07_20.csv')