# Import thư viện 

In [1]:
import numpy as np 
import pandas as pd 
import tmdbv3api as tmdb 

In [2]:
from tmdbv3api import TMDb, Movie, Search
from os import getenv
# Shared TMDb configuration object. NOTE: this rebinds the name `tmdb`, which
# the first cell bound to the tmdbv3api module itself (`import tmdbv3api as tmdb`).
tmdb = TMDb()

# The API key is read from the environment rather than hardcoded in the
# notebook; os.getenv returns None when TMDB_API_KEY is not set.
tmdb.api_key = getenv('TMDB_API_KEY')
# Module-level search client; get_tmdb_id() uses it for its title lookups.
search = Search()

# New form

In [3]:
from typing import List

"""
- Hàm lấy tên, ngày film từ wiki

"""
def get_films_on_wiki(year: int, prelink: str, title_label: str):
    """Fetch one year's film release table from a Wikipedia list page.

    The page at ``prelink + str(year)`` is parsed with ``pd.read_html`` and the
    tables at positions 2-5 of the parsed list are stacked into one frame.

    Args:
        year: Year appended to ``prelink`` to build the page URL.
        prelink: URL of the list page up to, but not including, the year.
        title_label: Header of the column that holds the film title.

    Returns:
        A DataFrame with the columns ``Month`` (taken from ``Opening``),
        ``English title`` (taken from ``title_label``) and ``Day`` (taken from
        ``Opening.1``); rows with a missing value in any of them are dropped.
        ``None`` when the page cannot be read, parses to fewer than six
        tables, or does not contain the expected columns, so that callers can
        skip the year instead of aborting a multi-year run.
    """
    link = prelink + str(year)

    try:
        data = pd.read_html(link)
    except (ValueError, OSError) as exc:
        # ValueError: no table found in the page.
        # OSError: base class of urllib's HTTPError / URLError (missing page, no network).
        print(f'Skipping {year}: cannot read {link} ({exc})')
        return None

    if len(data) < 6:
        return None
    # presumably tables 2-5 are the per-quarter release tables; verify per year
    sub_df = pd.concat([data[2], data[3], data[4], data[5]])

    wanted = ['Opening', title_label, 'Opening.1']
    missing = [label for label in wanted if label not in sub_df.columns]
    if missing:
        print(f'Skipping {year}: column(s) {missing} not found in {link}')
        return None

    # Explicit copy so later column assignments never act on a view of sub_df.
    res = sub_df[wanted].copy()
    res = res.dropna().rename(
        columns={'Opening': 'Month', title_label: 'English title', 'Opening.1': 'Day'}
    )
    return res


- Sửa lại Opening: lower_case + remove space + chuyển sang int 
- Kết hợp Opening.1 + Opening

In [4]:
# Lookup table: lower-case English month name -> calendar month number (1-12).
# The names are listed in calendar order and paired with 1..12 by position.
dct = dict(
    zip(
        (
            'january',
            'february',
            'march',
            'april',
            'may',
            'june',
            'july',
            'august',
            'september',
            'october',
            'november',
            'december',
        ),
        range(1, 13),
    )
)

def combine_day_with_month(x: tuple, year: int):
    """Build a ``"<year>-<month>-<day>"`` string from a (month, day) pair.

    Args:
        x: Any iterable whose first two items are the month and the day,
            e.g. a tuple, a list, or a DataFrame row (pandas Series).
        year: Year placed in front of the month and day.

    Returns:
        The three parts converted with ``str`` and joined by ``-``; no zero
        padding is applied.
    """
    # Unpack by iteration instead of x[0] / x[1]: integer keys on a
    # label-indexed Series only work through a deprecated positional fallback.
    month, day, *_ = x
    return "-".join(str(part) for part in (year, month, day))

def month_to_int(month: str):
    """Translate a month name into its number using the module-level ``dct``.

    Returns:
        The value stored under ``month`` in ``dct``, or ``None`` when
        ``month`` is not one of its keys.
    """
    return dct.get(month)

def released_handling(year: int, df: pd.DataFrame):
    """Turn the ``Month`` / ``Day`` columns of a scraped table into ``Released``.

    Month names are stripped of all whitespace, lower-cased and mapped to a
    number with ``month_to_int``; days are parsed as numbers. Each pair is then
    formatted with ``combine_day_with_month`` and parsed into a datetime.

    Rows that cannot produce a valid date (unknown month name, non-numeric
    day, or an impossible combination such as 30 February) are dropped
    instead of aborting the whole year.

    Args:
        year: Year shared by every row of ``df``.
        df: Frame with at least a ``Month`` and a ``Day`` column. It is left
            unmodified.

    Returns:
        A new DataFrame with the remaining columns of ``df`` followed by a
        datetime ``Released`` column; surviving rows keep their row labels.
    """
    month_numbers = (
        df['Month']
        .astype(str)
        .str.replace(r'\s+', '', regex=True)  # any whitespace, not only ' '
        .str.lower()
        .map(month_to_int)
    )
    day_numbers = pd.to_numeric(df['Day'], errors='coerce')

    # Work with positional masks: the row labels may repeat, so label-based
    # alignment is deliberately avoided.
    usable = month_numbers.notna().to_numpy() & day_numbers.notna().to_numpy()
    date_strings = [
        combine_day_with_month((int(month), int(day)), year)
        for month, day in zip(month_numbers[usable], day_numbers[usable])
    ]
    # errors='coerce' turns impossible dates into NaT; they are filtered below.
    released = pd.to_datetime(date_strings, format='%Y-%m-%d', errors='coerce')

    result = df.loc[usable].drop(columns=['Month', 'Day']).assign(Released=released)
    # Return an independent copy so callers can add columns without warnings.
    return result.loc[result['Released'].notna().to_numpy()].copy()

In [5]:
def get_tmdb_id(title, release_date):
    """Look up the TMDB id of a film by title and exact release date.

    Searches TMDB for ``title`` restricted to the release year and returns the
    first hit whose ``release_date`` equals ``release_date`` formatted as
    ``YYYY-MM-DD``.

    Args:
        title: Film title passed to the module-level ``search`` client.
        release_date: Date-like object exposing ``.year`` and ``.strftime``
            (e.g. a pandas Timestamp), or a missing value.

    Returns:
        The ``id`` of the first matching result, or ``None`` when the date is
        missing (``None`` or ``NaT``) or no result has that exact date.
    """
    # pd.isna also catches NaT, which `is None` lets through and which has
    # neither a usable .year nor a working strftime.
    if pd.isna(release_date):
        return None

    wanted = release_date.strftime('%Y-%m-%d')  # loop-invariant, computed once
    movie = search.movies(title, release_year=release_date.year)

    for candidate in movie['results']:
        if candidate['release_date'] == wanted:
            return candidate['id']

    return None

In [6]:
def gather_data(country: str, year_start: int, year_end: int):
    """Scrape the Wikipedia film lists for a range of years and attach TMDB ids.

    For every year in ``year_start..year_end`` (inclusive) the list page
    ``List_of_<country>_films_of_<year>`` is fetched with
    ``get_films_on_wiki``, its dates are normalised with ``released_handling``
    and each film is looked up with ``get_tmdb_id``. Years for which
    ``get_films_on_wiki`` returns ``None`` are skipped.

    Args:
        country: Adjective used in the Wikipedia page title, e.g. "American".
        year_start: First year to scrape.
        year_end: Last year to scrape (inclusive).

    Returns:
        One DataFrame with the columns ``Released``, ``English title`` and
        ``TMDB_id`` (in that order, followed by any extra columns the helpers
        produced). ``TMDB_id`` is missing for films without a match. Row
        labels are those of the per-year frames and are not reset.
    """
    prelink = 'https://en.wikipedia.org/wiki/List_of_' + country + "_films_of_"
    columns = ['Released', 'English title', 'TMDB_id']

    yearly_frames = []
    for year in range(year_start, year_end + 1):
        print(f'Year: {year}')
        compo_df = get_films_on_wiki(year, prelink, "Title")

        if compo_df is None:
            continue
        compo_clean = released_handling(df=compo_df, year=year)

        # Iterate the two columns by name: avoids positional x[0] / x[1] on a
        # label-indexed row and also works when the frame has no rows.
        compo_clean['TMDB_id'] = [
            get_tmdb_id(title, released)
            for title, released in zip(compo_clean['English title'], compo_clean['Released'])
        ]
        yearly_frames.append(compo_clean)

    if not yearly_frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop.
    data_set = pd.concat(yearly_frames)
    extra = [label for label in data_set.columns if label not in columns]
    return data_set.reindex(columns=columns + extra)

In [None]:
# Scrape and match the 1970s. gather_data fetches one Wikipedia page per year
# and runs one TMDB search per film, so this cell is network-bound.
t = gather_data("American", 1970, 1979)

In [17]:
# Drop rows with any missing value, in particular films for which
# get_tmdb_id returned None. Mutates `t` in place.
t.dropna(inplace=True)

In [18]:
# Cast the ids to int; this relies on the missing values having been dropped
# first, since a column containing NaN cannot be converted to int.
t['TMDB_id'] = t['TMDB_id'].astype(int)

In [19]:
# Display the matched films (pandas truncates the middle rows of long frames).
t

Unnamed: 0,Released,English title,TMDB_id
1,1970-01-02,Jenny,280133
3,1970-01-09,...tick...tick...tick...,85255
4,1970-01-14,Last of the Mobile Hot Shots,117999
5,1970-01-14,The Dunwich Horror,65891
7,1970-01-21,The Only Game in Town,84481
...,...,...,...
47,1979-12-21,Cuba,62001
48,1979-12-21,C.H.O.M.P.S.,77593
49,1979-12-21,The Electric Horseman,11145
50,1979-12-21,Scavenger Hunt,23050


In [21]:
# Give the frame a fresh 0..n-1 index. Because drop=True is not passed, the
# old row labels are kept as a new 'index' column.
t.reset_index(inplace=True)

In [22]:
# Persist the decade. to_csv writes the row index as an unnamed first column
# by default, in addition to the 'index' column created by reset_index.
t.to_csv('raw_usa_1970_1979.csv')

In [None]:
# Repeat the 1970s steps for every later decade: range(1980, 2021, 10) yields
# 1980, 1990, 2000, 2010 and 2020, and each pass covers the years i .. i + 9
# (so the last pass requests 2020-2029).
for i in range(1980, 2021, 10): 
    t = gather_data("American",i, i + 9)
    # Drop rows with any missing value, e.g. films without a TMDB match,
    # so that the id column can be cast to int.
    t.dropna(inplace=True)
    t['TMDB_id'] = t['TMDB_id'].astype(int)
    # Old row labels are kept as an 'index' column (drop=True is not passed).
    t.reset_index(inplace=True)
    # NOTE: this 'raw_usa_<decade>s.csv' pattern differs from the name used
    # for the 1970s file, 'raw_usa_1970_1979.csv'.
    path = 'raw_usa_' + str(i) + 's.csv'
    t.to_csv(path)