In [38]:
import requests
from bs4 import BeautifulSoup
import bs4
import pandas as pd
import re
import datetime
from tqdm import tqdm

In [39]:
def requests_and_check(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server (default 30). Without a timeout
        ``requests.get`` may block forever on an unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed page, or ``None`` when the request or the parsing failed.
        A response with a status other than 200 is reported on stdout but is
        still parsed, so callers keep receiving a page object in that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure (DNS, refused connection, timeout, bad URL).
        print("ERROR: request to {} failed: {}".format(url, exc))
        return None

    if response.status_code != 200:
        print("WARNING: {} answered with HTTP status {}".format(url, response.status_code))

    try:
        return BeautifulSoup(response.content, 'lxml')
    except Exception as exc:
        # e.g. the 'lxml' parser is not installed.
        print("ERROR: could not parse {}: {}".format(url, exc))
        return None

def get_archives_info(url = 'https://www.annecy.org/about/archives'):
    """Scrape the archive index page into a table of (title, year, link).

    Every ``<li>`` found inside a ``<div class="clearfix">`` is expected to
    hold an anchor whose ``title`` attribute looks like ``"<label> <year>"``.
    The attribute is split at the first whitespace that is followed by a
    digit.

    Parameters
    ----------
    url : str
        Address of the archive index page.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``year`` and ``link``; empty when the page could
        not be fetched. ``year`` is ``None`` for an anchor whose title holds
        no digit-led part. List items without an anchor or without an
        ``href`` are skipped.
    """
    columns = ['title', 'year', 'link']
    results_page = requests_and_check(url)
    if results_page is None:
        return pd.DataFrame(columns=columns)

    # Collect plain dicts and build the frame once: appending to a DataFrame
    # row by row copies the whole frame on every iteration.
    records = []
    for block in results_page.find_all('div', class_='clearfix'):
        for item in block.find_all('li'):
            anchor = item.find('a')
            if anchor is None or anchor.get('href') is None:
                continue
            title_string = anchor.get('title') or ''
            parts = re.split(r"\s(?=[0-9])", title_string, maxsplit=1)
            records.append({
                'title': parts[0].strip(),
                'year': parts[1].strip() if len(parts) > 1 else None,
                'link': anchor.get('href').strip(),
            })
    return pd.DataFrame(records, columns=columns)
# Scrape the archive index with the default URL. The result is kept in the
# notebook-global `df`, which a later cell filters on title == 'Official selection'.
df = get_archives_info()
# Bare last expression: the notebook renders the frame as the cell output.
df 

Unnamed: 0,title,year,link
0,Check out the archives,2021,https://www.annecy.org/about/archives/2021
1,Check out the archives,2020,https://www.annecy.org/about/archives/2020
2,Check out the archives,2019,https://www.annecy.org/about/archives/2019
3,Check out the archives,2018,https://www.annecy.org/about/archives/2018
4,Check out the archives,2017,https://www.annecy.org/about/archives/2017
...,...,...,...
93,Official selection,1967,https://www.annecy.org/about/archives/1967/off...
94,Official selection,1965,https://www.annecy.org/about/archives/1965/off...
95,Official selection,1963,https://www.annecy.org/about/archives/1963/off...
96,Official selection,1962,https://www.annecy.org/about/archives/1962/off...


In [40]:
def get_official_selection_df_year(
    year_url = 'https://www.annecy.org/about/archives/2021/official-selection',
    year = '2021'):
    """List the selection categories linked from one year's official-selection page.

    Parameters
    ----------
    year_url : str
        Address of the official-selection page of a given year.
    year : str
        Value stored in the ``year`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` (always ``'Official selection'``), ``year``,
        ``selection`` (anchor text) and ``link`` (anchor href), one row per
        anchor found in ``<div class="grd-cat__list">``. Empty when the page
        could not be fetched or the container is missing.
    """
    columns = ['title', 'year', 'selection', 'link']
    title = 'Official selection'

    results_page = requests_and_check(year_url)
    if results_page is None:
        return pd.DataFrame(columns=columns)

    container = results_page.find('div', class_='grd-cat__list')
    if container is None:
        # Page layout differs from what is expected: nothing to list.
        return pd.DataFrame(columns=columns)

    records = []
    for anchor in container.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        records.append({
            'title': title,
            'year': year,
            'selection': anchor.get_text().strip(),
            'link': href.strip(),
        })
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=columns)
# Demo call with the default arguments (2021 official-selection page); the
# returned frame is rendered as the cell output.
get_official_selection_df_year()

Unnamed: 0,title,year,selection,link
0,Official selection,2021,Official Feature Films,https://www.annecy.org/about/archives/2021/off...
1,Official selection,2021,Feature Films Contrechamp,https://www.annecy.org/about/archives/2021/off...
2,Official selection,2021,Official Short Films,https://www.annecy.org/about/archives/2021/off...
3,Official selection,2021,Off-Limits Short Films,https://www.annecy.org/about/archives/2021/off...
4,Official selection,2021,Perspectives Short Films,https://www.annecy.org/about/archives/2021/off...
5,Official selection,2021,Young Audiences Short Films,https://www.annecy.org/about/archives/2021/off...
6,Official selection,2021,Graduation Short Films,https://www.annecy.org/about/archives/2021/off...
7,Official selection,2021,TV Films,https://www.annecy.org/about/archives/2021/off...
8,Official selection,2021,Commissioned Films,https://www.annecy.org/about/archives/2021/off...
9,Official selection,2021,VR Works,https://www.annecy.org/about/archives/2021/off...


In [41]:
def _award_tag_value(item, tag_name, attribute=None):
    """Return the text (or the ``attribute`` value) of the first ``tag_name`` inside ``item``, or None if absent."""
    tag = item.find(tag_name)
    if tag is None:
        return None
    return tag.get_text() if attribute is None else tag.get(attribute)


def _award_records(department, listing, award_tag, film_tag):
    """Turn every ``<li>`` of ``listing`` into one award record for ``department``."""
    return [
        {
            'department': department,
            'award': _award_tag_value(entry, award_tag),
            'film': _award_tag_value(entry, film_tag),
            'img_link': _award_tag_value(entry, 'img', 'src'),
            'film_link': _award_tag_value(entry, 'a', 'href'),
        }
        for entry in listing.find_all('li')
    ]


def get_awards_df_year(year_url = 'https://www.annecy.org/about/archives/2021/award-winners',
                       year = '2021'):
    """Scrape the award winners of one festival year.

    Two page layouts are handled, selected by ``int(year)``:

    * 2011 and later: the direct child tags of ``<div id="palmares">`` are
      read as alternating (department heading, film list) pairs; award and
      film names come from ``<h2>`` and ``<h4>``.
    * before 2011: inside ``<div class="blc p_com">`` the
      ``<div class="palm_categ">`` headings are paired in order with the
      ``<ul>`` lists; award and film names come from ``<h6>`` and ``<h3>``.

    Parameters
    ----------
    year_url : str
        Address of the award-winners page.
    year : str or int
        Festival year; must be convertible with ``int``. Stored unchanged in
        the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` (always ``'Awards'``), ``year``, ``department``,
        ``award``, ``film``, ``img_link``, ``film_link``. Empty when the page
        could not be fetched or the expected container is missing. A field
        whose tag is missing from an entry is ``None``.
    """
    title = 'Awards'
    columns = ['title', 'year', 'department', 'award', 'film', 'img_link', 'film_link']

    results_page = requests_and_check(year_url)
    if results_page is None:
        return pd.DataFrame(columns=columns)

    records = []
    if int(year) >= 2011:
        container = results_page.find('div', {'id': 'palmares'})
        # Keep only element children; text nodes between the tags are skipped.
        children = [] if container is None else [
            child for child in container.children
            if isinstance(child, bs4.element.Tag)
        ]
        # Even positions hold the headings, odd positions the matching lists.
        for heading, listing in zip(children[0::2], children[1::2]):
            records.extend(
                _award_records(heading.get_text().strip(), listing, 'h2', 'h4'))
    else:
        container = results_page.find('div', class_='blc p_com')
        if container is not None:
            headings = container.find_all('div', class_="palm_categ")
            listings = container.find_all('ul')
            for heading, listing in zip(headings, listings):
                records.extend(
                    _award_records(heading.get_text().strip(), listing, 'h6', 'h3'))

    # Build the frame once instead of appending row by row.
    df = pd.DataFrame(records, columns=columns)
    df['year'] = year
    df['title'] = title
    return df

def get_award_df(award_title = 'Découvrez le Palmarés'):
    """Collect the award winners of every archived year into one table.

    The archive index is scraped with ``get_archives_info`` and filtered on
    its ``title`` column; each remaining (year, link) pair is passed to
    ``get_awards_df_year``.

    Parameters
    ----------
    award_title : str, optional
        Exact archive title that identifies the award pages. The default is
        the value the notebook originally hard-coded.
        NOTE(review): the archive listing recorded in this notebook shows
        English titles ('Check out the archives', 'Official selection'), so
        the French default may match no row -- confirm the title used by the
        site and pass it explicitly if needed.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``year``, ``department``, ``award``, ``film``,
        ``img_link``, ``film_link``; empty when no archive row matches.
    """
    columns = ['title', 'year', 'department', 'award', 'film', 'img_link', 'film_link']

    archives = get_archives_info()
    award_pages = archives[archives.title == award_title]

    # Start from an empty, correctly-shaped frame so the result always has
    # the expected columns, then concatenate once instead of once per year.
    frames = [pd.DataFrame(columns=columns)]
    for year, year_url in zip(award_pages.year.tolist(), award_pages.link.tolist()):
        frames.append(get_awards_df_year(year_url=year_url, year=year))
    return pd.concat(frames, ignore_index=True)

In [42]:
def get_film_info(url = 'https://www.annecy.org/about/archives/2021/official-selection/film-index:film-20211299'):
    """Scrape the "label: value" facts of one film page into a one-row table.

    Every ``<p>`` inside the ``<div class="sous-blc_content">`` sections of
    ``<div class="blc_identite">`` is split at its first colon; the left part
    becomes the column name and the right part the value. Paragraphs without
    a colon are ignored. A label that occurs twice keeps its last value.

    Parameters
    ----------
    url : str
        Address of the film page.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with one column per label found. A completely
        empty frame when the page could not be fetched or has no
        ``blc_identite`` block.
    """
    results_page = requests_and_check(url)
    if results_page is None:
        return pd.DataFrame()

    identity = results_page.find("div", class_='blc_identite')
    if identity is None:
        return pd.DataFrame()

    film_fields = {}
    for section in identity.find_all('div', class_="sous-blc_content"):
        for paragraph in section.find_all('p'):
            label, separator, value = paragraph.get_text().partition(":")
            if not separator:
                # No "label: value" structure in this paragraph.
                continue
            film_fields[label.strip()] = value.strip()

    return pd.DataFrame(film_fields, index=[0])
# Demo call on the default film page; the one-row frame is rendered as the
# cell output.
get_film_info()

Unnamed: 0,Original title,Directed by,Country,Year of production,Running time,Category,Techniques used,Version,Process,Target public,Production,Distribution,Script,Animation,Compositing,Music,Editing,Voice
0,Affairs of the Art,Joanna QUINN,"United Kingdom, Canada",2021,16 min,Short film,drawing on paper,Original English version French subtitles,Colour,"Teens, Adults, Young adults","BERYL PRODUCTIONS INTERNATIONAL LTD., Les MILL...","ONF - OFFICE NATIONAL DU FILM DU CANADA, Élise...",Les Mills,"Joanna Quinn, James Nutting","Mia Rose Goddard, Fran Breslin",Benjamin Talbott,Mia Rose Goddard,"Menna Trussler, Brendan Charleson, Joanna Quin..."


In [43]:
def get_dept_in_official_selection_year(year, url):
    """Scrape every film of every department linked from a selection page.

    For each anchor inside the first ``<div class="grd-cat__item">`` of the
    page at ``url`` the department page is fetched; for each ``<li>`` of its
    ``<ul class="liste_films">`` the film page is scraped with
    ``get_film_info``. One HTTP request is issued per department and per
    film, so a full year is slow.

    NOTE(review): ``get_official_selection_df_year`` reads the anchors from
    ``grd-cat__list`` whereas this function uses ``grd-cat__item`` -- confirm
    which container is intended.

    Parameters
    ----------
    year : str
        Value written into the ``year`` column of every row.
    url : str
        Address of the official-selection page of that year.

    Returns
    -------
    pandas.DataFrame
        One row per film: ``year``, the columns produced by
        ``get_film_info`` and ``department`` (anchor text of the department).
        A frame with only an empty ``year`` column when the page could not be
        fetched or the container is missing. Departments without a film list
        and list items without an anchor are skipped.
    """
    # Seed with an empty frame so 'year' stays the first column and the
    # result is well-formed even when no film is found.
    frames = [pd.DataFrame(columns = ['year'])]

    results_page = requests_and_check(url)
    container = None if results_page is None else results_page.find('div', class_='grd-cat__item')
    if container is None:
        return frames[0]

    for dept in container.find_all('a'):
        department = dept.get_text().strip()
        dept_page = requests_and_check(dept.get('href'))
        film_list = None if dept_page is None else dept_page.find('ul', class_='liste_films')
        if film_list is None:
            continue
        for film in film_list.find_all('li'):
            anchor = film.find('a')
            if anchor is None:
                continue
            film_df = get_film_info(anchor.get('href'))
            if film_df is None:
                # Defensive: skip rather than abort the whole scrape.
                continue
            film_df['department'] = department
            frames.append(film_df)

    # Concatenate once instead of re-copying the growing frame per film.
    df = pd.concat(frames, ignore_index=True)
    df['year'] = year
    return df
    

In [None]:
%%time
for i in df[df.title=='Official selection'].link.tolist():
    try:
        get_dept_in_official_selection_year(year = 'test',url = i)
        print(i)
    except:
        print("ERROR!!!!!!!!!!!!!!!!!")
        print(i)
        break

https://www.annecy.org/about/archives/2021/official-selection


In [44]:
%%time
# Full scrape of the 2021 official selection. One HTTP request is made per
# department and per film; the recorded run took about 4 minutes of wall time.
get_dept_in_official_selection_year(year='2021', url = 'https://www.annecy.org/about/archives/2021/official-selection')

CPU times: user 23.5 s, sys: 815 ms, total: 24.3 s
Wall time: 4min 13s


Unnamed: 0,year,Original title,Directed by,Country,Year of production,Running time,Category,Techniques used,Version,Process,...,Compositing,Music,Sound,Editing,department,Voice,Sets,Based on,Camera,Other credits
0,2021,Flee,Jonas POHER RASMUSSEN,"Denmark, France, Norway, Sweden",2020,01 h 23 min,Feature film,"2D computer, live action",Original Danish / Russian / French / English s...,Colour,...,"Sylvain Lorent, Johanna Bessière, Florent Bon...",Uno Helmersson,"Fredrik Jonsäter, Edward Björner",Janus Billeskov Jansen,Official Feature Films,,,,,
1,2021,Hayop Ka! The Nimfa Dimaano Story,Avid LIONGOREN,Philippines,2020,01 h 13 min 31 s,Feature film,"cut-outs, 2D computer",Original Philippine version English / French s...,Colour,...,Rocketsheep Studio,Len Calvo,Wapak Studios,,Official Feature Films,"Angelica Panganiban, Robin Padilla, Sam Milby,...",,,,
2,2021,Jiang Ziya: The Legend of Deification,"Wei LI, Teng CHENG",China,2020,01 h 45 min 13 s,Feature film,2D/3D computer,Original Chinese version English / French subt...,Colour,...,Danfeng Li,Yinghua Huang,Danfeng Li,Xinyu Cheng,Official Feature Films,"Xi Zheng, Guangtao Jiang, Guanlin Ji, Ning Yang","Dongming Lu, Wenkang Li",,,
3,2021,Josee to Tora to Sakanatachi,Kotaro TAMURA,Japan,2020,01 h 37 min 08 s,Feature film,2D computer,Original Japanese version French subtitles,Colour,...,,Evan Call,Kazuhiro Wakabayashi,Kumiko Sakamoto,Official Feature Films,,,"""Josee, the Tiger and the Fish"", Seiko Tanabe",Go Kanbayasi,
4,2021,La Traversée,Florence MIAILHE,"Germany, France, Czech Republic",2020,01 h 24 min,Feature film,paint on glass,Original French version English subtitles,Colour,...,,Philipp Kümpel,,,Official Feature Films,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,2021,Majo Minarai o Sagashite,"Junichi SATO, Haruka KAMATANI",Japan,2020,01 h 31 min,Sneak preview,2D computer,Original Japanese version French subtitles,Colour,...,,"Takeshi Ike, Nozomi Inoue",,,Screening Events,"Aoi Morikawa, Rena Matsui, Kanako Momota(momoi...",,Izumi Todo,,
244,2021,Spirit Untamed,"Elaine BOGAN, Ennio TORRESAN",USA,2021,01 h 27 min 46 s,Sneak preview,cg animation,English version French subtitles,Colour,...,,Amie Doherty,,,Screening Events,"Isabela Merced, Marsai Martin, Mckenna Grace, ...",,,,
245,2021,Tomorrow's Leaves,Yoshiyuki MOMOSE,Japan,2021,08 min 30 s,Short film preview,,Without dialogue or commentary,Colour,...,,Takatsugu Muramatsu,,,Screening Events,,,,,
246,2021,Us Again,Zach PARRISH,USA,2021,06 min 46 s,Short film preview,,Without dialogue or commentary,Colour,...,,Pinar Toprak,,,Screening Events,,,,,


In [34]:
# Scratch check of the film-page parsing on a single page: collect the
# "label: value" paragraphs of the identity block into a dict and print it.
url = 'https://www.annecy.org/about/archives/2021/official-selection/film-index:film-20218214'
results_page = requests_and_check(url)

results = results_page.find("div",class_='blc_identite').find_all('div',class_="sous-blc_content")
new_dict = {}
for section in results:
    for paragraph in section.find_all('p'):
        label, separator, value = paragraph.get_text().partition(":")
        # Paragraphs without a colon carry no label/value pair: skip them.
        if separator:
            new_dict[label.strip()] = value.strip()
print(new_dict)

{'Original title': 'Ascenders', 'Directed by': 'Jonathan ASTRUC', 'Country': 'France', 'Running time': '35 min', 'Category': 'WIP XR', 'Techniques used': 'Virtual reality', 'Process': 'Colour', 'Target public': 'Young adults, Adults', 'Production': 'BACKLIGHT'}
