In [1]:
import requests
from bs4 import BeautifulSoup
import bs4
import pandas as pd
import re
import datetime
from tqdm import tqdm

In [2]:
def requests_and_check(url):
    response = requests.get(url)
    if not response.status_code == 200:
        print("YES")
    try:
        results_page = BeautifulSoup(response.content,'lxml')
        return results_page
    except:
        print("ERROR")
        return None

def get_archives_info(url = 'https://www.annecy.org/about/archives'):
    results_page = requests_and_check(url)

    df = pd.DataFrame(columns = ['title','year','link'])
    if results_page is None:
        return df
    
    for block in results_page.find_all('div',class_= 'clearfix'):
        year_list = block.find_all('li')
        for year in year_list:
            new_row = {}
            new_row['link'] = year.find('a').get('href').strip()
            title_string = year.find('a').get('title')
            new_row['title'] = re.split(r"\s(?=[0-9])", title_string,maxsplit=1)[0].strip()
            new_row['year'] = re.split(r"\s(?=[0-9])", title_string,maxsplit=1)[1].strip()
            df = df.append(new_row,ignore_index=True)
    return df
get_archives_info()

Unnamed: 0,title,year,link
0,Check out the archives,2021,https://www.annecy.org/about/archives/2021
1,Check out the archives,2020,https://www.annecy.org/about/archives/2020
2,Check out the archives,2019,https://www.annecy.org/about/archives/2019
3,Check out the archives,2018,https://www.annecy.org/about/archives/2018
4,Check out the archives,2017,https://www.annecy.org/about/archives/2017
...,...,...,...
93,Official selection,1967,https://www.annecy.org/about/archives/1967/off...
94,Official selection,1965,https://www.annecy.org/about/archives/1965/off...
95,Official selection,1963,https://www.annecy.org/about/archives/1963/off...
96,Official selection,1962,https://www.annecy.org/about/archives/1962/off...


In [3]:
def get_official_selection_df_year(
    year_url = 'https://www.annecy.org/about/archives/2021/official-selection',\
    year = '2021'):
    title = 'Official selection'
    results_page = requests_and_check(year_url)
    df = pd.DataFrame(columns = ['title','year','selection','link'])
    
    if results_page is None:
        return df
    
    for i in results_page.find('div',class_='grd-cat__list').find_all('a'):
        new_row = {}
        new_row['selection'] = i.get_text().strip()
        new_row['link'] = i.get('href').strip()
        df = df.append(new_row,ignore_index=True)
    df.title = title
    df.year = year
    return df
get_official_selection_df_year()

Unnamed: 0,title,year,selection,link
0,Official selection,2021,Official Feature Films,https://www.annecy.org/about/archives/2021/off...
1,Official selection,2021,Feature Films Contrechamp,https://www.annecy.org/about/archives/2021/off...
2,Official selection,2021,Official Short Films,https://www.annecy.org/about/archives/2021/off...
3,Official selection,2021,Off-Limits Short Films,https://www.annecy.org/about/archives/2021/off...
4,Official selection,2021,Perspectives Short Films,https://www.annecy.org/about/archives/2021/off...
5,Official selection,2021,Young Audiences Short Films,https://www.annecy.org/about/archives/2021/off...
6,Official selection,2021,Graduation Short Films,https://www.annecy.org/about/archives/2021/off...
7,Official selection,2021,TV Films,https://www.annecy.org/about/archives/2021/off...
8,Official selection,2021,Commissioned Films,https://www.annecy.org/about/archives/2021/off...
9,Official selection,2021,VR Works,https://www.annecy.org/about/archives/2021/off...


In [4]:
def get_awards_df_year(year_url = 'https://www.annecy.org/about/archives/2021/award-winners',\
                      year = '2021'):
    
    title = 'Awards'
    results_page = requests_and_check(year_url)
    df = pd.DataFrame(columns = ['title','year','department','award','film','img_link','film_link'])
    if results_page is None:
        return df
    
    if int(year) >= 2011:
        ls = []
        for i in results_page.find('div',{'id':'palmares'}):
            if type(i) == bs4.element.Tag:
                ls.append(i)
        for i, j in zip(ls[0::2],ls[1::2]):
            department = i.get_text().strip()
            for item in j.find_all('li'):
                item_dict = {}
                item_dict['department'] = department
                item_dict['award'] = item.find('h2').get_text()
                item_dict['film'] = item.find('h4').get_text()
                item_dict['img_link'] = item.find('img').get('src')
                item_dict['film_link'] = item.find('a').get('href')

                df = df.append(item_dict,ignore_index=True)
    else:
        depts = results_page.find('div',class_= 'blc p_com').find_all('div',class_="palm_categ")
        dept_films = results_page.find('div',class_= 'blc p_com').find_all('ul')

        for dept, films in zip(depts,dept_films):
            department = dept.get_text().strip()
            for film in films.find_all('li'):
                item_dict = {}
                item_dict['department'] = department
                item_dict['award'] = film.find('h6').get_text()
                item_dict['film'] = film.find('h3').get_text()
                item_dict['img_link'] = film.find('img').get('src')
                item_dict['film_link'] = film.find('a').get('href')
                
                df = df.append(item_dict,ignore_index=True)
            
    
    df.year = year
    df.title = title
    return df

def get_award_df():
    df = get_archives_info()
    df = df[df.title == 'Découvrez le Palmarés']
    years = df.year.tolist()
    year_urls = df.link.tolist()
    output = pd.DataFrame(columns = ['title','year','department','award','film','img_link','film_link'])
    for year, year_url in zip(years,year_urls): 
        output = pd.concat([output, get_awards_df_year(year_url=year_url,year=year)],ignore_index = True)
    return output
get_award_df()

Unnamed: 0,title,year,department,award,film,img_link,film_link
0,Awards,2021,Feature films,Cristal for a Feature Film,Flee,https://www.annecy.org/resources/images/m/2021...,https://www.annecy.org/about/archives/2021/awa...
1,Awards,2021,Feature films,Jury Award,Ma famille afghane,https://www.annecy.org/resources/images/m/2021...,https://www.annecy.org/about/archives/2021/awa...
2,Awards,2021,Feature films,Jury Distinction,La Traversée,https://www.annecy.org/resources/images/m/2021...,https://www.annecy.org/about/archives/2021/awa...
3,Awards,2021,Feature films,Gan Foundation Award for Distribution,Flee,https://www.annecy.org/resources/images/m/2021...,https://www.annecy.org/about/archives/2021/awa...
4,Awards,2021,Feature films,Contrechamp Award,Bob Cuspe - Nós Não Gostamos de Gente,https://www.annecy.org/resources/images/m/2021...,https://www.annecy.org/about/archives/2021/awa...
...,...,...,...,...,...,...,...
860,Awards,1960,Short films,Award for its humanitarian theme and graphical...,Pozor!,https://www.annecy.org/resources/images/t/6010...,https://www.annecy.org/about/archives/1960/awa...
861,Awards,1960,Short films,Award for aesthetic research\n,"Prélude pour voix, orchestre et caméra",https://www.annecy.org/resources/images/t/defa...,https://www.annecy.org/about/archives/1960/awa...
862,Awards,1960,Short films,Award for poetry and popular art\n,Vlyublennoe oblako,https://www.annecy.org/resources/images/t/defa...,https://www.annecy.org/about/archives/1960/awa...
863,Awards,1960,Short Films out of Competition,International Critics' Award\n,Moonbird,https://www.annecy.org/resources/images/t/6000...,https://www.annecy.org/about/archives/1960/awa...


In [5]:
def get_film_info(url='https://www.annecy.org/about/archives/2021/official-selection/film-index:film-20211299'):
    results_page = requests_and_check(url)
    if results_page is None:
        return pd.DataFrame()
    new_dict = {}

    results = results_page.find("div", class_='blc_identite').find_all('div', class_="sous-blc_content")

    for i in range(len(results)):
        for j in results[i].find_all('p'):
            lt = j.get_text().split(":", 1)
            ###TEST
            try:
                new_dict[lt[0].strip()] = lt[1].strip()
            except:
                continue
    try:
        new_dict['Overview'] = results_page.find('div', class_='accroche').get_text().strip()
    except:
        new_dict['Overview'] = ''
    if 'df' not in locals():
        df = pd.DataFrame(new_dict, index=[0])
    else:
        df = df.append(new_dict, ignore_index=True)
    return df


get_film_info()

Unnamed: 0,Original title,Directed by,Country,Year of production,Running time,Category,Techniques used,Version,Process,Target public,Production,Distribution,Script,Animation,Compositing,Music,Editing,Voice,Overview
0,Affairs of the Art,Joanna QUINN,"United Kingdom, Canada",2021,16 min,Short film,drawing on paper,Original English version French subtitles,Colour,"Teens, Adults, Young adults","BERYL PRODUCTIONS INTERNATIONAL LTD., Les MILL...","ONF - OFFICE NATIONAL DU FILM DU CANADA, Élise...",Les Mills,"Joanna Quinn, James Nutting","Mia Rose Goddard, Fran Breslin",Benjamin Talbott,Mia Rose Goddard,"Menna Trussler, Brendan Charleson, Joanna Quin...","Beryl's back in ""Affairs of the Art"", which sh..."


In [6]:
def get_dept_in_official_selection_year(year, url):
    df = pd.DataFrame(columns = ['year'])
    results_page = requests_and_check(url)
    for dept in results_page.find('div',class_= 'grd-cat__item').find_all('a'):
        department = dept.get_text().strip()
        dept_url = dept.get('href')
        dept_page = requests_and_check(dept_url)
        if dept_page.find('ul',class_='liste_films') is None:
            continue
        for film in dept_page.find('ul',class_='liste_films').find_all('li'):
            film_url = film.find('a').get('href')
            film_df = get_film_info(film_url)
            #test
            if film_df is None:
                return "YES"
            film_df['department'] = department
            df = pd.concat([df,film_df],ignore_index= True)

    df.year = year
    return df
    

In [12]:
%%time
for i in df[df.title=='Official selection'].link.tolist():
    try:
        get_dept_in_official_selection_year(year = 'test',url = i)
        print(i)
        break
    except:
        print("ERROR!!!!!!!!!!!!!!!!!")
        print(i)
        break

https://www.annecy.org/about/archives/2021/official-selection
CPU times: user 22.3 s, sys: 826 ms, total: 23.1 s
Wall time: 3min 59s


In [45]:
%%time
for i in df[df.title=='Official selection'].link.tolist():
    try:
        get_dept_in_official_selection_year(year = 'test',url = i)
        print(i)
    except:
        print("ERROR!!!!!!!!!!!!!!!!!")
        print(i)
        break

https://www.annecy.org/about/archives/2021/official-selection
https://www.annecy.org/about/archives/2020/official-selection
https://www.annecy.org/about/archives/2019/official-selection
https://www.annecy.org/about/archives/2018/official-selection
https://www.annecy.org/about/archives/2017/official-selection
https://www.annecy.org/about/archives/2016/official-selection
https://www.annecy.org/about/archives/2015/official-selection
https://www.annecy.org/about/archives/2014/official-selection
https://www.annecy.org/about/archives/2013/official-selection
https://www.annecy.org/about/archives/2012/official-selection
https://www.annecy.org/about/archives/2011/official-selection
https://www.annecy.org/about/archives/2010/official-selection
https://www.annecy.org/about/archives/2009/official-selection
https://www.annecy.org/about/archives/2008/official-selection
https://www.annecy.org/about/archives/2007/official-selection
https://www.annecy.org/about/archives/2006/official-selection
https://