In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from tqdm import tqdm

In [None]:
def n_page(review_link, timeout=30):
    """Return the number of pages advertised by a paginated listing.

    The page at ``review_link`` is downloaded and the text of the last
    ``<span>`` inside the ``pagination-item-holder`` element is read as the
    page count.

    Args:
        review_link: URL of the first page of the listing.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The page count as an ``int``; ``1`` when the page has no pagination
        block, the block has no ``<span>``, or its last ``<span>`` does not
        contain an integer.
    """
    request_review = requests.get(review_link, timeout=timeout)
    soup_review = BeautifulSoup(request_review.content, "lxml")
    pagination = soup_review.find("div", {"class": "pagination-item-holder"})
    # No pagination block: treat the listing as a single page.
    if pagination is None:
        return 1
    spans = pagination.find_all("span")
    if not spans:
        return 1
    try:
        return int(spans[-1].get_text(strip=True))
    except ValueError:
        # The last span is not a number; fall back to a single page rather
        # than hiding unrelated errors behind a bare ``except``.
        return 1

In [None]:
def extract_links_from_page(y, i, timeout=30):
    """Collect the film links shown on one page of a yearly listing.

    Args:
        y: Year inserted into the listing URL.
        i: Page number inserted into the listing URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict mapping each film title (stripped anchor text) to its absolute
        URL. Anchors without an ``href`` attribute are skipped. If two films
        share a title, the later one overwrites the earlier one.
    """
    url = "http://www.allocine.fr/films/decennie-2010/annee-" + str(y) + "/?page=" + str(i)
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "lxml")
    header = "http://www.allocine.fr"
    Dict_films = {}
    # Single pass over the anchors: title and link come from the same tag, so
    # they cannot get out of step, and the parameter ``i`` is not shadowed.
    for anchor in soup.find_all("a", {"class": "meta-title-link"}):
        href = anchor.get("href")
        if href is None:
            continue
        # get_text() also works when the anchor wraps nested tags, where
        # ``.string`` would be None.
        title = anchor.get_text(strip=True)
        Dict_films[title] = header + href
    return Dict_films

In [None]:
def extract_movie_data(url, timeout=30):
    """Scrape the metadata of one film page.

    Args:
        url: URL of the film page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``film_id``, ``title``, ``author``, ``genre``
        (list of str), ``actors`` (list of str) and ``date``.

    Raises:
        AttributeError: If one of the expected elements is absent from the page.
        IndexError: If the page has fewer than three ``meta-body-item`` blocks.
    """
    request_film = requests.get(url, timeout=timeout)
    soup_film = BeautifulSoup(request_film.content, "lxml")

    # The ``data-seance-geoloc-redir`` attribute of <main> is used as the film identifier.
    main_tag = soup_film.find("main", {"class": "content-layout entity movie cf seance-geoloc-redir"})
    film_id = main_tag.get("data-seance-geoloc-redir")

    title = soup_film.find("div", {"class": "titlebar-title titlebar-title-lg"}).text.strip()

    direction = soup_film.find("div", {"class": "meta-body-direction"})
    author = direction.find("span", {"class": "blue-link"}).text.strip()

    # Keep only spans whose text contains neither "20" nor "/".
    # NOTE(review): presumably this drops the release date and separators — confirm.
    info_spans = soup_film.find("div", {"class": "meta-body-info"}).find_all("span")
    genre = [span.text for span in info_spans if "20" not in span.text and "/" not in span.text]

    # The cast is read from the third ``meta-body-item`` block of the page.
    cast_block = soup_film.find_all("div", {"class": "meta-body-item"})[2]
    actors = [span.text for span in cast_block.find_all("span") if span.text not in ("Avec", " plus ")]

    date = soup_film.find("span", {"class": "date"}).text.strip()

    return {'film_id': film_id, 'title': title, 'author': author, 'genre': genre, 'actors': actors, 'date': date}

In [None]:
def extract_reviews_data(url, timeout=30):
    """Scrape every review listed under a film's review URL.

    Args:
        url: URL of the first page of the film's reviews.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``film_id``, ``rank`` (0-based position
        among the collected reviews), ``comment`` and ``date``.

    Raises:
        AttributeError: If the page has no matching ``<main>`` element to read
            the film identifier from.
    """
    first_page = BeautifulSoup(requests.get(url, timeout=timeout).content, "lxml")
    main_tag = first_page.find("main", {"class": "content-layout entity movie cf seance-geoloc-redir"})
    film_id = main_tag.get("data-seance-geoloc-redir")

    n = n_page(url)
    reviews = []
    rank = 0
    for j in range(1, n + 1):
        request_reviews = requests.get(url + "?page=" + str(j), timeout=timeout)
        soup_reviews = BeautifulSoup(request_reviews.content, "lxml")
        for soup_comment in soup_reviews.find_all("div", {"class": "review-card"}):
            date = soup_comment.find("span", {"class": "review-card-meta-date"})
            comment = soup_comment.find("div", {"class": "review-card-content"})
            # Skip a card missing its date or content instead of raising and
            # losing every review already collected for this film.
            if date is None or comment is None:
                continue
            # Keep the last three space-separated tokens of the date text.
            # NOTE(review): presumably "day month year" — confirm against the page.
            review_date = (' ').join(date.text.strip().split(' ')[-3:])
            reviews.append({"film_id": film_id, "rank": rank, "comment": comment.text.strip(), "date": review_date})
            rank += 1
    return reviews

# Scraping

In [None]:
# Collect title -> URL for every film on every listing page of year ``y``.
y = 2019
films = {}
n = n_page("http://www.allocine.fr/films/decennie-2010/annee-"+str(y))
# tqdm takes the iterable as its first argument; pages are requested as 1..n.
for i in tqdm(range(1, n + 1)):
    films.update(extract_links_from_page(y, i))

100%|██████████| 5/5 [00:05<00:00,  1.10s/it]


In [None]:
# Scrape the metadata of every collected film into one DataFrame.
film_records = []
for film_title, film_url in tqdm(films.items()):
    try:
        film_records.append(extract_movie_data(film_url))
    except Exception as exc:
        # Best effort: a page that cannot be parsed is skipped, but reported,
        # so missing rows are visible. ``Exception`` (not a bare ``except``)
        # keeps Ctrl-C / kernel interrupt working.
        tqdm.write(f"Skipping {film_title!r} ({film_url}): {exc!r}")
film_data = pd.DataFrame(film_records)

100%|██████████| 60/60 [01:05<00:00,  1.09s/it]


In [None]:
# Preview the first rows of the scraped film metadata.
film_data.head()

Unnamed: 0,film_id,title,author,genre,actors,date
0,246819,Coma - Esprits prisonniers,Nikita Argunov,"[Science fiction, Action]","[Rinal Mukhametov, Lyubov Aksyonova, Milos Bik...",19 août 2020
1,272648,Into the Beat,Stefan Westerwelle,"[Famille, Drame]","[Alexandra Pfeifer, Yalany Marschner, Ina Gera...",16 avril 2021
2,275355,30 jours max,Tarek Boudali,"[Comédie, Policier]","[Tarek Boudali, Philippe Lacheau, Julien Arruti]",14 octobre 2020
3,268345,Joyeuse retraite !,Fabrice Bracq,[Comédie],"[Thierry Lhermitte, Michèle Laroque, Nicole Fe...",20 novembre 2019
4,255238,Parasite,Bong Joon Ho,[Thriller],"[Song Kang-Ho, Woo-sik Choi, Park So-Dam]",5 juin 2019


In [None]:
# Write the film metadata to film_data.csv without the DataFrame index column.
film_data.to_csv('film_data.csv', index=False)

In [None]:
# Scrape the reviews of every film in ``film_data`` into one DataFrame.
# Records are accumulated in a list and the frame is built once:
# ``DataFrame.append`` no longer exists in pandas >= 2.0, and growing a frame
# inside a loop copies it on every iteration.
review_records = []
for film_id in tqdm(film_data.film_id):
    reviews_url = f"https://www.allocine.fr/film/fichefilm-{film_id}/critiques/spectateurs/"
    try:
        review_records.extend(extract_reviews_data(reviews_url))
    except Exception as exc:
        # Best effort: a film whose reviews cannot be scraped is skipped, but
        # reported. ``Exception`` keeps Ctrl-C / kernel interrupt working.
        tqdm.write(f"Skipping reviews of film {film_id}: {exc!r}")
# The resulting index is a single 0..N-1 range over all reviews.
film_reviews = pd.DataFrame(review_records)

100%|██████████| 59/59 [15:47<00:00, 16.07s/it]


In [None]:
# Preview the first rows of the scraped reviews.
film_reviews.head()

Unnamed: 0,film_id,rank,comment,date
0,246819,0,Le film était un régal visuel pour tous les am...,20 août 2020
1,246819,1,Scénario innovant qui tient bien la route tout...,16 août 2020
2,246819,2,"Si le parallèle avec ""Inception"" vient assez r...",16 août 2020
3,246819,3,Un film qui aurait pu être pas mal ... et même...,31 août 2020
4,246819,4,La pauvreté de l'intrigue au récit laborieux e...,15 août 2020


In [None]:
# Write the reviews to film_reviews.csv without the DataFrame index column.
film_reviews.to_csv('film_reviews.csv', index=False)