# Importation des packages

In [1]:
from loky import ProcessPoolExecutor
import os
import requests
from bs4 import BeautifulSoup
import sys
import pandas as pd
import numpy as np
from datetime import datetime
import re
from itertools import chain
import time

# Définition de fonctions

### Les "mapper" pour récupérer les `id_titles`

In [3]:
def mapper_id_titles(url, timeout=30):
    """Return the title links found on one flixable.com listing page.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    timeout : float, optional
        Seconds `requests` waits for the server before giving up
        (default 30). Without it a stalled connection would block the
        calling worker indefinitely.

    Returns
    -------
    list
        The `href` attribute of every poster link matched by the CSS
        selector, in document order (empty when nothing matches).

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded (including on timeout).
    """
    response = requests.get(url, timeout=timeout)
    poster_links = BeautifulSoup(response.text).select(
        "html body main div#filterContainer.container.pt-2 div.row div.col-sm-6.col-lg-3.mb-3 div.poster-container a.poster-link")
    return [link.get('href') for link in poster_links]

In [4]:
# mapper_id_titles('https://fr.flixable.com/genre/films/?min-rating=0&min-year=1920&max-year=2019&order=title&page=1')

### Parallélisation pour récupérer les `id_titles`

In [5]:
def all_id_title(key):
    """Apply `mapper_id_titles` to every element of `key` in a process pool.

    `key` is an iterable of listing-page URLs. Each call yields a list of
    links; the per-page lists are flattened into a single list, in the
    order of `key`.
    """
    with ProcessPoolExecutor() as pool:
        links_per_page = pool.map(mapper_id_titles, key)
    # Results are consumed once the pool has been shut down.
    return list(chain.from_iterable(links_per_page))

### Les "mapper" sur le site `flixable.com`

In [6]:
def mapper_flixable(id_title, timeout=30):
    """Scrape one title page of fr.flixable.com.

    Parameters
    ----------
    id_title : str
        Path of the title, appended to "https://fr.flixable.com"
        (e.g. "/title/70301585/").
    timeout : float, optional
        Seconds `requests` waits for the server before giving up
        (default 30), so a stalled connection cannot block a worker forever.

    Returns
    -------
    tuple
        `(nom, lien_IMDb, casting, date_ajout)` where `lien_IMDb` is ""
        when the page has no IMDb rating link, and `casting` is the actor
        names joined with "|#|".

    Raises
    ------
    IndexError
        If the title heading or the date element is missing from the page.
    requests.exceptions.RequestException
        If the page cannot be downloaded (including on timeout).
    """
    flixable = "https://fr.flixable.com" + id_title
    response = requests.get(flixable, timeout=timeout)

    # Parse the document once and reuse the tree for every selector.
    soup = BeautifulSoup(response.text)

    # The title is the second line of the heading text.
    nom = (soup
           .select("html body main div.container.mt-4 div.row.mb-3 div.col-12 h1.mb-3")[0]
           .text.split("\n")[1].strip())

    imdb_links = soup.select("div[class='col-md-8 mb-3'] span[class=imdbRatingPlugin] a")
    lien_IMDb = imdb_links[0]['href'] if imdb_links else ""

    anchors = soup.select("html body main div.container.mt-4 div.row.mb-3 div.col-md-8.mb-3 div.mb-2 span a")
    # Keep only the anchors whose markup mentions "/actor/", then pull the
    # text sitting right before the closing </a>.
    actor_markup = ["".join(re.findall(r'.*/actor/.*', str(anchor))) for anchor in anchors]
    actor_markup = [markup for markup in actor_markup if markup != ""]
    casting = "|#|".join("".join(re.findall(r'.*>(.*)</a>', markup)) for markup in actor_markup)

    date_ajout = soup.select("div.mb-4 > span:nth-child(2)")[0].text

    return (nom, lien_IMDb, casting, date_ajout)

In [7]:
# mapper_flixable("/title/70301585/")

### Parallélisation pour récupérer les données sur le site `flixable.com`

In [8]:
def all_flixable(key):
    """Apply `mapper_flixable` to every element of `key` in a process pool.

    Returns the list of tuples produced by the mapper, in the order of `key`.
    """
    with ProcessPoolExecutor() as pool:
        fiches = pool.map(mapper_flixable, key)
    # Results are consumed once the pool has been shut down.
    return [*fiches]

### Les "mapper" sur le site `netflix.com/fr`

In [9]:
def _clean_netflix_label(text):
    """Trim whitespace and commas around a label and replace non-breaking spaces."""
    return text.strip().strip(',').replace('\xa0', ' ')


def _relative_netflix_path(href):
    """Drop the surrounding slashes and the leading "fr" locale segment of a link.

    `str.strip('/fr')` removes any run of the characters '/', 'f' and 'r' from
    both ends (so "/fr/films/1" would become "ilms/1"); here only the actual
    prefix is removed.
    """
    path = href.strip('/')
    if path == 'fr':
        return ''
    if path.startswith('fr/'):
        path = path[len('fr/'):].lstrip('/')
    return path


def mapper_netflix(id_title, timeout=30):
    """Scrape genres, tags and related titles from one netflix.com/fr page.

    Parameters
    ----------
    id_title : str
        Path of the title, appended to "https://www.netflix.com/fr"
        (e.g. "/title/70301585/").
    timeout : float, optional
        Seconds `requests` waits for the server before giving up
        (default 30), so a stalled connection cannot hang the scrape.

    Returns
    -------
    tuple
        `(genre, tag, relative)`, each a "|#|"-joined string:
        de-duplicated genre labels in page order, tag labels ("" when the
        details section has no "Ce programme/film est..." sentence), and the
        related-title links without their "/fr/" prefix.

    Raises
    ------
    IndexError
        If the page has no "#section-more-details" element.
    requests.exceptions.RequestException
        If the page cannot be downloaded (including on timeout).
    """
    netflix = "https://www.netflix.com/fr" + id_title
    response = requests.get(netflix, timeout=timeout)

    # Parse the document once and reuse the tree for every selector.
    soup = BeautifulSoup(response.text)

    genre1 = [_clean_netflix_label(node.text)
              for node in soup.select("a.title-info-metadata-item")]
    genre2 = [_clean_netflix_label(node.text)
              for node in soup.select(".cell-genres > div:nth-child(2) > span")]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set would reorder it arbitrarily).
    genre = "|#|".join(dict.fromkeys(genre1 + genre2))

    details_text = soup.select("#section-more-details")[0].text
    if not re.search(r'.*Ce (?:programme|film) est\.\.\..*', details_text):
        tag = ""
    else:
        tag_nodes = soup.select("div.more-details-cell div span[class$=tag]")
        tag = "|#|".join([_clean_netflix_label(node.text) for node in tag_nodes])

    related_links = soup.select(".link-container a")
    relative = "|#|".join([_relative_netflix_path(link['href']) for link in related_links])

    return (genre, tag, relative)

In [10]:
# mapper_netflix("/title/70301585/")

### Parallélisation pour récupérer les données sur `netflix.com/fr`

In [11]:
def all_netflix(key):
    """Apply `mapper_netflix` to every element of `key` in a process pool.

    Returns the list of tuples produced by the mapper, in the order of `key`.
    """
    with ProcessPoolExecutor() as pool:
        fiches = pool.map(mapper_netflix, key)
    # Results are consumed once the pool has been shut down.
    return [*fiches]

### Les "mapper" sur le site `imdb.com`

In [12]:
def mapper_rating_IMDb(id_IMDb, timeout=30):
    """Fetch the IMDb rating displayed on a title page.

    Parameters
    ----------
    id_IMDb : str
        Full URL of the IMDb page, or "" when the title has no IMDb link.
    timeout : float, optional
        Seconds `requests` waits for the server before giving up
        (default 30), so a stalled connection cannot block a worker forever.

    Returns
    -------
    float or str
        The rating as a float, or "" when `id_IMDb` is empty or the page
        shows no rating element.

    Raises
    ------
    ValueError
        If the rating text cannot be converted to a float.
    requests.exceptions.RequestException
        If the page cannot be downloaded (including on timeout).
    """
    # No link means nothing to download.
    if id_IMDb == "":
        return ""

    response = requests.get(id_IMDb, timeout=timeout)
    rating_nodes = (BeautifulSoup(response.text)
                    .select(".ratingValue > strong:nth-child(1) > span:nth-child(1)"))
    if not rating_nodes:
        return ""
    return float(rating_nodes[0].text)

In [13]:
# mapper_rating_IMDb("https://www.imdb.com/title/tt3656200/?ref_=plg_rt_1")

### Parallélisation pour récupérer les données sur `imdb.com`

In [14]:
def all_rating_IMDb(key):
    """Apply `mapper_rating_IMDb` to every element of `key` in a process pool.

    Returns the list of values produced by the mapper, in the order of `key`.
    """
    with ProcessPoolExecutor() as pool:
        notes = pool.map(mapper_rating_IMDb, key)
    # Results are consumed once the pool has been shut down.
    return [*notes]

# Récupération des données

### Récupération du nombre de pages sur le site `flixable.com`

In [15]:
# Listing URL used to read the pagination widget. Its filters must be the
# ones of the pages scraped afterwards (max-year=2019); counting pages with
# a different year range could give a page count that does not match.
url = "https://fr.flixable.com/genre/{}/?min-rating=0&min-year=1920&max-year=2019&order=title#filterContainer"
types = ["films", "series-tv"]
number_of_pages = {}

for elem in types:
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url.format(elem), timeout=30)
    # The 7th pagination item is read as the last page number; this raises
    # IndexError if the pagination layout of the page differs.
    last_page_link = BeautifulSoup(response.text).select("li.page-item:nth-child(7) > a:nth-child(1)")[0]
    number_of_pages[elem] = int(last_page_link.get_text())

number_of_pages

{'films': 64, 'series-tv': 36}

### Récupération des urls du site `flixable.com`

In [16]:
# Listing-page template: the genre goes into the placeholder and the page
# number is appended after "page=".
url = "https://fr.flixable.com/genre/{}/?min-rating=0&min-year=1920&max-year=2019&order=title&page="
urls = {}

for elem in types:
    # Pages are numbered from 1 up to the count read for this media type.
    page_numbers = range(1, number_of_pages[elem] + 1)
    urls[elem] = [url.format(elem) + str(page) for page in page_numbers]

In [17]:
# Preview the first two listing URLs built for each media type.
for media_type, page_urls in urls.items():
    apercu = page_urls[:2]
    print("{} : {}\n".format(media_type, apercu))

films : ['https://fr.flixable.com/genre/films/?min-rating=0&min-year=1920&max-year=2019&order=title&page=1', 'https://fr.flixable.com/genre/films/?min-rating=0&min-year=1920&max-year=2019&order=title&page=2']

series-tv : ['https://fr.flixable.com/genre/series-tv/?min-rating=0&min-year=1920&max-year=2019&order=title&page=1', 'https://fr.flixable.com/genre/series-tv/?min-rating=0&min-year=1920&max-year=2019&order=title&page=2']



### Récupération des `id_titles`

In [18]:
%%time
id_titles = {}
for key in types:
    id_titles[key] = all_id_title(urls[key])
    
#(~10s)

Wall time: 10.8 s


In [19]:
# Number of title links collected for each media type.
for media_type, liens in id_titles.items():
    nombre = len(liens)
    print("Pour les {}, ils y en a {}\n".format(media_type, nombre))

Pour les films, ils y en a 2537

Pour les series-tv, ils y en a 1429



In [20]:
# Preview the first two title links of each media type.
for media_type, liens in id_titles.items():
    apercu = liens[:2]
    print("{} : {}\n".format(media_type, apercu))

films : ['/title/80997613/', '/title/70301585/']

series-tv : ['/title/80234414/', '/title/80029823/']



In [21]:
# Title links per media type, then one combined list (films first, series after).
urls_films, urls_series = id_titles['films'], id_titles['series-tv']
numero_titres = [*urls_films, *urls_series]

### Récupération des données sur le site `flixable.com`

In [22]:
%%time
data_flixable = {}
for key in types:
    data_flixable[key] = all_flixable(id_titles[key])
    
#(~6min10)

Wall time: 6min 12s


In [23]:
# Preview the first two scraped records of each media type.
for media_type, fiches in data_flixable.items():
    apercu = fiches[:2]
    print("{} : {}\n".format(media_type, apercu))

films : [('¡Ay, mi madre!', 'https://www.imdb.com/title/tt6213294/?ref_=plg_rt_1', 'Estefanía de los Santos|#|Secun de la Rosa|#|Terele Pávez|#|María Alfonsa Rosso|#|Mariola Fuentes|#|Alfonso Sánchez|#|Paz Vega|#|Marta Torné|#|Concha Galán', '19/7/2019'), ("'71", 'https://www.imdb.com/title/tt2614684/?ref_=plg_rt_1', "Charlie Murphy|#|Paul Anderson|#|Sam Reid|#|Richard Dormer|#|Sean Harris|#|Killian Scott|#|Martin McCann|#|Corey McKinley|#|David Wilmot|#|Jack O'Connell|#|Charlie Murphy|#|Paul W.S. Anderson|#|Sam Hazeldine", '1/12/2018')]

series-tv : [('100% Hotter', 'https://www.imdb.com/title/tt5819414/?ref_=plg_rt_1', 'Daniel Palmer|#|Melissa Sophia|#|Grace Woodward|#|Karen Williams', '1/8/2019'), ('12 Monkeys', 'https://www.imdb.com/title/tt3148266/?ref_=plg_rt_1', 'Aaron Stanford|#|Amanda Schull|#|Kirk Acevedo|#|Noah Bean|#|Emily Hampshire|#|Barbara Sukowa|#|Todd Stashwick|#|Tom Noonan|#|Zeljko Ivanek', '1/10/2019')]



In [24]:
# Flatten the per-type record lists into one list, keeping the dict's
# insertion order.
infos_flixable = list(chain.from_iterable(data_flixable.values()))

In [25]:
# Split the (nom, lien_IMDb, casting, date_ajout) tuples into one list per field.
title = [fiche[0] for fiche in infos_flixable]
urls_IMDb = [fiche[1] for fiche in infos_flixable]
casting = [fiche[2] for fiche in infos_flixable]
date_ajout = [fiche[3] for fiche in infos_flixable]

In [26]:
# One label per title, films first and series after (same order as numero_titres).
types_media = ['film' for _ in urls_films] + ['série' for _ in urls_series]
types_media[:5], types_media[-5:]

(['film', 'film', 'film', 'film', 'film'],
 ['série', 'série', 'série', 'série', 'série'])

### Récupération des données sur le site `netflix.com/fr`

La version parallèle ci-dessous ne fonctionne pas : trop de requêtes sont envoyées en même temps et Netflix les bloque.
```
%%time
data_netflix = {}
for key in types:
    data_netflix[key] = all_netflix(id_titles[key])
```

In [28]:
%%time
infos_netflix = []
URLS = urls_films + urls_series

for elem in URLS:
    infos_netflix.append(mapper_netflix(elem))

#(~1h05min)

Wall time: 1h 3min 45s


In [36]:
# Split the (genre, tag, relative) tuples into one list per field.
genres = [fiche[0] for fiche in infos_netflix]
tags = [fiche[1] for fiche in infos_netflix]
relative = [fiche[2] for fiche in infos_netflix]

### Récupération des données sur le site `imdb.com`

In [30]:
%%time
# Fetch the IMDb rating of every title in parallel; an empty link yields "".
rating = all_rating_IMDb(urls_IMDb)
# Display only the first ten values.
rating[:10]
#(~9min)

NameError: name 'ratings' is not defined

# Création du dataframe

In [37]:
# Column names and the list feeding each of them, in matching order.
colonnes = ("Titre", "Genres Netflix", "Tags Netflix", "Casting",
            "Note IMDb", "Films/Séries liés", "Date d'ajout",
            "Lien Netflix", "Type")
valeurs = (title, genres, tags, casting,
           rating, relative, date_ajout,
           numero_titres, types_media)
data = dict(zip(colonnes, valeurs))

BDD_Netflix = pd.DataFrame(data)

In [40]:
def string_to_datetime(valeur):
    """Convert a 'day/month/year' string to a `datetime`.

    Values that are already `datetime` instances are returned unchanged, so
    the conversion cell can be re-run on an already converted column without
    raising. Whitespace around a string is ignored.

    Raises
    ------
    ValueError
        If the string does not match the '%d/%m/%Y' format.
    TypeError
        If `valeur` is neither a string nor a `datetime`.
    """
    if isinstance(valeur, datetime):
        return valeur
    if isinstance(valeur, str):
        valeur = valeur.strip()
    return datetime.strptime(valeur, '%d/%m/%Y')

# Replace the 'day/month/year' strings of the column by datetime values.
dates_brutes = BDD_Netflix["Date d'ajout"]
BDD_Netflix["Date d'ajout"] = dates_brutes.apply(string_to_datetime)

In [41]:
def list_to_string(valeur):
    """Return NaN for the empty-string placeholder, any other value unchanged."""
    return np.nan if valeur == "" else valeur

# Columns whose empty-string placeholders are turned into NaN.
for colonne in ["Tags Netflix", "Note IMDb", "Casting", "Films/Séries liés"]:
    BDD_Netflix[colonne] = BDD_Netflix[colonne].apply(list_to_string)

In [45]:
# Show the first rows of the assembled frame.
BDD_Netflix.head()

Unnamed: 0,Titre,Genres Netflix,Tags Netflix,Casting,Note IMDb,Films/Séries liés,Date d'ajout,Lien Netflix,Type
0,"¡Ay, mi madre!",Comédies|#|Films espagnols,Sentimental,Estefanía de los Santos|#|Secun de la Rosa|#|T...,3.8,title/80230423|#|title/80103425|#|title/802355...,2019-07-19,/title/80997613/,film
1,'71,Action et aventure|#|Thrillers d'action|#|Film...,Violent,Charlie Murphy|#|Paul Anderson|#|Sam Reid|#|Ri...,7.2,title/80118916|#|title/80041653|#|title/801335...,2018-12-01,/title/70301585/,film
2,#Horror,Films indépendants|#|Films d'horreur,Suspense,Chloë Sevigny|#|Timothy Hutton|#|Natasha Lyonn...,3.0,title/70060008|#|title/80026887|#|title/800946...,2016-04-30,/title/80074904/,film
3,#realityhigh,Films pour ados|#|Comédies,,Nesta Cooper|#|Kate Walsh|#|John Michael Higgi...,5.2,title/80239639|#|title/81019888|#|title/801435...,2017-09-08,/title/80125979/,film
4,#Rucker50,Films documentaires|#|Documentaires historique...,,,5.4,title/80011846|#|title/80223149|#|title/801426...,2016-11-30,/title/80147908/,film


In [50]:
# Show the last rows of the assembled frame.
BDD_Netflix.tail()

Unnamed: 0,Titre,Genres Netflix,Tags Netflix,Casting,Note IMDb,Films/Séries liés,Date d'ajout,Lien Netflix,Type
3961,Zoo,Séries inspirées de livres|#|Thrillers TV|#|Sé...,Suspense|#|Palpitant,James Wolk|#|Kristen Connolly|#|Nonso Anozie|#...,6.8,title/80105699|#|title/80097140|#|title/702809...,2019-06-29,/title/80011206/,série
3962,Zou les ours,Séries russes|#|Pour enfants,,Nataliya Medvedeva|#|Polina Kutepova,7.7,title/81099996|#|title/80045811|#|title/801896...,2018-07-27,/title/80226619/,série
3963,Zumbo's Just Desserts,"Divertissement, variété et talk-shows|#|Séries...",Palpitant,Adriano Zumbo|#|Rachel Khoo,6.9,title/80201328|#|title/80201866|#|title/801860...,2018-03-09,/title/80204927/,série
3964,마녀사냥,"Séries coréennes|#|Divertissement, variété et ...",,Si-kyung Sung|#|Se-yoon Yoo|#|Dong-yup Shin|#|...,5.2,title/80176929|#|title/80176842|#|title/801769...,2018-02-18,/title/80226338/,série
3965,용팔이,Séries coréennes|#|Séries dramatiques romantiq...,Palpitant|#|Romantique,Joo Won|#|Kim Tae-hee|#|Jo Hyun-jae|#|Chae Jun...,,title/80998941|#|title/81042516|#|title/800295...,2018-10-15,/title/80998966/,série


# Exportation en `csv` et `pickle`

In [52]:
# Semicolon-separated UTF-8 CSV without the index; missing values are written as "NA".
BDD_Netflix.to_csv('../Data/BDD_Netflix.csv', sep = ';', index = False, encoding = 'utf-8', na_rep = 'NA')

In [53]:
# Pickle copy of the same frame, written next to the CSV.
BDD_Netflix.to_pickle('../Data/BDD_Netflix.pkl')

***Notre base de données s'arrête au 24 novembre 2019.***

# Sources

- https://fr.flixable.com/?min-rating=0&min-year=1920&max-year=2020&order=title#filterContainer


- https://www.netflix.com/fr/


- http://akul.me/blog/2016/beautifulsoup-cheatsheet/