# Package imports

In [1]:
from loky import ProcessPoolExecutor
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime
import re
from itertools import chain
import time

# Function definitions

- This function retrieves the `id_titles` of the Netflix catalogue entries listed on the website `flixable.com` (e.g. **/title/80997613/**)

In [2]:
def mapper_id_titles(url, timeout=30):
    """Return the title links found on one flixable.com listing page.

    Parameters
    ----------
    url : str
        Address of a listing page.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error, so that a stalled connection cannot block a worker forever.

    Returns
    -------
    list
        The `href` attribute of every poster link of the page
        (e.g. '/title/80997613/'); empty when no poster link matches.
    """
    response = requests.get(url, timeout=timeout)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, which can differ between machines.
    soup = BeautifulSoup(response.text, "html.parser")
    poster_links = soup.select(
        "html body main div#filterContainer.container.pt-2 div.row "
        "div.col-sm-6.col-lg-3.mb-3 div.poster-container a.poster-link"
    )
    return [link.get('href') for link in poster_links]

In [3]:
# mapper_id_titles('https://fr.flixable.com/genre/{}/?min-rating=0&min-year=1920&max-year=2019&order=title&page=1')

- This function retrieves all the "interesting" data from the website `flixable.com`

It returns the name of the film, the link to the film's IMDb page, the cast of the film, and the date on which the film was added to Netflix.

In [5]:
def mapper_flixable(id_title, timeout=30):
    """Scrape one title page of fr.flixable.com.

    Parameters
    ----------
    id_title : str
        Path of the title, appended to "https://fr.flixable.com"
        (e.g. '/title/70301585/').
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error, so that a stalled connection cannot block a worker forever.

    Returns
    -------
    tuple
        (nom, lien_IMDb, casting, date_ajout):
        the title, the IMDb link ("" when the page has none), the actor
        names joined with "|#|" ("" when none is found), and the text of the
        date element of the page.

    Raises
    ------
    IndexError
        If the title heading or the date element is missing from the page.
    """
    flixable = "https://fr.flixable.com" + id_title
    response = requests.get(flixable, timeout=timeout)
    # Parse the document once and reuse it for every selector. The parser is
    # named explicitly so the result does not depend on what is installed.
    soup = BeautifulSoup(response.text, "html.parser")

    heading = soup.select("html body main div.container.mt-4 div.row.mb-3 div.col-12 h1.mb-3")[0]
    # The title is taken from the second line of the heading text.
    nom = heading.text.split("\n")[1].strip()

    imdb_links = soup.select("div[class='col-md-8 mb-3'] span[class=imdbRatingPlugin] a")
    lien_IMDb = imdb_links[0]['href'] if imdb_links else ""

    anchors = soup.select("html body main div.container.mt-4 div.row.mb-3 div.col-md-8.mb-3 div.mb-2 span a")
    # Keep only the anchors whose markup contains "/actor/" ...
    actor_markup = ["".join(re.findall(r'.*/actor/.*', str(anchor))) for anchor in anchors]
    actor_markup = [markup for markup in actor_markup if markup != ""]
    # ... and extract the text located just before the closing </a>.
    casting = "|#|".join("".join(re.findall(r'.*>(.*)</a>', markup)) for markup in actor_markup)

    date_ajout = soup.select("div.mb-4 > span:nth-child(2)")[0].text

    return (nom, lien_IMDb, casting, date_ajout)

In [6]:
# mapper_flixable("/title/70301585/")

- This function retrieves all the "interesting" data from the website `netflix.com/fr`

Here we get the list of genres associated with each film.

In [26]:
def mapper_netflix(id_title, timeout=30):
    """Return the genres shown on the netflix.com/fr page of one title.

    Parameters
    ----------
    id_title : str
        Path of the title, appended to "https://www.netflix.com/fr"
        (e.g. '/title/80997613/').
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error, so that a stalled connection cannot block the scraping loop.

    Returns
    -------
    str
        The distinct genre labels joined with "|#|", in the order in which
        they first appear; "" when no label is found on the page.
    """
    netflix = "https://www.netflix.com/fr" + id_title
    response = requests.get(netflix, timeout=timeout)
    # Parse once; the parser is named explicitly so the result does not
    # depend on which parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")

    def _labels(selector):
        # Cleaned text of every element matching `selector`.
        return [tag.text.strip().strip(',').replace('\xa0', ' ')
                for tag in soup.select(selector)]

    genre1 = _labels("a.title-info-metadata-item")
    genre2 = _labels(".cell-genres > div:nth-child(2) > span")

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the order of the labels vary from one run to the next.
    genre = "|#|".join(dict.fromkeys(genre1 + genre2))

    # NOTE: only the genres are returned; tags and related titles are not
    # extracted.
    return genre

In [28]:
# mapper_netflix("/title/80997613/")

'Comédies|#|Films espagnols'

- This function retrieves all the "interesting" data from the website `imdb.com`

With this function we retrieve the IMDb rating of the film.

In [11]:
def mapper_rating_IMDb(id_IMDb, timeout=30):
    """Return the IMDb rating found on an IMDb title page.

    Parameters
    ----------
    id_IMDb : str
        Full address of the IMDb page (it is passed to `requests.get` as
        is), or "" when the title has no IMDb link.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error, so that a stalled connection cannot block a worker forever.

    Returns
    -------
    float or str
        The rating as a float, or "" when there is no link or when no rating
        element is found on the page.
    """
    # No link: nothing to download.
    if id_IMDb == "":
        return ""

    response = requests.get(id_IMDb, timeout=timeout)
    # The parser is named explicitly so the result does not depend on which
    # parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")
    rating_tags = soup.select(".ratingValue > strong:nth-child(1) > span:nth-child(1)")
    if not rating_tags:
        return ""
    return float(rating_tags[0].text)

In [12]:
# mapper_rating_IMDb("https://www.imdb.com/title/tt3656200/?ref_=plg_rt_1")

- These functions run the functions above in parallel

In [4]:
def _parallel_map(mapper, key, max_workers=None):
    """Apply `mapper` to every element of `key` in a pool of worker processes.

    `max_workers` is forwarded to `ProcessPoolExecutor`; None keeps the
    executor's default. The results are returned as a list, in input order.
    """
    with ProcessPoolExecutor(max_workers=max_workers) as epool:
        # Collect the results before leaving the `with` block, so that they
        # are all gathered before the pool is shut down.
        return list(epool.map(mapper, key))


def all_id_title(key, max_workers=None):
    """Run `mapper_id_titles` on every listing URL and flatten the results.

    `key` is an iterable of listing-page URLs. Returns one flat list holding
    the title links of all the pages.
    """
    return list(chain.from_iterable(_parallel_map(mapper_id_titles, key, max_workers)))


def all_flixable(key, max_workers=None):
    """Run `mapper_flixable` on every title path of `key`; returns a list of tuples."""
    return _parallel_map(mapper_flixable, key, max_workers)


def all_netflix(key, max_workers=None):
    """Run `mapper_netflix` on every title path of `key`; returns a list of genre strings.

    A small `max_workers` limits the number of simultaneous requests.
    """
    return _parallel_map(mapper_netflix, key, max_workers)


def all_rating_IMDb(key, max_workers=None):
    """Run `mapper_rating_IMDb` on every IMDb link of `key`; returns a list of ratings."""
    return _parallel_map(mapper_rating_IMDb, key, max_workers)

# Data extraction

**Collecting the number of pages on the website `flixable.com`**

In [14]:
# Listing URL used to read the pagination bar. The year range is the same as
# the one used when the listing pages themselves are requested, so that the
# page count refers to the same result set.
url = "https://fr.flixable.com/genre/{}/?min-rating=0&min-year=1920&max-year=2019&order=title#filterContainer"
# This is the French Netflix catalogue.
# For the US catalogue: https://www.flixable.com/
# For the German catalogue: https://de.flixable.com/

types = ["films", "series-tv"]
number_of_pages = {}

for elem in types:
    # The timeout prevents a stalled connection from blocking the notebook.
    response = requests.get(url.format(elem), timeout=30)
    # The parser is named explicitly so the result does not depend on which
    # parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): the 7th pagination item is presumably the link to the
    # last page; confirm against the site's current layout.
    BS = soup.select("li.page-item:nth-child(7) > a:nth-child(1)")[0]
    number_of_pages[elem] = int(BS.get_text())

number_of_pages

{'films': 64, 'series-tv': 37}

**Collecting all URLs from the `flixable.com` website**

In [15]:
# For each content type, build the address of every listing page
# (pages are numbered from 1 to the page count collected above).
url = "https://fr.flixable.com/genre/{}/?min-rating=0&min-year=1920&max-year=2019&order=title&page="
urls = {
    elem: [url.format(elem) + str(page)
           for page in range(1, number_of_pages[elem] + 1)]
    for elem in types
}

In [16]:
# Show the first two listing URLs of each content type.
for key in urls:
    print("{} : {}\n".format(key, urls[key][:2]))

films : ['https://fr.flixable.com/genre/films/?min-rating=0&min-year=1920&max-year=2019&order=title&page=1', 'https://fr.flixable.com/genre/films/?min-rating=0&min-year=1920&max-year=2019&order=title&page=2']

series-tv : ['https://fr.flixable.com/genre/series-tv/?min-rating=0&min-year=1920&max-year=2019&order=title&page=1', 'https://fr.flixable.com/genre/series-tv/?min-rating=0&min-year=1920&max-year=2019&order=title&page=2']



**Collecting all `id_titles`**

In [17]:
%%time
id_titles = {}
for key in types:
    id_titles[key] = all_id_title(urls[key])

Wall time: 11.1 s


In [18]:
# Report how many titles were found for each content type.
for key in id_titles:
    print("For the {}, the amount is {}\n".format(key, len(id_titles[key])))

Pour les films, ils y en a 2533

Pour les series-tv, ils y en a 1465



In [19]:
# Show the first two title links of each content type.
for key in id_titles:
    print("{} : {}\n".format(key, id_titles[key][:2]))

films : ['/title/80997613/', '/title/80074904/']

series-tv : ['/title/80234414/', '/title/80029823/']



In [20]:
# Title links per content type, then all of them with films first.
urls_films, urls_series = id_titles['films'], id_titles['series-tv']
numero_titres = urls_films + urls_series

**Collecting data from `flixable.com`**

In [21]:
%%time
data_flixable = {}
for key in types:
    data_flixable[key] = all_flixable(id_titles[key])

Wall time: 5min 34s


Here we have a list of tuples; each tuple is composed of the name of the film, the link to its IMDb page, the cast, and the date on which the film was added.

In [22]:
# Show the first two scraped tuples of each content type.
for key in data_flixable:
    print("{} : {}\n".format(key, data_flixable[key][:2]))

films : [('¡Ay, mi madre!', 'https://www.imdb.com/title/tt6213294/?ref_=plg_rt_1', 'Estefanía de los Santos|#|Secun de la Rosa|#|Terele Pávez|#|María Alfonsa Rosso|#|Mariola Fuentes|#|Alfonso Sánchez|#|Paz Vega|#|Marta Torné|#|Concha Galán', '19/7/2019'), ('#Horror', 'https://www.imdb.com/title/tt3526286/?ref_=plg_rt_1', 'Chloë Sevigny|#|Timothy Hutton|#|Natasha Lyonne|#|Balthazar Getty|#|Taryn Manning|#|Stella Schnabel|#|Sadie Seelert|#|Haley Murphy|#|Bridget McGarry|#|Blue Lindberg|#|Mina Sundwall|#|Emma Adler|#|Annabelle Dexter-Jones|#|Lydia Hearst', '30/4/2016')]

series-tv : [('100% Hotter', 'https://www.imdb.com/title/tt5819414/?ref_=plg_rt_1', 'Daniel Palmer|#|Melissa Sophia|#|Grace Woodward|#|Karen Williams', '1/8/2019'), ('12 Monkeys', 'https://www.imdb.com/title/tt3148266/?ref_=plg_rt_1', 'Aaron Stanford|#|Amanda Schull|#|Kirk Acevedo|#|Noah Bean|#|Emily Hampshire|#|Barbara Sukowa|#|Todd Stashwick|#|Tom Noonan|#|Zeljko Ivanek', '1/10/2019')]



In [23]:
# Concatenate the per-type lists into one list of tuples,
# in the order of the dictionary (films first, then series).
infos_flixable = list(chain.from_iterable(data_flixable.values()))

In [24]:
# Split the (name, IMDb link, cast, date) tuples into one list per field.
title = [infos[0] for infos in infos_flixable]
urls_IMDb = [infos[1] for infos in infos_flixable]
casting = [infos[2] for infos in infos_flixable]
date_ajout = [infos[3] for infos in infos_flixable]

In [25]:
# One label per title, in the same order as the title links: films, then series.
nb_films, nb_series = len(urls_films), len(urls_series)
types_media = nb_films * ['film'] + nb_series * ['série']
(types_media[:5], types_media[-5:])

(['film', 'film', 'film', 'film', 'film'],
 ['série', 'série', 'série', 'série', 'série'])

**Collecting data from `netflix.com/fr`**

The parallel version below does not work: it sends too many requests at the same time, and Netflix presumably blocks them. The pages are therefore requested one at a time in the next cell.

```
%%time
data_netflix = {}
for key in types:
    data_netflix[key] = all_netflix(id_titles[key])
```

In [29]:
%%time
infos_netflix = []
URLS = urls_films + urls_series

for elem in URLS:
    infos_netflix.append(mapper_netflix(elem))

Wall time: 1h 21min 30s


In [31]:
# Preview the genre strings collected for the first five titles.
infos_netflix[:5]

['Comédies|#|Films espagnols',
 "Films d'horreur|#|Films indépendants",
 'Comédies|#|Films pour ados',
 'Docus société et culture|#|Films documentaires|#|Documentaires sportifs|#|Documentaires historiques',
 'Comédies|#|Films roumains|#|Drames|#|Films pour ados']

In [32]:
# Each entry of infos_netflix is already the genre string of one title
# (tags and related titles are not collected), so a plain copy is enough.
genres = list(infos_netflix)

**Collecting data from `imdb.com`**

In [33]:
%%time
# Fetch the IMDb rating of every title through the process pool;
# a title without an IMDb link yields "".
rating = all_rating_IMDb(urls_IMDb)
rating[:10]

Wall time: 9min 2s


[3.9, 3.0, 5.2, 5.3, 6.1, 6.5, 5.1, 5.7, 7.5, 7.3]

# Dataframe creation

In [59]:
# One column per collected field; every list holds one entry per title, in
# the same order (films first, then series). Tags and related titles are not
# collected, so the table has no column for them.
data = {"Titre": title, "Genres Netflix" : genres,
        "Casting" : casting, "Note IMDb" : rating,
        "Date d'ajout" : date_ajout, "Lien Netflix" : numero_titres,
        "Type" : types_media}

BDD_Netflix = pd.DataFrame(data)
# Drop the titles for which no genre was found. `.copy()` makes the result an
# independent frame, so that assigning to its columns afterwards does not
# raise SettingWithCopyWarning.
BDD_Netflix = BDD_Netflix.loc[BDD_Netflix['Genres Netflix']!='',:].copy()

In [60]:
def string_to_datetime(valeur):
    """Parse a 'day/month/year' string (e.g. '19/7/2019') into a datetime."""
    format_date = '%d/%m/%Y'
    date = datetime.strptime(valeur, format_date)
    return date

# Turn the date strings of the table into datetime values.
colonne_date = "Date d'ajout"
BDD_Netflix[colonne_date] = BDD_Netflix[colonne_date].apply(string_to_datetime)

In [61]:
def list_to_string(valeur):
    """Return NaN for an empty string and any other value unchanged."""
    return np.nan if valeur == "" else valeur

# Replace empty strings by NaN in the columns that may lack a value
# (the table has no tag or related-title column).
for colonne in ("Note IMDb", "Casting"):
    BDD_Netflix[colonne] = BDD_Netflix[colonne].apply(list_to_string)

In [62]:
# Preview the first rows of the table.
BDD_Netflix.head()

Unnamed: 0,Titre,Genres Netflix,Casting,Note IMDb,Date d'ajout,Lien Netflix,Type
0,"¡Ay, mi madre!",Comédies|#|Films espagnols,Estefanía de los Santos|#|Secun de la Rosa|#|T...,3.9,2019-07-19,/title/80997613/,film
1,#Horror,Films d'horreur|#|Films indépendants,Chloë Sevigny|#|Timothy Hutton|#|Natasha Lyonn...,3.0,2016-04-30,/title/80074904/,film
2,#realityhigh,Comédies|#|Films pour ados,Nesta Cooper|#|Kate Walsh|#|John Michael Higgi...,5.2,2017-09-08,/title/80125979/,film
3,#Rucker50,Docus société et culture|#|Films documentaires...,,5.3,2016-11-30,/title/80147908/,film
4,#Selfie,Comédies|#|Films roumains|#|Drames|#|Films pou...,Flavia Hojda|#|Crina Semciuc|#|Olimpia Melinte...,6.1,2019-06-01,/title/81092768/,film


In [63]:
# Preview the last rows of the table.
BDD_Netflix.tail()

Unnamed: 0,Titre,Genres Netflix,Casting,Note IMDb,Date d'ajout,Lien Netflix,Type
3993,Zoo,Séries US|#|Séries inspirées de livres|#|Série...,James Wolk|#|Kristen Connolly|#|Nonso Anozie|#...,6.8,2019-06-29,/title/80011206/,série
3994,Zou les ours,Pour enfants|#|Séries russes,Nataliya Medvedeva|#|Polina Kutepova,7.8,2018-07-27,/title/80226619/,série
3995,Zumbo's Just Desserts,Émissions de cuisine et voyages|#|Séries austr...,Adriano Zumbo|#|Rachel Khoo,6.9,2018-03-09,/title/80204927/,série
3996,마녀사냥,Talk-shows|#|Séries coréennes|#|Séries|#|Diver...,Si-kyung Sung|#|Se-yoon Yoo|#|Dong-yup Shin|#|...,5.2,2018-02-18,/title/80226338/,série
3997,용팔이,Séries romantiques|#|Séries dramatiques|#|Séri...,Joo Won|#|Kim Tae-hee|#|Jo Hyun-jae|#|Chae Jun...,7.5,2018-10-15,/title/80998966/,série


In [64]:
# Number of rows and columns left after dropping the titles without a genre.
BDD_Netflix.shape

(3996, 7)

# Export to `csv` and `pickle`

In [65]:
# Write the table as a semicolon-separated UTF-8 file, without the index
# column; missing values are written as 'NA'.
BDD_Netflix.to_csv(
    '../Data/BDD_Netflix.csv',
    sep=';',
    index=False,
    encoding='utf-8',
    na_rep='NA',
)

In [66]:
# Also save the table as a pickle file next to the CSV export.
BDD_Netflix.to_pickle('../Data/BDD_Netflix.pkl')

# Sources

- https://fr.flixable.com


- https://www.netflix.com/fr/


- https://pandas.pydata.org/pandas-docs/stable/reference/frame.html


- https://www.crummy.com/software/BeautifulSoup/bs4/doc/


- https://docs.python.org/3/library/re.html