# Packages importation

In [1]:
from collections import Counter
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SQLContext

# Functions definition

In [2]:
def StringToList(x):
    """Split a "|#|"-delimited string into the list of its items.

    Float inputs (e.g. the NaN of a missing cell) are returned unchanged.
    """
    return x if isinstance(x, float) else x.split("|#|")
    
def DeleteCaractere(x):
    """Replace every double quote of a string by a single quote.

    Float inputs (e.g. the NaN of a missing cell) are returned unchanged.
    """
    if not isinstance(x, float):
        return x.replace('"', "'")
    return x

# Increment by one the counter stored under `key`, creating it at 1 when the key is absent.
def appendOrAdd_dico(dico, key):
    """Add 1 to `dico[key]` (starting from 0 for a new key) and return `dico`.

    The dictionary is modified in place; the returned object is the one passed in.
    """
    dico[key] = dico.get(key, 0) + 1
    return dico

# Link together all the titles that share the value `mot` in the column `col_name`.
def create_links(mot, col_name, dico, BDD = "BDD_NetflixSQL"):
    """Add one to the link counter of every pair of titles sharing the value `mot`.

    The function first selects the ids (`Lien Netflix`) of the rows of the SQL
    view `BDD` whose column `col_name` contains `mot` as a whole item, then
    increments, in `dico`, the counter of every pair of selected ids.

    Parameters
    ----------
    mot : str
        Value to look for (a genre, an actor/actress name, ...).
    col_name : str
        Name of the column of `BDD` holding the "|#|"-separated items.
    dico : dict
        Link counters accumulated so far. It is modified in place.
    BDD : str
        Name of the SQL view to query.

    Returns
    -------
    dict
        `dico`, whose keys are two ids joined by a tab character and whose
        values are the number of times the pair was linked
        (ex : {'/title/70060018/<TAB>/title/70270776/': 2}).

    Notes
    -----
    The query is run through the notebook-level `sqlContext`, which must exist
    before the first call.
    """
    # The four patterns match `mot` alone, last, in the middle and first. They
    # anchor on "|", which is both the first and the last character of the
    # "|#|" separator, so partial names are not matched.
    # NOTE(review): `mot`, `col_name` and `BDD` are interpolated verbatim. A
    # double quote in `mot` would end the SQL literal, and "%" / "_" are LIKE
    # wildcards. Confirm the searched values never contain these characters.
    request = f"""SELECT `Lien Netflix`
                  FROM {BDD}
                  WHERE (`{col_name}` LIKE "{mot}")
                     OR (`{col_name}` LIKE "%|{mot}")
                     OR (`{col_name}` LIKE "%|{mot}|%")
                     OR (`{col_name}` LIKE "{mot}|%")"""

    # Convert the ids to a plain list once: indexing the pandas Series inside
    # the quadratic pair loop costs one label lookup per pair.
    ids = sqlContext.sql(request).toPandas()['Lien Netflix'].tolist()

    # combinations() yields (ids[i], ids[j]) for i < j, in the order of the
    # query result, i.e. the pairs a nested index loop would produce.
    for name1, name2 in combinations(ids, 2):
        dico = appendOrAdd_dico(dico, f"{name1}\t{name2}")

    return dico

# Data importation

- When working with Apache Spark we invoke methods on an object which is an instance of the `pyspark.SparkContext` context.

- Typically, an instance of this object will be created automatically for you and assigned to the variable `sc`.

In [3]:
# Only one SparkContext may be active at a time: getOrCreate() returns the
# running context when there is one (cell re-run, or `sc` already created by the
# environment) and otherwise starts a new one with this configuration.
spark_conf = pyspark.SparkConf().setMaster("local[*]").setAppName("FirstExample")
sc = pyspark.SparkContext.getOrCreate(spark_conf)
sqlContext = SQLContext(sc)

In [4]:
# Load the semicolon-separated catalogue into a Spark DataFrame; the first row
# is used as header and the column types are inferred.
BDD_Netflix_Spark = sqlContext.read.csv("../Data/BDD_Netflix.csv", sep = ";", header = True, inferSchema=True)
# Expose the DataFrame to SQL queries under the name "BDD_NetflixSQL"
# (the default view queried by create_links).
BDD_Netflix_Spark.createOrReplaceTempView("BDD_NetflixSQL")

In [5]:
# Movies only, exposed to SQL queries as the view "BDD_Film".
BDDFilm = BDD_Netflix_Spark.filter(BDD_Netflix_Spark.Type=="film")
BDDFilm.createOrReplaceTempView("BDD_Film")

# Series only, exposed to SQL queries as the view "BDD_Serie".
BDDSerie = BDD_Netflix_Spark.filter(BDD_Netflix_Spark.Type=="série")
BDDSerie.createOrReplaceTempView("BDD_Serie")

In [6]:
# pandas copy of the same CSV file, indexed by the title id (`Lien Netflix`).
BDD_Netflix = pd.read_csv('../Data/BDD_Netflix.csv', sep = ';')
BDD_Netflix = BDD_Netflix.set_index('Lien Netflix')
# Replace the double quotes of the casting names by single quotes.
# NOTE(review): the Spark views are built from the raw CSV and do not receive
# this replacement, so a name that contained a double quote will no longer be
# spelled the same on both sides; confirm no such name exists.
BDD_Netflix['Casting'] = BDD_Netflix['Casting'].apply(DeleteCaractere)

# Movie and series subsets of the pandas table.
BDD_Film = BDD_Netflix.loc[BDD_Netflix['Type']=='film',:]
BDD_Serie = BDD_Netflix.loc[BDD_Netflix['Type']=='série',:]

BDD_Netflix.head()

Unnamed: 0_level_0,Titre,Genres Netflix,Casting,Note IMDb,Date d'ajout,Type
Lien Netflix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
/title/80997613/,"¡Ay, mi madre!",Comédies|#|Films espagnols,Estefanía de los Santos|#|Secun de la Rosa|#|T...,3.9,2019-07-19,film
/title/80074904/,#Horror,Films d'horreur|#|Films indépendants,Chloë Sevigny|#|Timothy Hutton|#|Natasha Lyonn...,3.0,2016-04-30,film
/title/80125979/,#realityhigh,Comédies|#|Films pour ados,Nesta Cooper|#|Kate Walsh|#|John Michael Higgi...,5.2,2017-09-08,film
/title/80147908/,#Rucker50,Docus société et culture|#|Films documentaires...,,5.3,2016-11-30,film
/title/81092768/,#Selfie,Comédies|#|Films roumains|#|Drames|#|Films pou...,Flavia Hojda|#|Crina Semciuc|#|Olimpia Melinte...,6.1,2019-06-01,film


```python
BDD_Netflix['Genres Netflix'] = BDD_Netflix['Genres Netflix'].apply(StringToList)
BDD_Netflix['Tags Netflix'] = BDD_Netflix['Tags Netflix'].apply(StringToList)
BDD_Netflix['Casting'] = BDD_Netflix['Casting'].apply(StringToList)
BDD_Netflix['Films/Séries liés'] = BDD_Netflix['Films/Séries liés'].apply(StringToList)

BDD_Netflix.head()
```

**Number of appearances of the most common genres/tags/actors/contents**

In [7]:
# Flatten every "|#|"-separated cell of a column into one long list holding one
# entry per occurrence of a value.
# Missing cells are dropped first: str.join raises a TypeError on NaN. The genre
# columns are handled like the casting columns below.
Liste_Genres = "|#|".join(BDD_Netflix['Genres Netflix'].dropna()).split("|#|")
Liste_Genres_film = "|#|".join(BDD_Film['Genres Netflix'].dropna()).split("|#|")
Liste_Genres_serie = "|#|".join(BDD_Serie['Genres Netflix'].dropna()).split("|#|")

#Liste_Tags = "|#|".join(BDD_Netflix['Tags Netflix'].dropna()).split("|#|")

Liste_Casting = "|#|".join(BDD_Netflix['Casting'].dropna()).split("|#|")
Liste_Casting_film = "|#|".join(BDD_Film['Casting'].dropna()).split("|#|")
Liste_Casting_serie = "|#|".join(BDD_Serie['Casting'].dropna()).split("|#|")

#Liste_Liaisons = "|#|".join(BDD_Netflix['Films/Séries liés'].dropna()).split("|#|")

In [8]:
# Number of occurrences of each genre (full catalogue, movies, series).
dico_genres = Counter(Liste_Genres)
dico_genres_film = Counter(Liste_Genres_film)
dico_genres_serie = Counter(Liste_Genres_serie)

#dico_tags = Counter(Liste_Tags)

# Number of occurrences of each actor/actress (full catalogue, movies, series).
dico_casting = Counter(Liste_Casting)
dico_casting_film = Counter(Liste_Casting_film)
dico_casting_serie = Counter(Liste_Casting_serie)

#dico_liaisons = Counter(Liste_Liaisons)

We identify the three most common genres: removing them (optional, see the commented lines of the next code cell) avoids having a hyperconnected graph.

In [9]:
# Names (without their counts) of the three most frequent genres of the full
# catalogue, of the movies and of the series.
genres_3_most_common = [genre for genre, count in dico_genres.most_common(3)]

genres_film_3_most_common = [genre for genre, count in dico_genres_film.most_common(3)]

genres_serie_3_most_common = [genre for genre, count in dico_genres_serie.most_common(3)]

```python
>>>dico_genres.most_common(10)
[('Comédies', 958),
 ('Drames', 593),
 ('Films documentaires', 438),
 ('Séries US', 389),
 ('Action et aventure', 369),
 ('Séries comiques', 352),
 ('Pour enfants', 329),
 ('Films inspirés de livres', 323),
 ('Films romantiques', 309),
 ('Séries dramatiques', 266)]
```

**Unique values ("factors") of genres/tags/actors/contents**

In [10]:
# Reduce each list of occurrences to its distinct values (np.unique returns them sorted).
# The names are rebound: from here on, the Liste_* variables hold unique values.
Liste_Genres = np.unique(Liste_Genres)
# To exclude the 3 most common genres, uncomment the filter line that follows
# the corresponding np.unique call.
#Liste_Genres = [i for i in Liste_Genres if i not in genres_3_most_common]
Liste_Genres_film = np.unique(Liste_Genres_film)
#Liste_Genres_film = [i for i in Liste_Genres_film if i not in genres_film_3_most_common]
Liste_Genres_serie = np.unique(Liste_Genres_serie)
#Liste_Genres_serie = [i for i in Liste_Genres_serie if i not in genres_serie_3_most_common]

#Liste_Tags = np.unique(Liste_Tags)

Liste_Casting = np.unique(Liste_Casting)
Liste_Casting_film = np.unique(Liste_Casting_film)
Liste_Casting_serie = np.unique(Liste_Casting_serie)

#Liste_Liaisons = np.unique(Liste_Liaisons)

## Links creation between contents according to genres

The following cells create three dictionaries (one for the full database, one for the movies, one for the series). Each key is a link between two contents (their two ids joined by a tab) and each value is the number of genres they have in common.

In [11]:
%%time
# Genre links over the full catalogue (default view of create_links).
col_name = "Genres Netflix"
dico_links_genres = {}
# One SQL query per genre; create_links updates and returns the same dictionary.
for genre in Liste_Genres:
    dico_links_genres = create_links(genre, col_name, dico_links_genres)
    #print(genre)

Wall time: 2min 16s


In [12]:
sorted(dico_links_genres.items(), key=lambda x: -x[1])[:10]

[('/title/80115466/\t/title/81033473/', 10),
 ('/title/80115466/\t/title/60024788/', 10),
 ('/title/81033473/\t/title/60024788/', 10),
 ('/title/81045826/\t/title/81045823/', 10),
 ('/title/70242943/\t/title/70117694/', 10),
 ('/title/80193514/\t/title/80115466/', 9),
 ('/title/80193514/\t/title/81033473/', 9),
 ('/title/80193514/\t/title/80180376/', 9),
 ('/title/80193514/\t/title/80198623/', 9),
 ('/title/80193514/\t/title/60024788/', 9)]

In [13]:
len(dico_links_genres)

1378633

In [14]:
%%time
# Genre links restricted to the movies (SQL view "BDD_Film").
col_name = "Genres Netflix"
BDD = "BDD_Film"
dico_links_genres_film = {}
# One SQL query per genre; create_links updates and returns the same dictionary.
for genre in Liste_Genres_film:
    dico_links_genres_film = create_links(genre, col_name, dico_links_genres_film, BDD)
    #print(genre)

Wall time: 1min 36s


In [15]:
sorted(dico_links_genres_film.items(), key=lambda x: -x[1])[:10]

[('/title/80115466/\t/title/81033473/', 10),
 ('/title/80115466/\t/title/60024788/', 10),
 ('/title/81033473/\t/title/60024788/', 10),
 ('/title/81045826/\t/title/81045823/', 10),
 ('/title/70242943/\t/title/70117694/', 10),
 ('/title/80193514/\t/title/80115466/', 9),
 ('/title/80193514/\t/title/81033473/', 9),
 ('/title/80193514/\t/title/80180376/', 9),
 ('/title/80193514/\t/title/80198623/', 9),
 ('/title/80193514/\t/title/60024788/', 9)]

In [16]:
len(dico_links_genres_film)

950933

In [17]:
%%time
# Genre links restricted to the series (SQL view "BDD_Serie").
col_name = "Genres Netflix"
BDD = "BDD_Serie"
dico_links_genres_serie = {}
# One SQL query per genre; create_links updates and returns the same dictionary.
for genre in Liste_Genres_serie:
    dico_links_genres_serie = create_links(genre, col_name, dico_links_genres_serie, BDD)
    #print(genre)

Wall time: 51.2 s


In [18]:
sorted(dico_links_genres_serie.items(), key=lambda x: -x[1])[:10]

[('/title/80236253/\t/title/80041089/', 9),
 ('/title/80228274/\t/title/80236253/', 8),
 ('/title/80228274/\t/title/80041089/', 8),
 ('/title/80986993/\t/title/80116920/', 8),
 ('/title/80992853/\t/title/80005874/', 7),
 ('/title/80182123/\t/title/81010662/', 7),
 ('/title/80007226/\t/title/80023687/', 7),
 ('/title/80201328/\t/title/81094391/', 7),
 ('/title/80991872/\t/title/81045551/', 7),
 ('/title/80104022/\t/title/80133187/', 7)]

In [19]:
len(dico_links_genres_serie)

358607

## Links creation between contents according to tags

```python
%%time
col_name = "Tags Netflix"
dico_links_tags = {}
for tag in Liste_Tags:
    dico_links_tags = create_links(tag, col_name, dico_links_tags)
    #print(tag)
```

```python
sorted(dico_links_tags.items(), key=lambda x: -x[1])[:10]
```

## Links creation between contents according to the casting

We must choose a minimum number of appearances for an actor or actress. For example, if an actor/actress appears in only one Netflix content, we remove him/her from `Liste_Casting`, since running the function for that name cannot create any link. Here, we keep only the actors/actresses who appear more than 5 times in the Netflix database (an arbitrary choice).

In [20]:
# Keep the actors/actresses appearing more than 5 times (the "_sup1" suffix of
# the list names is historical: the threshold used is 5).
#
# Each dico_casting* name is rebound from a Counter to the list of
# (name, count) pairs sorted by decreasing count. dict(...) accepts both forms,
# so the cell can be re-run. The comprehensions select the same leading entries
# as a scan of the sorted list, without failing when every entry (or none)
# passes the threshold.
dico_casting = sorted(dict(dico_casting).items(), key=lambda x: -x[1])
Liste_Casting_sup1 = [name for name, count in dico_casting if count > 5]

dico_casting_film = sorted(dict(dico_casting_film).items(), key=lambda x: -x[1])
Liste_Casting_sup1_film = [name for name, count in dico_casting_film if count > 5]

dico_casting_serie = sorted(dict(dico_casting_serie).items(), key=lambda x: -x[1])
Liste_Casting_sup1_serie = [name for name, count in dico_casting_serie if count > 5]

In [21]:
len(Liste_Casting_sup1),len(Liste_Casting_sup1_film),len(Liste_Casting_sup1_serie)

(464, 235, 85)

The following cells create three dictionaries (one for the full database, one for the movies, one for the series). Each key is a link between two contents (their two ids joined by a tab) and each value is the number of actors/actresses they have in common.

In [22]:
%%time
# Casting links over the full catalogue (default view of create_links).
col_name = "Casting"
dico_links_casting = {}
# One SQL query per retained actor/actress; create_links updates and returns the same dictionary.
for casting in Liste_Casting_sup1:
    dico_links_casting = create_links(casting, col_name, dico_links_casting)
    #print(casting)

Wall time: 2min 4s


In [23]:
sorted(dico_links_casting.items(), key=lambda x: -x[1])[:10]

[('/title/80039813/\t/title/80117800/', 12),
 ('/title/80180376/\t/title/80198623/', 10),
 ('/title/80999067/\t/title/80235524/', 10),
 ('/title/80999067/\t/title/81021243/', 10),
 ('/title/80235524/\t/title/81021243/', 10),
 ('/title/60021299/\t/title/80039813/', 10),
 ('/title/80180376/\t/title/80180373/', 9),
 ('/title/80198623/\t/title/80180373/', 9),
 ('/title/81006335/\t/title/80999067/', 9),
 ('/title/81006335/\t/title/80999063/', 9)]

In [24]:
len(dico_links_casting)

12234

In [25]:
%%time
# Casting links restricted to the movies (SQL view "BDD_Film").
col_name = "Casting"
BDD = "BDD_Film"
dico_links_casting_film = {}
# One SQL query per retained actor/actress; create_links updates and returns the same dictionary.
for casting in Liste_Casting_sup1_film:
    dico_links_casting_film = create_links(casting, col_name, dico_links_casting_film,BDD)
    #print(casting)

Wall time: 1min 4s


In [26]:
sorted(dico_links_casting_film.items(), key=lambda x: -x[1])[:10]

[('/title/80999067/\t/title/80999069/', 9),
 ('/title/80999067/\t/title/80235524/', 9),
 ('/title/80999067/\t/title/81021243/', 9),
 ('/title/80999069/\t/title/80235524/', 9),
 ('/title/80999069/\t/title/81021243/', 9),
 ('/title/80235524/\t/title/81021243/', 9),
 ('/title/81006335/\t/title/80999067/', 8),
 ('/title/81006335/\t/title/80999069/', 8),
 ('/title/81006335/\t/title/80999063/', 8),
 ('/title/81006335/\t/title/80235524/', 8)]

In [27]:
len(dico_links_casting_film)

5415

In [28]:
%%time
# Casting links restricted to the series (SQL view "BDD_Serie").
col_name = "Casting"
BDD = "BDD_Serie"
dico_links_casting_serie = {}
# One SQL query per retained actor/actress; create_links updates and returns the same dictionary.
for casting in Liste_Casting_sup1_serie:
    dico_links_casting_serie = create_links(casting, col_name, dico_links_casting_serie,BDD)
    #print(casting)

Wall time: 23.2 s


In [29]:
sorted(dico_links_casting_serie.items(), key=lambda x: -x[1])[:10]

[('/title/80117781/\t/title/81054849/', 7),
 ('/title/70304252/\t/title/80040119/', 6),
 ('/title/80040119/\t/title/81054849/', 5),
 ('/title/81002336/\t/title/80124711/', 5),
 ('/title/80183051/\t/title/80117291/', 4),
 ('/title/80180849/\t/title/80986797/', 4),
 ('/title/80191075/\t/title/70234440/', 4),
 ('/title/80182123/\t/title/81010662/', 4),
 ('/title/70201870/\t/title/80040119/', 4),
 ('/title/80183051/\t/title/80043576/', 3)]

In [30]:
len(dico_links_casting_serie)

2107

## Links creation between contents according to the column `Films/Séries liés`

```python
%%time
col_name = "Films/Séries liés"
dico_links_liaison = {}
for liaison in Liste_Liaisons:
    dico_links_liaison = create_links(liaison, col_name, dico_links_liaison)
    #print(liaison)
```

```python
sorted(dico_links_liaison.items(), key=lambda x: -x[1])[:10]
```

## Gatherings of different dictionaries

In [31]:
%%time
# Only the links built from the genres and from the casting are taken into account.
dicoAll = [dico_links_genres, dico_links_casting]
dicoAll_film = [dico_links_genres_film, dico_links_casting_film]
dicoAll_serie = [dico_links_genres_serie, dico_links_casting_serie]

# Add up the values stored under identical keys (plain sum: no weighting
# between genres and casting is applied here).
counter = Counter()
counter_film = Counter()
counter_serie = Counter()
for total, parts in (
    (counter, dicoAll),
    (counter_film, dicoAll_film),
    (counter_serie, dicoAll_serie),
):
    for d in parts:
        total.update(d)

result = dict(counter)
result_film = dict(counter_film)
result_serie = dict(counter_serie)

Wall time: 1.32 s


In [32]:
sorted(result.items(), key=lambda x: -x[1])[:10]

[('/title/80180376/\t/title/80198623/', 19),
 ('/title/80180376/\t/title/80180373/', 18),
 ('/title/80198623/\t/title/80180373/', 18),
 ('/title/80039813/\t/title/80117800/', 16),
 ('/title/70258995/\t/title/70276596/', 14),
 ('/title/80115466/\t/title/80180376/', 13),
 ('/title/80115466/\t/title/80198623/', 13),
 ('/title/80115466/\t/title/80180373/', 13),
 ('/title/81033473/\t/title/60024788/', 13),
 ('/title/80004762/\t/title/80198142/', 13)]

# Nodes weight

Here, the weight of a node is its rating on IMDb.

In [33]:
# Map each title id (the index) to its IMDb rating.
# Series.to_dict() builds the mapping directly from the column. A per-label
# lookup (`column[link]`) returns a whole Series instead of a single rating as
# soon as an id appears more than once in the index.
dico_nodes = BDD_Netflix["Note IMDb"].to_dict()
dico_nodes_film = BDD_Film["Note IMDb"].to_dict()
dico_nodes_serie = BDD_Serie["Note IMDb"].to_dict()

dico_nodes

{'/title/80997613/': 3.9,
 '/title/80074904/': 3.0,
 '/title/80125979/': 5.2,
 '/title/80147908/': 5.3,
 '/title/81092768/': 6.1,
 '/title/81092766/': 6.5,
 '/title/70060018/': 5.1,
 '/title/81172729/': 5.7,
 '/title/80141173/': 7.5,
 '/title/80113785/': 7.3,
 '/title/70141813/': 7.6,
 '/title/81033429/': 5.8,
 '/title/80135164/': 6.3,
 '/title/81121181/': 7.2,
 '/title/70270776/': 6.7,
 '/title/70303426/': 6.9,
 '/title/70108779/': 5.8,
 '/title/80161826/': 5.9,
 '/title/70208247/': 7.2,
 '/title/70298992/': 7.0,
 '/title/80198771/': 6.3,
 '/title/80244679/': 6.4,
 '/title/60034573/': 6.1,
 '/title/81035844/': 2.8,
 '/title/70167074/': 6.1,
 '/title/70056440/': 7.6,
 '/title/70269488/': 6.2,
 '/title/70071602/': 7.9,
 '/title/80232740/': 4.0,
 '/title/80221109/': 5.9,
 '/title/80087897/': 6.7,
 '/title/80216612/': nan,
 '/title/70233894/': 5.8,
 '/title/80045922/': 5.6,
 '/title/80130625/': 5.8,
 '/title/80178280/': 6.2,
 '/title/81001887/': 6.1,
 '/title/80095197/': 6.8,
 '/title/801

# Relation between id_title and name of the content

In [34]:
# Map each title id (the index) to the name of the content.
# Series.to_dict() builds the mapping directly from the column. A per-label
# lookup (`column[link]`) returns a whole Series instead of a single name as
# soon as an id appears more than once in the index.
dico_title = BDD_Netflix["Titre"].to_dict()
dico_title_film = BDD_Film["Titre"].to_dict()
dico_title_serie = BDD_Serie["Titre"].to_dict()

dico_title

{'/title/80997613/': '¡Ay, mi madre!',
 '/title/80074904/': '#Horror',
 '/title/80125979/': '#realityhigh',
 '/title/80147908/': '#Rucker50',
 '/title/81092768/': '#Selfie',
 '/title/81092766/': '#Selfie 69',
 '/title/70060018/': '10000',
 '/title/81172729/': '10 jours à Sun City',
 '/title/80141173/': '100 mètres',
 '/title/80113785/': '1000 Rupee Note',
 '/title/70141813/': '127 Heures',
 '/title/81033429/': '15 August',
 '/title/80135164/': '1922',
 '/title/81121181/': '1944, un tunnel en enfer',
 '/title/70270776/': '2 Guns',
 '/title/70303426/': '2 States',
 '/title/70108779/': '2012',
 '/title/80161826/': '2015 Dream Concert',
 '/title/70208247/': '21 Jump Street',
 '/title/70298992/': '22 Jump Street',
 '/title/80198771/': '26 Years',
 '/title/80244679/': '3 minutes ensemble',
 '/title/60034573/': '30 ans sinon rien',
 '/title/81035844/': '30 Days of Luxury',
 '/title/70167074/': '30 minutes maximum',
 '/title/70056440/': '300',
 '/title/70269488/': "300 : La naissance d'un empi

# Exportation

- The links

In [35]:
# One line per link: the key (two ids joined by a tab), a tab, then the weight.
# One file is written per catalogue (full, movies, series).
for path, links in (
    ("../Data/Liaisons.txt", result),
    ("../Data/Liaisons_film.txt", result_film),
    ("../Data/Liaisons_serie.txt", result_serie),
):
    with open(path, "w", encoding = "utf-8") as f:
        f.writelines(f"{key}\t{value}\n" for key, value in links.items())

- The nodes weight

In [36]:
# One line per node: the title id, a tab, then its weight.
# One file is written per catalogue (full, movies, series).
for path, weights in (
    ("../Data/Nodes_weight.txt", dico_nodes),
    ("../Data/Nodes_weight_film.txt", dico_nodes_film),
    ("../Data/Nodes_weight_serie.txt", dico_nodes_serie),
):
    with open(path, "w", encoding = "utf-8") as f:
        f.writelines(f"{key}\t{value}\n" for key, value in weights.items())

- The titles

In [37]:
# One line per content: the title id, a tab, then its name.
# One file is written per catalogue (full, movies, series).
for path, titles in (
    ("../Data/Titles.txt", dico_title),
    ("../Data/Titles_film.txt", dico_title_film),
    ("../Data/Titles_serie.txt", dico_title_serie),
):
    with open(path, "w", encoding = "utf-8") as f:
        f.writelines(f"{key}\t{value}\n" for key, value in titles.items())