# Package imports

In [1]:
import pandas as pd
import numpy as np
import glob
import re
from collections import Counter
from loky import ProcessPoolExecutor

# Function definitions

The **clean_content** function cleans up a viewing history. For example, if a user has watched a series, and therefore several episodes, the history contains one line per episode. We want to keep only the name of the series, and to keep it only once.

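The **clean_viewing_history** function loads one user's viewing-history CSV file, removes trailer ("Bande-annonce") and "Bientôt disponible" entries, and applies **clean_content** to the remaining titles.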

The **all_history** function runs **clean_viewing_history** on several history files in parallel.

In [2]:
def clean_content(hist, max_prefix_repeats=5):
    """Collapse a raw viewing history into a sorted array of unique titles.

    Each entry is classified by the number of ``':'``-separated parts:

    * three or more parts: the entry is reduced to the text captured before
      ``": Saison <n>:"``, ``": Partie <n>:"`` or ``": Collection:"`` (first
      pattern that matches, in that order); if none matches, the text before
      the first colon is kept;
    * no colon: the entry is kept unchanged;
    * exactly one colon: the entry is kept unchanged if the colon is written
      ``" : "``, or if the text before the colon occurs at most
      ``max_prefix_repeats`` times among the one-colon entries. Otherwise the
      entry is dropped.

    NOTE(review): one-colon entries whose prefix is frequent are dropped
    without the bare prefix being added to the result - confirm this is the
    intended behaviour.

    Parameters
    ----------
    hist : iterable of str
        Raw titles, one per viewing-history line.
    max_prefix_repeats : int, default 5
        A one-colon entry is dropped when its prefix occurs strictly more
        than this many times (unless the colon is written ``" : "``).

    Returns
    -------
    numpy.ndarray
        The result of ``np.unique`` on the kept titles (sorted, no duplicates).
    """
    # Tried in this order; group 1 is the text kept for the entry.
    series_patterns = (
        r"(.*): Saison [0-9]{1,}:",
        r"(.*): Partie [0-9I]{1,}:",
        r"(.*): Collection:",
    )

    kept = []
    one_colon = []  # entries with exactly one ':'; decided once all prefixes are counted

    for title in hist:
        parts = title.split(':')
        if len(parts) > 2:
            for pattern in series_patterns:
                found = re.search(pattern, title)
                if found:
                    kept.append(found.group(1))
                    break
            else:
                # No known season/part/collection marker: keep the leading segment.
                kept.append(parts[0])
        elif len(parts) == 2:
            one_colon.append(title)
        else:
            kept.append(title)

    prefix_counts = Counter(title.split(':')[0] for title in one_colon)
    frequent_prefixes = {
        prefix for prefix, count in prefix_counts.items() if count > max_prefix_repeats
    }

    for title in one_colon:
        if len(title.split(' : ')) == 2 or title.split(':')[0] not in frequent_prefixes:
            kept.append(title)

    return np.unique(kept)

def clean_viewing_history(filename):
    """Load one viewing-history CSV and return its cleaned, unique titles.

    The ``Title`` column is read, non-breaking spaces are replaced by regular
    spaces, ``*`` is escaped as ``\\*``, entries containing ``'Bande-annonce'``
    or ``'Bientôt disponible'`` are removed, and the remaining titles are
    passed to ``clean_content``.

    Parameters
    ----------
    filename : str or path-like
        Path of a comma-separated file with a ``Title`` column.

    Returns
    -------
    numpy.ndarray
        Whatever ``clean_content`` returns for the filtered titles.
    """
    # Read titles as text so numeric-looking titles stay strings, and drop
    # rows without a title: both would otherwise break the substring tests.
    titles = pd.read_csv(filename, sep=',', dtype={'Title': str}).Title.dropna()

    # '\\*' is a backslash followed by '*', spelled without an invalid escape.
    normalized = [
        title.replace('\xa0', ' ').replace('*', '\\*')
        for title in titles
    ]

    kept = [
        title for title in normalized
        if 'Bande-annonce' not in title and 'Bientôt disponible' not in title
    ]
    return clean_content(kept)

def all_history(key):
    """Apply ``clean_viewing_history`` to every item of ``key`` via a process pool.

    Parameters
    ----------
    key : iterable
        Items passed one by one to ``clean_viewing_history`` (file paths in
        this notebook).

    Returns
    -------
    list
        One ``clean_viewing_history`` result per item of ``key``.
    """
    with ProcessPoolExecutor() as pool:
        pending_results = pool.map(clean_viewing_history, key)
    # The map result is materialised only after the pool context has exited.
    return [cleaned for cleaned in pending_results]

# Cleaning viewing history

In [3]:
# Collect every raw per-user history file. The paths are sorted as plain
# strings so the order is deterministic (note: "user10" sorts before "user2").
files = glob.glob('../Data/Historiques_FilmsVu_Netflix/Historique_user*.csv')
files.sort()
files

['../Data/Historiques_FilmsVu_Netflix\\Historique_user1.csv',
 '../Data/Historiques_FilmsVu_Netflix\\Historique_user2.csv',
 '../Data/Historiques_FilmsVu_Netflix\\Historique_user3.csv',
 '../Data/Historiques_FilmsVu_Netflix\\Historique_user4.csv']

In [4]:
# Sanity check on a single file: show the first 10 cleaned titles of the first history.
clean_viewing_history(files[0])[:10]

array(['6 Underground', 'After Life', 'Ahmed Sylla - Avec un grand A',
       'American Psycho', 'American Vandal',
       'Ano Hi Mita Hana no Namae o Bokutachi wa Mada Shiranai', 'Archer',
       "Atlantide : L'empire perdu", 'Atypical', 'Avatar '], dtype='<U77')

In [5]:
# Clean all history files through the process pool; `hist` is a list holding
# one array of cleaned titles per entry of `files`.
hist = all_history(files)
hist

[array(['6 Underground', 'After Life', 'Ahmed Sylla - Avec un grand A',
        'American Psycho', 'American Vandal',
        'Ano Hi Mita Hana no Namae o Bokutachi wa Mada Shiranai', 'Archer',
        "Atlantide : L'empire perdu", 'Atypical', 'Avatar ',
        'B: The Beginning', 'BTOOOM!',
        "Bill Burr: I'm Sorry You Feel That Way", 'Bill Burr: Paper Tiger',
        'Bill Burr: Walk Your Way Out', 'Black Bullet', 'Black Butler',
        'Black Butler : Book of the Atlantic', 'Black Mirror',
        'Black Mirror: Bandersnatch',
        'Blanche Gardin : Je parle toute seule', 'Blue Exorcist',
        'Brooklyn Nine-Nine', 'Charlotte', 'Cowboy Bebop',
        'DanMachi : Familia Myth', 'Dave Chappelle',
        'Dave Chappelle: Equanimity & The Bird Revelation',
        'Dave Chappelle: Sticks & Stones', 'Daybreak',
        'Death Parade: Death Parade', 'Devilman Crybaby', 'Dix pour cent',
        'Désenchantée', 'En bref', 'Entre deux fougères : Le film',
        'Erased', 'Ev

# Export

In [6]:
# Write each cleaned history to a text file, one title per line.
n = len(hist)
assert len(files) == n, "hist must hold exactly one cleaned history per input file"

for i, (source_file, titles) in enumerate(zip(files, hist)):
    # Take the user id from the source file name rather than from the list
    # position: `files` is sorted as strings, so with 10+ users the position
    # no longer matches the user number ("user10" comes before "user2").
    found = re.search(r"Historique_user(.+)\.csv$", source_file)
    user_id = found.group(1) if found else str(i + 1)

    path = f"../Data/Historiques_FilmsVu_Netflix/CleanHistory_user{user_id}.txt"
    content = "\n".join(titles)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)