In [37]:
import csv
from memory_profiler import memory_usage



def loading_from_csv_DictReader(filename):
    """Load an entire CSV file into memory as a list of row dictionaries.

    The first line of the file is used as the header; every following record
    becomes one dict mapping column name -> raw string value (no type
    conversion is performed).

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row, in file order.
    """
    # newline="" is required by the csv module: it disables universal-newline
    # translation so that line breaks embedded in quoted fields are handed to
    # the parser untouched instead of being rewritten or split.
    with open(filename, encoding="utf8", newline="") as csvfile:
        return list(csv.DictReader(csvfile))



def extract_top5_DictReader(data):
    """Return the five most popular artists from a list of row dicts.

    Args:
        data: Iterable of dicts that each have a 'name' key and a
            'popularity' key whose value is accepted by ``float()``.

    Returns:
        A list of at most five ``{'name': ..., 'popularity': float}`` dicts,
        ordered by decreasing popularity. Rows with equal popularity keep
        their relative input order (the sort is stable).

    Raises:
        KeyError: If a row lacks 'name' or 'popularity'.
        ValueError: If a popularity value cannot be converted to float.
    """
    # Convert inside the sort key rather than writing the float back into
    # each row: the caller's data must stay untouched so that repeated calls
    # (e.g. benchmark loops) always start from the same input.
    top5_artists = sorted(
        data,
        key=lambda artist: float(artist['popularity']),
        reverse=True,
    )[:5]

    return [
        {'name': artist['name'], 'popularity': float(artist['popularity'])}
        for artist in top5_artists
    ]



# Load the memory_profiler IPython extension (used below through %memit).
%load_ext memory_profiler


# NOTE(review): absolute, machine-specific path - this cell only runs on a
# machine where this exact file location exists.
filename = '/Users/theoverdelhan/Documents/EDUCATION/FG4A DATA/Data models/artists_rev1.csv'


# Result stores shared by every benchmark cell of the notebook, keyed by
# "<method>_<step>".
time_measures={}
memory_measures={}

# Measure time and memory for loading the data
# (-o makes each magic return its result object so it can be stored).
time_measures["csv_dictreader_load"] = %timeit -o loading_from_csv_DictReader(filename)
memory_measures["csv_dictreader_load"] = %memit -c -o loading_from_csv_DictReader(filename)


# Load the data for the extraction step
data = loading_from_csv_DictReader(filename)

# Measure time and memory for extracting the top 5 artists
time_measures["csv_dictreader_extract"] = %timeit -o extract_top5_DictReader(data)
memory_measures["csv_dictreader_extract"] = %memit -c -o extract_top5_DictReader(data)




The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
916 ms ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1431.59 MiB, increment: 71.73 MiB
172 ms ± 630 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1430.23 MiB, increment: 61.50 MiB


In [38]:
# Reload the rows from disk and show a quick sanity check: the total row
# count and the first three parsed rows.
data = loading_from_csv_DictReader(filename)
print("Nb of rows: ",len(data))
print("Example of rows extracted: ", data[:3])

Nb of rows:  1162095
Example of rows extracted:  [{'id': '0DheY5irMjBUeLybbCUEZ2', 'followers': '0.0', 'genres': '[""]', 'name': 'Armid & Amir Zare Pashai feat. Sara Rouzbehani', 'popularity': '0'}, {'id': '0DlhY15l3wsrnlfGio2bjU', 'followers': '5.0', 'genres': '[""]', 'name': 'ปูนา ภาวิณี', 'popularity': '0'}, {'id': '0DmRESX2JknGPQyO15yxg7', 'followers': '0.0', 'genres': '[""]', 'name': 'Sadaa', 'popularity': '0'}]


In [39]:
# Display the five most popular artists computed from the DictReader rows.
print(extract_top5_DictReader(data))

[{'name': 'Justin Bieber', 'popularity': 100.0}, {'name': 'Bad Bunny', 'popularity': 98.0}, {'name': 'Taylor Swift', 'popularity': 98.0}, {'name': 'Drake', 'popularity': 98.0}, {'name': 'Juice WRLD', 'popularity': 96.0}]


exercice 2

In [40]:
import csv

def loading_from_csv_reader(filename):
    """Load an entire CSV file into memory as a header plus a list of rows.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A ``(header, data)`` tuple: ``header`` is the list of column names
        taken from the first record, ``data`` is a list of rows, each row
        being a list of raw string values in column order.

    Raises:
        StopIteration: If the file contains no record at all (no header).
    """
    # newline="" is required by the csv module: it disables universal-newline
    # translation so that line breaks embedded in quoted fields are handed to
    # the parser untouched instead of being rewritten or split.
    with open(filename, encoding="utf8", newline="") as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)  # First record holds the column names
        data = list(reader)    # Remaining records are the data rows
    return header, data



def extract_top5_reader(header, data):
    """Return the five most popular artists from positional CSV rows.

    Args:
        header: List of column names; must contain 'name' and 'popularity'.
        data: Iterable of rows (sequences of strings) laid out like ``header``.

    Returns:
        A list of at most five ``{'name': ..., 'popularity': float}`` dicts,
        ordered by decreasing popularity.
    """
    # Resolve the two column positions once, up front.
    col_name = header.index('name')
    col_popularity = header.index('popularity')

    def popularity_of(row):
        # Popularity is stored as text in the rows; compare it numerically.
        return float(row[col_popularity])

    # Sort a copy in descending order, leaving the caller's list untouched.
    ranked = list(data)
    ranked.sort(key=popularity_of, reverse=True)

    # Build the compact result records for the five leading rows.
    result = []
    for row in ranked[:5]:
        result.append({'name': row[col_name], 'popularity': popularity_of(row)})
    return result





# Measure time and memory for loading the file with csv.reader
# (-o makes each magic return its result object so it can be stored).
time_measures["csv_reader_load"] = %timeit -o loading_from_csv_reader(filename)
memory_measures["csv_reader_load"] = %memit -c -o loading_from_csv_reader(filename)

# Load once outside the measurement so the extraction step has its input.
header,data = loading_from_csv_reader(filename)

# Measure time and memory for extracting the top 5 artists.
time_measures["csv_reader_extract"] = %timeit -o extract_top5_reader(header, data)
memory_measures["csv_reader_extract"] = %memit -c -o extract_top5_reader(header, data)



509 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1904.53 MiB, increment: 63.80 MiB
169 ms ± 1.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1844.33 MiB, increment: 61.69 MiB


exercice 3

In [41]:
import pandas as pd

def loading_from_pandas_csv(filename):
    """Read a UTF-8 encoded CSV file into a pandas DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with all columns of the file.
    """
    frame = pd.read_csv(filename, encoding="utf8")
    return frame


def extract_top5_pandas(data):
    """Return the name and popularity of the five most popular artists.

    Args:
        data: DataFrame with at least a "name" and a "popularity" column.

    Returns:
        A DataFrame of at most five rows, restricted to the "name" and
        "popularity" columns and ordered by decreasing popularity. The
        original index labels of the selected rows are kept.
    """
    wanted_columns = ["name", "popularity"]

    # Rank every row from most to least popular, then keep the leading five.
    ranked = data.sort_values(by="popularity", ascending=False)
    leaders = ranked.head(5)

    return leaders[wanted_columns]



# Load the memory_profiler IPython extension (used below through %memit).
%load_ext memory_profiler



# Measure time and memory for loading the full CSV with pandas
# (-o makes each magic return its result object so it can be stored).
time_measures["pandas_csv_load"] = %timeit -o loading_from_pandas_csv (filename)
memory_measures["pandas_csv_load"] = %memit -c -o loading_from_pandas_csv (filename)

# Load once outside the measurement so the extraction step has its input.
data = loading_from_pandas_csv(filename)

# Measure time and memory for extracting the top 5 artists.
time_measures["pandas_csv_extract"] = %timeit -o extract_top5_pandas(data)
memory_measures["pandas_csv_extract"] = %memit -c -o extract_top5_pandas(data)


The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
737 ms ± 9.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1860.11 MiB, increment: 61.70 MiB
84.1 ms ± 456 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1630.64 MiB, increment: 62.11 MiB


exercice 4

In [42]:
import pandas as pd

def loading_from_pandas_csv_optimized(filename):
    """Read only the 'name' and 'popularity' columns of a UTF-8 CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame containing just the two requested columns.
    """
    # Restrict parsing to the columns the top-5 extraction actually uses.
    needed_columns = ['name', 'popularity']
    return pd.read_csv(filename, usecols=needed_columns, encoding="utf8")


# Load the memory_profiler IPython extension (used below through %memit).
%load_ext memory_profiler



# Measure time and memory for loading only the two needed columns with pandas
# (-o makes each magic return its result object so it can be stored).
time_measures["pandas_csv_optim_load"] = %timeit -o loading_from_pandas_csv_optimized(filename)
memory_measures["pandas_csv_optim_load"] = %memit -c -o loading_from_pandas_csv_optimized(filename)

# Load once outside the measurement so the extraction step has its input.
data = loading_from_pandas_csv_optimized(filename)

# Measure time and memory for extracting the top 5 artists from the reduced frame.
time_measures["pandas_csv_optim_extract"] = %timeit -o extract_top5_pandas(data)
memory_measures["pandas_csv_optim_extract"] = %memit -c -o extract_top5_pandas(data)


The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
426 ms ± 3.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1635.06 MiB, increment: 61.52 MiB
57.9 ms ± 88.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1444.19 MiB, increment: 62.64 MiB


In [43]:
# Display every timing result collected by the benchmark cells.
time_measures

{'csv_dictreader_load': <TimeitResult : 916 ms ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>,
 'csv_dictreader_extract': <TimeitResult : 172 ms ± 630 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)>,
 'csv_reader_load': <TimeitResult : 509 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>,
 'csv_reader_extract': <TimeitResult : 169 ms ± 1.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)>,
 'pandas_csv_load': <TimeitResult : 737 ms ± 9.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>,
 'pandas_csv_extract': <TimeitResult : 84.1 ms ± 456 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)>,
 'pandas_csv_optim_load': <TimeitResult : 426 ms ± 3.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>,
 'pandas_csv_optim_extract': <TimeitResult : 57.9 ms ± 88.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)>}

In [44]:
# Display every memory result collected by the benchmark cells.
memory_measures

{'csv_dictreader_load': <MemitResult : peak memory: 1431.59 MiB, increment: 71.73 MiB>,
 'csv_dictreader_extract': <MemitResult : peak memory: 1430.23 MiB, increment: 61.50 MiB>,
 'csv_reader_load': <MemitResult : peak memory: 1904.53 MiB, increment: 63.80 MiB>,
 'csv_reader_extract': <MemitResult : peak memory: 1844.33 MiB, increment: 61.69 MiB>,
 'pandas_csv_load': <MemitResult : peak memory: 1860.11 MiB, increment: 61.70 MiB>,
 'pandas_csv_extract': <MemitResult : peak memory: 1630.64 MiB, increment: 62.11 MiB>,
 'pandas_csv_optim_load': <MemitResult : peak memory: 1635.06 MiB, increment: 61.52 MiB>,
 'pandas_csv_optim_extract': <MemitResult : peak memory: 1444.19 MiB, increment: 62.64 MiB>}