Benchmarking data

In [1]:
# Path of the CSV file that the benchmarking dataset is written to and later read back from.
file_name = "benchmarking_data.csv"

In [2]:
import pandas as pd

# Seed dataset (diabetes.csv) downloaded from a public GitHub repository.
data_URL = "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/diabetes.csv"
original_data = pd.read_csv(data_URL)

# Enlarge the dataset by repeating every original row `dup_times` times.
dup_times = 10000
repeated_index = original_data.index.repeat(dup_times)
benchmarking_df = original_data.loc[repeated_index]

# Write the enlarged dataset to disk without the (now repeated) index labels.
benchmarking_df.to_csv(file_name, index=False)

In [3]:
import pandas as pd
import polars as pl
import vaex as vx
import datatable as dt
import plotly.express as px
import time
import tracemalloc as tm
import os
import psutil

Benchmarking Execution Time

In [4]:
def get_df(library_name, file_name):
    """Load a CSV file with the requested dataframe library.

    Args:
        library_name: One of 'polars', 'pandas', 'vaex' or 'datatable'
            (matched case-insensitively).
        file_name: Path of the CSV file to read.

    Returns:
        Whatever the selected library's CSV reader returns for ``file_name``.

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    # Normalise once instead of lower-casing in every comparison.
    library = library_name.lower()

    if library == 'polars':
        return pl.read_csv(file_name)
    if library == 'pandas':
        return pd.read_csv(file_name)
    if library == 'vaex':
        return vx.read_csv(file_name)
    if library == 'datatable':
        return dt.fread(file_name)

    raise ValueError("Invalid library name. Must be 'polars', 'pandas', 'vaex', or 'datatable'")

def read_csv(library_name, df, file_name='benchmarking_data.csv'):
    """Benchmark operation: read ``file_name`` with the requested library.

    Args:
        library_name: One of 'polars', 'pandas', 'vaex' or 'datatable'
            (matched case-insensitively).
        df: Unused. Accepted so that this operation can be invoked with the
            same ``(library_name, df)`` arguments as the other benchmark
            operations.
        file_name: Path of the CSV file to read.

    Returns:
        ``library_name`` exactly as passed in; the loaded data is discarded.

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    library = library_name.lower()

    # The loaded object is intentionally dropped: only the act of reading matters here.
    if library == 'polars':
        pl.read_csv(file_name)
    elif library == 'pandas':
        pd.read_csv(file_name)
    elif library == 'vaex':
        vx.read_csv(file_name)
    elif library == 'datatable':
        dt.fread(file_name)
    else:
        raise ValueError("Invalid library name. Must be 'polars', 'pandas', 'vaex', or 'datatable'")

    return library_name

def group_data(library_name, df, column_name='Pregnancies'):
    """Benchmark operation: group ``df`` by ``column_name`` and take the first row per group.

    Args:
        library_name: One of 'polars', 'vaex', 'pandas' or 'datatable'
            (matched case-insensitively). Selects which library's API is
            used on ``df``.
        df: Data object of the library named by ``library_name``.
        column_name: Column to group by.

    Returns:
        ``library_name`` exactly as passed in; the grouped result is discarded.

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    library = library_name.lower()

    if library == 'polars':
        df.group_by(column_name).first()
    elif library == 'vaex':
        df.groupby(column_name, agg='first')
    elif library == 'pandas':
        df.groupby(column_name).first()
    elif library == 'datatable':
        df[:, dt.first(dt.f[:]), dt.by(column_name)]
    else:
        # 'pandas' is a supported branch, so it must be listed here as well.
        raise ValueError("Invalid library name. Must be 'polars', 'vaex', 'datatable', or 'pandas'")

    return library_name

def sort_data(library_name, df, column_name='Pregnancies'):
    """Benchmark operation: sort ``df`` by ``column_name`` with the given library.

    Args:
        library_name: One of 'polars', 'vaex', 'datatable' or 'pandas'
            (matched case-insensitively).
        df: Data object of the library named by ``library_name``.
        column_name: Column to sort by.

    Returns:
        ``library_name`` exactly as passed in; the sorted result is discarded.

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    library = library_name.lower()

    if library in ('polars', 'vaex', 'datatable'):
        # These three branches all use the same ``sort(column)`` call.
        ordered = df.sort(column_name)
    elif library == 'pandas':
        ordered = pd.DataFrame(df).sort_values(column_name)
    else:
        raise ValueError("Invalid library name. Must be 'polars', 'vaex', 'datatable', or 'pandas'")

    return library_name

def offload_data(library_name, df, column_name='Pregnancies'):
    """Benchmark operation: convert ``df`` to an array via the given library.

    Args:
        library_name: One of 'polars', 'vaex', 'datatable' or 'pandas'
            (matched case-insensitively).
        df: Data object of the library named by ``library_name``.
        column_name: Unused by this operation.

    Returns:
        ``library_name`` exactly as passed in; the converted array is discarded.

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    library = library_name.lower()

    if library in ('polars', 'datatable'):
        # Both branches convert with ``to_numpy()``.
        values = df.to_numpy()
    elif library == 'vaex':
        # vaex goes through a pandas DataFrame first.
        values = df.to_pandas_df().values
    elif library == 'pandas':
        values = pd.DataFrame(df).values
    else:
        raise ValueError("Invalid library name. Must be 'polars', 'vaex', 'datatable', or 'pandas'")

    return library_name

In [5]:
def measure_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and measure how long the call takes.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, execution_time)`` tuple, where ``result`` is the return
        value of ``func`` and ``execution_time`` is the elapsed time in seconds.
    """
    # perf_counter is monotonic and high-resolution, so the difference cannot be
    # distorted by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    # Run the function with the supplied arguments
    result = func(*args, **kwargs)

    execution_time = time.perf_counter() - start_time

    return result, execution_time

def measure_cpu(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and report this process's CPU usage during the call.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, cpu_consumption)`` tuple, where ``result`` is the return
        value of ``func`` and ``cpu_consumption`` is the process CPU percentage
        reported by psutil for the interval spanning the call, divided by the
        CPU count.
    """
    process = psutil.Process()

    # Prime the counter. With interval=None psutil measures since the previous
    # call on this Process object, and the first call only sets that reference
    # point: its return value is a meaningless 0.0 and is therefore ignored.
    process.cpu_percent(interval=None)

    # Run the function with the supplied arguments
    result = func(*args, **kwargs)

    # CPU usage accumulated since the priming call above
    cpu_used = process.cpu_percent(interval=None)

    # cpu_count() can return None when the count is undetermined; fall back to
    # 1 so the normalisation does not raise a TypeError.
    cpu_consumption = cpu_used / (psutil.cpu_count() or 1)

    return result, cpu_consumption

def measure_memory(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` under tracemalloc and report traced memory.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, current_memory_mb, peak_memory_mb)`` tuple: the return
        value of ``func`` followed by the current and peak traced memory,
        both converted from bytes to MB (bytes / 1024 / 1024).

    NOTE(review): tracemalloc only sees allocations made through Python's
    memory allocators; memory allocated natively by the benchmarked libraries
    may not be counted - confirm before comparing libraries on these numbers.
    """
    # Start memory tracing
    tm.start()
    try:
        # Run the function with the supplied arguments
        result = func(*args, **kwargs)

        # Capture the memory usage while tracing is still active
        current_memory, peak_memory = tm.get_traced_memory()
    finally:
        # Always stop tracing, even if func raised, so tracing does not stay
        # enabled for the remainder of the session.
        tm.stop()

    # Convert bytes to MB
    bytes_per_mb = 1024 * 1024
    current_memory_mb = current_memory / bytes_per_mb
    peak_memory_mb = peak_memory / bytes_per_mb

    return result, current_memory_mb, peak_memory_mb



In [6]:
# Benchmark configuration shared by the timing, CPU and memory measurements.
performance_dfs = [pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame()]
functions = [read_csv, group_data, sort_data, offload_data]
operations = ["read", "group", "sort", "offload"]
libraries = ['polars', 'pandas', 'vaex', 'datatable']

# Load the benchmark file once per library so the operations have data to work on.
pandas_df = get_df('pandas', file_name)
polars_df = get_df('polars', file_name)
vaex_df = get_df('vaex', file_name)
dt_df = get_df('datatable', file_name)

# Measure the execution time of every operation for each library.
# All columns are collected first and the table is built once, instead of
# merging a new one-column frame into the result on every iteration.
time_columns = {"library": libraries}
for function, op in zip(functions, operations):
    time_columns[f"{op}_time"] = [
        measure_time(function, library, df)[1]
        for library, df in zip(libraries, [polars_df, pandas_df, vaex_df, dt_df])
    ]

performance_dfs[0] = pd.DataFrame(time_columns)

In [7]:
# Display the execution-time table (values in seconds, one row per library).
performance_dfs[0]

Unnamed: 0,library,read_time,group_time,sort_time,offload_time
0,polars,5.17035,1.343631,14.604167,6.088951
1,pandas,34.560065,5.457632,12.116139,4.073132
2,vaex,30.448416,1.893596,1.135354,18.091531
3,datatable,3.089096,0.898707,0.417167,2.905302


In [8]:
# Data objects in the same order as `libraries`.
benchmark_frames = [polars_df, pandas_df, vaex_df, dt_df]

# Measure the CPU consumption of every operation for each library.
# The table is rebuilt from scratch, so re-running this cell does not merge
# new columns onto results left over from an earlier run.
cpu_columns = {"library": libraries}
for function, op in zip(functions, operations):
    cpu_columns[f"{op}_cpu"] = [
        measure_cpu(function, library, df)[1]
        for library, df in zip(libraries, benchmark_frames)
    ]
performance_dfs[1] = pd.DataFrame(cpu_columns)

# Measure the current and peak memory of every operation for each library.
memory_columns = {"library": libraries}
for function, op in zip(functions, operations):
    # measure_memory returns (result, current_mb, peak_mb); keep the last two.
    measurements = [
        measure_memory(function, library, df)[1:]
        for library, df in zip(libraries, benchmark_frames)
    ]
    memory_columns[f"{op}_memory"] = [current_mb for current_mb, _ in measurements]
    memory_columns[f"{op}_peak"] = [peak_mb for _, peak_mb in measurements]
performance_dfs[2] = pd.DataFrame(memory_columns)

In [9]:
# Descriptive column names (with units) for the time, CPU and memory tables.
time_labels = {
    "read_time": "read_execution_time (s)",
    "group_time": "group_execution_time (s)",
    "sort_time": "sort_execution_time (s)",
    "offload_time": "offload_execution_time (s)",
}
cpu_labels = {
    "read_cpu": "read_cpu_usage (%)",
    "group_cpu": "group_cpu_usage (%)",
    "sort_cpu": "sort_cpu_usage (%)",
    "offload_cpu": "offload_cpu_usage (%)",
}
memory_labels = {
    "read_memory": "read_memory_usage (MB)",
    "read_peak": "read_memory_peak (MB)",
    "group_memory": "group_memory_usage (MB)",
    "group_peak": "group_memory_peak (MB)",
    "sort_memory": "sort_memory_usage (MB)",
    "sort_peak": "sort_memory_peak (MB)",
    "offload_memory": "offload_memory_usage (MB)",
    "offload_peak": "offload_memory_peak (MB)",
}

# Rename the three tables in place; zip stops after the three label maps.
for frame, labels in zip(performance_dfs, (time_labels, cpu_labels, memory_labels)):
    frame.rename(columns=labels, inplace=True)

# Combine everything into one table keyed by library.
performance_df = (
    performance_dfs[0]
    .merge(performance_dfs[1], on="library", how="left")
    .merge(performance_dfs[2], on="library", how="left")
)

# Record the number of rows of the benchmarked dataset alongside the metrics.
performance_df["df_lenght"] = pandas_df.shape[0]

In [10]:
# Plot the memory usage chart
def plot_memory_usage(performance_df, metric, graph_title='Memory Usage by Library'):
    """Build a grouped bar chart of memory usage and memory peak per library.

    Args:
        performance_df: DataFrame with a 'library' column and the memory
            metric columns to plot.
        metric: Name of a memory column. A name containing "memory_usage" is
            paired with its "memory_peak" counterpart and vice versa; any
            other name is plotted on its own.
        graph_title: Title of the chart.

    Returns:
        The Plotly figure, with a logarithmic Y axis.
    """
    # Resolve the usage/peak pair from whichever of the two was requested.
    # Previously only "memory_usage" names were handled, so a "memory_peak"
    # metric failed with an unbound local variable.
    if "memory_usage" in metric:
        columns = [metric, metric.replace("memory_usage", "memory_peak")]
    elif "memory_peak" in metric:
        columns = [metric.replace("memory_peak", "memory_usage"), metric]
    else:
        columns = [metric]
    df = performance_df[['library', *columns]]

    # Reshape to long format, as needed for a grouped bar chart
    df_melted = df.melt(id_vars='library', var_name='memory_type', value_name='memory')

    # Create the grouped bar chart
    fig = px.bar(df_melted, x='library', y='memory', color='memory_type', barmode='group',
                labels={'memory': 'Memory Usage (MB)', 'library': 'Library', 'memory_type': 'Memory Type'},
                title=graph_title)

    # Use a logarithmic scale on the Y axis
    fig.update_layout(yaxis_type="log")
    return fig

In [11]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px

# Build the chart for the selected metric
def create_figure(metric, performance_df):
    """Return the figure that visualises ``metric`` for every library.

    Args:
        metric: Name of the column of ``performance_df`` to plot.
        performance_df: DataFrame with a 'library' column plus metric columns.

    Returns:
        The figure produced by ``plot_memory_usage`` for memory metrics, a
        bar chart for any other column present in ``performance_df``, or an
        empty dict when the metric is not a known column.
    """
    # Memory metrics are delegated to the dedicated memory chart
    is_memory_metric = 'memory_usage' in metric or 'memory_peak' in metric
    if is_memory_metric:
        return plot_memory_usage(performance_df, metric)

    # Unknown metric: nothing to draw
    if metric not in performance_df.columns:
        return {}

    selected = performance_df[['library', metric]]
    chart_data = pd.DataFrame({"library": selected['library'], "value": selected[metric]})

    # Pick the title suffix from the kind of metric selected
    if 'time' in metric:
        unit_suffix = ' (seconds)'
    elif 'cpu' in metric:
        unit_suffix = ' (%)'
    else:
        unit_suffix = ''
    chart_title = f'{metric.capitalize()} Comparison{unit_suffix}'

    # Create the bar chart
    return px.bar(chart_data, x='library', y='value', title=chart_title)

# Initialise the Dash app
app = dash.Dash(__name__)

# (label, column name) pairs offered in the metric dropdown
METRIC_CHOICES = [
    ('Read Execution Time (s)', 'read_execution_time (s)'),
    ('Group Execution Time (s)', 'group_execution_time (s)'),
    ('Sort Execution Time (s)', 'sort_execution_time (s)'),
    ('Offload Execution Time (s)', 'offload_execution_time (s)'),
    ('Read CPU Usage (%)', 'read_cpu_usage (%)'),
    ('Group CPU Usage (%)', 'group_cpu_usage (%)'),
    ('Sort CPU Usage (%)', 'sort_cpu_usage (%)'),
    ('Offload CPU Usage (%)', 'offload_cpu_usage (%)'),
    ('Read Memory Usage (MB)', 'read_memory_usage (MB)'),
    ('Group Memory Usage (MB)', 'group_memory_usage (MB)'),
    ('Sort Memory Usage (MB)', 'sort_memory_usage (MB)'),
    ('Offload Memory Usage (MB)', 'offload_memory_usage (MB)'),
]

# App layout: a title, the metric selector and the graph it drives
app.layout = html.Div([
    html.H1("Performance Comparison (Execution Time, CPU Usage, Memory Usage)"),
    dcc.Dropdown(
        id='metric-dropdown',
        options=[{'label': label, 'value': value} for label, value in METRIC_CHOICES],
        value='read_execution_time (s)',  # Default value
        clearable=False
    ),
    dcc.Graph(id='performance-graph')
])

# Callback that updates the graph based on the selected metric
@app.callback(
    Output('performance-graph', 'figure'),
    [Input('metric-dropdown', 'value')]
)
def update_graph(selected_metric):
    """Return the figure for ``selected_metric``, built from the module-level ``performance_df``."""
    return create_figure(selected_metric, performance_df)

# Run the app
if __name__ == '__main__':
    # Newer Dash releases replace `run_server` with `run`. Prefer `run` when the
    # installed version provides it and fall back to `run_server` otherwise.
    if hasattr(app, 'run'):
        app.run(debug=True, port=8060)
    else:
        app.run_server(debug=True, port=8060)


In [12]:
# Save the combined metrics; the file name records the row-duplication factor (dup_times) used.
performance_df.to_csv(f"performance_data_dup{dup_times}.csv", index=False)