Benchmarking data

In [5]:
# Name of the CSV file that the generated benchmarking data is written to
# (the actual write happens in the data-generation cell via to_csv).
file_name = "benchmarking_data.csv"

In [17]:
import pandas as pd

data_URL = "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/diabetes.csv"

# How many copies of every source row go into the benchmarking file.
# Lower this value for a quicker, smaller run.
ROW_REPEAT_COUNT = 100_000

original_data = pd.read_csv(data_URL)

# Repeat every index label ROW_REPEAT_COUNT times and select by label, so each
# source row appears that many times in the enlarged frame.
benchmarking_df = original_data.loc[original_data.index.repeat(ROW_REPEAT_COUNT)]

# index=False keeps the (now duplicated) index labels out of the CSV.
benchmarking_df.to_csv(file_name, index=False)

In [1]:
import pandas as pd
import polars as pl
import vaex as vx
import datatable as dt
import plotly.express as px
import time

Benchmarking Execution Time

In [2]:
def plot_metrics(list_exec_time, graph_title, library_names=None):
    """Print a list of execution times and show them as a bar chart.

    Args:
        list_exec_time: Execution times, one per library, in the same order
            as the library names.
        graph_title: Title displayed above the bar chart.
        library_names: Optional names used for the x axis. When omitted, the
            module-level ``libraries`` list is used, which is what the
            function always did before this parameter existed.
    """
    if library_names is None:
        library_names = libraries

    # A plain loop: printing is a side effect, so there is no reason to build
    # a throwaway list of None values with a comprehension.
    for exec_time in list_exec_time:
        print(exec_time)

    df = pd.DataFrame({"library": library_names, "execution_time": list_exec_time})

    # Plot bar plot using Plotly Express
    fig = px.bar(df, x='library', y='execution_time', title=graph_title)
    fig.show()

In [3]:
def read_csv_with_time(library_name, file_name):
    """Read a CSV file with the requested library and time the call.

    Args:
        library_name: 'polars', 'pandas', 'vaex' or 'datatable'
            (compared case-insensitively).
        file_name: Path of the CSV file to read.

    Returns:
        dict with three keys: "library" (``library_name`` exactly as passed
        in), "execution_time" (elapsed seconds as a float) and "df" (the
        object returned by the library's reader).

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    backend = library_name.lower()

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() follows the wall clock and can jump if the system
    # time is adjusted while the benchmark runs.
    start_time = time.perf_counter()

    if backend == 'polars':
        df = pl.read_csv(file_name)

    elif backend == 'pandas':
        df = pd.read_csv(file_name)

    elif backend == 'vaex':
        df = vx.read_csv(file_name)

    elif backend == 'datatable':
        df = dt.fread(file_name)

    else:
        raise ValueError("Invalid library name. Must be 'polars', 'pandas', 'vaex', or 'datatable'")

    end_time = time.perf_counter()

    return {"library": library_name, "execution_time": end_time - start_time, "df": df}

In [4]:
def group_data_with_time(library_name, df, column_name='Pregnancies'):
    """Group ``df`` by one column, take the first row of each group, and time it.

    Args:
        library_name: 'polars', 'vaex', 'pandas' or 'datatable'
            (compared case-insensitively). It selects which library's API is
            called on ``df``, so ``df`` must be that library's frame type.
        df: Frame to group.
        column_name: Name of the column to group by.

    Returns:
        dict with keys "library" (``library_name`` as passed in) and
        "execution_time" (elapsed seconds as a float).

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    backend = library_name.lower()

    # Monotonic, high-resolution clock intended for measuring durations.
    start_time = time.perf_counter()

    # The result is bound to a local so it stays referenced until the function
    # returns, i.e. it is not released while the timer is still running.
    if backend == 'polars':
        df_grouped = df.group_by(column_name).first()

    elif backend == 'vaex':
        df_grouped = df.groupby(column_name, agg='first')

    elif backend == 'pandas':
        df_grouped = df.groupby(column_name).first()

    elif backend == 'datatable':
        df_grouped = df[:, dt.first(dt.f[:]), dt.by(column_name)]

    else:
        # The message lists every supported library, including 'pandas'.
        raise ValueError("Invalid library name. Must be 'polars', 'vaex', 'datatable', or 'pandas'")

    end_time = time.perf_counter()

    return {"library": library_name, "execution_time": end_time - start_time}

In [5]:
def sort_data_with_time(library_name, df, column_name='Pregnancies'):
    """Sort ``df`` by one column with the requested library and time the call.

    Args:
        library_name: 'polars', 'vaex', 'datatable' or 'pandas'
            (compared case-insensitively). It selects which library's API is
            called on ``df``.
        df: Frame to sort.
        column_name: Name of the column to sort by.

    Returns:
        dict with keys "library" (``library_name`` as passed in) and
        "execution_time" (elapsed seconds as a float).

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    backend = library_name.lower()

    # Monotonic, high-resolution clock intended for measuring durations.
    start_time = time.perf_counter()

    # The result is bound to a local so it stays referenced until the function
    # returns, i.e. it is not released while the timer is still running.
    if backend in ('polars', 'vaex', 'datatable'):
        # All three libraries expose the same ``sort(column)`` call.
        df_sorted = df.sort(column_name)

    elif backend == 'pandas':
        # The pd.DataFrame(...) wrapper is kept as-is; the time it takes is
        # part of the measured duration.
        df_sorted = pd.DataFrame(df).sort_values(column_name)

    else:
        raise ValueError("Invalid library name. Must be 'polars', 'vaex', 'datatable', or 'pandas'")

    end_time = time.perf_counter()

    return {"library": library_name, "execution_time": end_time - start_time}

In [6]:
def offload_data_with_time(library_name, df):
    """Convert ``df`` to an array with the requested library and time the call.

    Args:
        library_name: 'polars', 'vaex', 'datatable' or 'pandas'
            (compared case-insensitively). It selects which library's API is
            called on ``df``.
        df: Frame to convert.

    Returns:
        dict with keys "library" (``library_name`` as passed in) and
        "execution_time" (elapsed seconds as a float). The converted array
        itself is not returned.

    Raises:
        ValueError: If ``library_name`` is not one of the supported names.
    """
    backend = library_name.lower()

    # Monotonic, high-resolution clock intended for measuring durations.
    start_time = time.perf_counter()

    # The result is bound to a local so it stays referenced until the function
    # returns, i.e. it is not released while the timer is still running.
    if backend in ('polars', 'datatable'):
        # Both libraries expose a direct ``to_numpy()`` conversion.
        array = df.to_numpy()

    elif backend == 'vaex':
        # vaex goes through a pandas DataFrame first; that step is timed too.
        array = df.to_pandas_df().values

    elif backend == 'pandas':
        array = pd.DataFrame(df).values

    else:
        raise ValueError("Invalid library name. Must be 'polars', 'vaex', 'datatable', or 'pandas'")

    end_time = time.perf_counter()

    return {"library": library_name, "execution_time": end_time - start_time}

In [49]:
# Build the memory-usage chart
def plot_memory_usage(performance_df, graph_title='Memory Usage by Library'):
    """Return a grouped bar chart of current and peak memory per library.

    Args:
        performance_df: DataFrame holding at least the columns 'library',
            'memory_usage' and 'memory_peak'.
        graph_title: Title displayed above the chart.

    Returns:
        The Plotly figure; it is returned, not shown.
    """
    memory_columns = ['library', 'memory_usage', 'memory_peak']

    # Reshape to long format (one row per library and memory type), which is
    # what a grouped bar chart needs.
    long_format = performance_df[memory_columns].melt(
        id_vars='library',
        var_name='memory_type',
        value_name='memory',
    )

    axis_labels = {'memory': 'Memory Usage (MB)', 'library': 'Library', 'memory_type': 'Memory Type'}

    # One group of bars per library, one bar per memory type.
    fig = px.bar(
        long_format,
        x='library',
        y='memory',
        color='memory_type',
        barmode='group',
        labels=axis_labels,
        title=graph_title,
    )

    # Logarithmic y axis
    fig.update_layout(yaxis_type="log")
    return fig

In [35]:
import tracemalloc as tm
import os

libraries = []
read_time = []
group_time = []
sort_time = []
offload_time = []
file_name = "benchmarking_data.csv"

# Libraries are benchmarked in this fixed order; every result list follows it.
benchmark_order = ('pandas', 'polars', 'vaex', 'datatable')

# Step 1: read the CSV once per library and keep each loaded frame so the
# later steps run on data that is already in memory.
loaded_frames = {}
for library_name in benchmark_order:
    result = read_csv_with_time(library_name, file_name)
    libraries.append(result["library"])
    read_time.append(result["execution_time"])
    loaded_frames[library_name] = result["df"]

# Per-library names for the loaded frames, used by the cells that follow.
pandas_df = loaded_frames['pandas']
polars_df = loaded_frames['polars']
vaex_df = loaded_frames['vaex']
dt_df = loaded_frames['datatable']

# Steps 2-4: run one operation across all libraries before moving on to the
# next operation (all group-bys, then all sorts, then all offloads).
for timings, benchmark in (
    (group_time, group_data_with_time),
    (sort_time, sort_data_with_time),
    (offload_time, offload_data_with_time),
):
    for library_name in benchmark_order:
        timings.append(benchmark(library_name, loaded_frames[library_name])["execution_time"])

list_memory_usage = []
# Measure the traced memory of the offload step, one library at a time.
# NOTE(review): tracemalloc only reports allocations made through Python's
# memory allocators; libraries that allocate natively may be under-reported,
# so confirm before comparing these figures across libraries.
for library, df in zip(libraries, [pandas_df, polars_df, vaex_df, dt_df]):
    # Start tracing right before the measured call so that every library gets
    # its own current/peak figures.
    tm.start()
    try:
        offload_data_with_time(library, df)
        # Traced sizes in bytes, read after the offload call has returned.
        current, peak = tm.get_traced_memory()
    finally:
        # Always stop tracing, even if the offload fails, so that tracing does
        # not stay enabled for the rest of the session.
        tm.stop()

    # Stored in bytes; the conversion to MB happens once, below.
    list_memory_usage.append((current, peak))

# Convert the byte counts to megabytes (10**6 bytes per MB).
memory_usage = [current_bytes / 10**6 for current_bytes, _ in list_memory_usage]
memory_peak = [peak_bytes / 10**6 for _, peak_bytes in list_memory_usage]

# One summary row per library; the scalar row count is repeated on every row.
performance_df = pd.DataFrame(
    {
        "library": libraries,
        "df_lines": len(pandas_df),
        "read_time": read_time,
        "group_time": group_time,
        "sort_time": sort_time,
        "offload_time": offload_time,
        "memory_usage": memory_usage,
        "memory_peak": memory_peak,
    }
)

In [51]:
# Display the benchmark summary table (one row per library).
performance_df

Unnamed: 0,library,df_lines,read_time,group_time,sort_time,offload_time,memory_usage,memory_peak
0,pandas,76800000,41.87836,6.473221,14.876713,9.180316,0.304858,5539.926262
1,polars,76800000,5.334818,4.959993,13.24665,6.733349,0.298962,0.299098
2,vaex,76800000,47.311691,2.243328,1.436094,16.351222,0.61208,14141.535591
3,datatable,76800000,3.913777,0.742369,0.492761,2.33661,0.149046,0.149538


In [52]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px

# Build the figure for the metric selected in the dropdown
def create_figure(metric, performance_df):
    """Return the chart for ``metric``, or an empty figure spec if unsupported.

    Args:
        metric: Column of ``performance_df`` to plot. 'memory_usage' is
            delegated to ``plot_memory_usage``; 'memory_peak' has no chart of
            its own.
        performance_df: DataFrame with a 'library' column and one column per
            metric.

    Returns:
        A Plotly figure, or ``{}`` when ``metric`` is 'memory_peak' or is not
        a column of ``performance_df``.
    """
    if metric == 'memory_usage':
        return plot_memory_usage(performance_df)

    if metric in performance_df.columns and metric != 'memory_peak':
        df = pd.DataFrame({"library": performance_df['library'], "execution_time": performance_df[metric]})

        # Drop the '_time' suffix so the title reads "Read Time Comparison"
        # instead of "Read_time Time Comparison".
        metric_label = metric[:-len('_time')] if metric.endswith('_time') else metric
        fig = px.bar(df, x='library', y='execution_time', title=f'{metric_label.capitalize()} Time Comparison')
        return fig

    return {}

# Create the Dash app
app = dash.Dash(__name__)

# App layout: heading, metric selector, and the graph the selector controls
app.layout = html.Div([
    html.H1("Execution Time and Memory Usage Comparison"),
    dcc.Dropdown(
        id='metric-dropdown',
        # One dropdown entry per (label, value) pair, in display order.
        options=[
            {'label': label, 'value': value}
            for label, value in (
                ('Read', 'read_time'),
                ('Grouping', 'group_time'),
                ('Sort', 'sort_time'),
                ('Offload', 'offload_time'),
                ('Memory Usage', 'memory_usage'),
            )
        ],
        value='read_time',  # Default selection
        clearable=False
    ),
    dcc.Graph(id='execution-time-graph')
])

# Callback that redraws the graph whenever the selected metric changes
@app.callback(
    Output('execution-time-graph', 'figure'),
    [Input('metric-dropdown', 'value')]
)
def update_graph(selected_metric):
    """Return the figure for the metric currently selected in the dropdown.

    The figure is built from the module-level ``performance_df``.
    """
    figure = create_figure(selected_metric, performance_df)
    return figure

# Start the Dash development server
if __name__ == '__main__':
    # Use app.run when the installed Dash provides it and fall back to the
    # older run_server entry point otherwise.
    # NOTE(review): recent Dash releases favour app.run over run_server;
    # confirm against the installed Dash version.
    if hasattr(app, 'run'):
        app.run(debug=True)
    else:
        app.run_server(debug=True)
