# Benchmark Pandas vs Cudf
- Using *timeit*

### System details

#### GPU

In [31]:
!nvidia-smi -q



Timestamp                           : Thu Jul  4 06:05:46 2019
Driver Version                      : 418.56
CUDA Version                        : 10.1

Attached GPUs                       : 1
GPU 00000000:0B:00.0
    Product Name                    : Tesla V100-SXM2-32GB
    Product Brand                   : Tesla
    Display Mode                    : Enabled
    Display Active                  : Disabled
    Persistence Mode                : Disabled
    Accounting Mode                 : Disabled
    Accounting Mode Buffer Size     : 4000
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : 0321018116769
    GPU UUID                        : GPU-a2891a13-c0c9-5148-3cfd-b103c41a059b
    Minor Number                    : 3
    VBIOS Version                   : 88.00.80.00.04
    MultiGPU Board                  : No
    Board ID                        : 0xb00
    GPU Part Number                 : 692-

#### CPU

In [32]:
!less /proc/cpuinfo

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 79
model name      : Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz
stepping        : 1
microcode       : 0xb00002e
cpu MHz         : 2909.843
cache size      : 51200 KB
physical id     : 0
siblings        : 40
core id         : 0
cpu cores       : 20
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 20
wp              : yes
[K:[K

## Preperations
- Imports
- Variables

In [3]:
import os

In [4]:
import cudf
import pandas as pd

In [5]:
# Source files
raw_parquet_file = 'logs_sample/haproxy_logs_10k.parquet'
raw_json_file = 'logs_sample/haproxy_json_logs_raw_original.txt'

# Benchmark configuration
nlargest = 300

## Target file size validation
Set target size (in MB) for the test file

In [6]:
# Source to be used
source_file = raw_json_file 

# Target file for multiplication
target_file = 'logs_sample/1gb.file' 

In [7]:
target_test_size_in_mb = 1000

In [8]:
multiplication_factor = target_test_size_in_mb // (os.path.getsize(source_file) >> 20)
print(f'Multiply target file by {multiplication_factor}')

Multiply target file by 200


In [None]:
with open(source_file, 'r') as source:
    with open(target_file, 'w') as target:
        [target.write(line) 
         for line in source.readlines() 
         for m in range(multiplication_factor)]

In [None]:
!ls -lah logs_sample/

## Benchmark

### Flow
- Read file
- Compute aggregations
- get nlargest()

In [9]:
benchmark_file = target_file

#### cuDF

In [10]:
%%timeit

# Read file
# gdf = cudf.read_parquet(benchmark_file)
gdf = cudf.read_json(benchmark_file, lines=True)

# Perform aggregation
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })

# Select top N
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

1.36 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [11]:
%%timeit

# Read file
# gdf = pd.read_parquet(benchmark_file)
gdf = pd.read_json(benchmark_file, lines=True)

# Perform aggregation
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })

# Select top N
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

14.2 s ± 97.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test loading times

#### cudf

In [12]:
%%timeit
# gdf = cudf.read_parquet(benchmark_file)
gdf = cudf.read_json(benchmark_file, lines=True)

1.12 s ± 8.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [13]:
%%timeit
# gdf = pd.read_parquet(benchmark_file)
gdf = pd.read_json(benchmark_file, lines=True)

14 s ± 1.22 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test aggregation

In [14]:
# gdf = cudf.read_parquet(benchmark_file)
gdf = cudf.read_json(benchmark_file, lines=True)

# pdf = pd.read_parquet(benchmark_file)
pdf = pd.read_json(benchmark_file, lines=True)

#### cudf

In [15]:
%%timeit
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

211 ms ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [16]:
%%timeit
pdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = pdf.nlargest(nlargest, 'time_backend_response')

789 ms ± 44.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
