### Compare performance of Arrow, CSV and Parquet

In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np
import os
import psutil

#### 1.1 Load and prepare data

In [11]:
# Pull the Our World in Data (OWID) COVID-19 dataset straight from GitHub.
# This cell needs network access; the URL points at the `master` branch, so the
# downloaded contents are presumably not pinned to a fixed snapshot.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
)
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316650,ZWE,Africa,Zimbabwe,2023-06-03,265139.0,32.0,21.000,5695.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316651,ZWE,Africa,Zimbabwe,2023-06-04,265139.0,0.0,16.571,5695.0,0.0,0.286,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316652,ZWE,Africa,Zimbabwe,2023-06-05,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316653,ZWE,Africa,Zimbabwe,2023-06-06,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [16]:
# Work on a copy so the raw download in `df` stays untouched while `df_copy`
# is re-typed, trimmed and resampled in the cells that follow.
df_copy = df.copy()
df_copy

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316650,ZWE,Africa,Zimbabwe,2023-06-03,265139.0,32.0,21.000,5695.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316651,ZWE,Africa,Zimbabwe,2023-06-04,265139.0,0.0,16.571,5695.0,0.0,0.286,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316652,ZWE,Africa,Zimbabwe,2023-06-05,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316653,ZWE,Africa,Zimbabwe,2023-06-06,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [24]:
# list the columns that pandas loaded with the generic `object` dtype
df_copy.select_dtypes(include="object").columns

Index(['iso_code', 'continent', 'location', 'date', 'tests_units'], dtype='object')

In [25]:
# Re-type every object column as a pandas categorical in a single astype call
object_cols = df_copy.select_dtypes(include="object").columns
df_copy = df_copy.astype({name: "category" for name in object_cols})

# no object columns should remain after the conversion
df_copy.select_dtypes(include="object").columns

Index([], dtype='object')

In [26]:
# list the columns that now carry the `category` dtype
df_copy.select_dtypes(include="category").columns

Index(['iso_code', 'continent', 'location', 'date', 'tests_units'], dtype='object')

In [27]:
# count the missing entries in each category column
df_copy.select_dtypes(include="category").isna().sum()

iso_code            0
continent       15042
location            0
date                0
tests_units    209867
dtype: int64

In [28]:
# sample category column with null values
# (draws 10 random entries; no random_state is passed, so the rows differ on every run)
df_copy['continent'].sample(10)

94610      Europe
5102      Oceania
162268     Europe
165543     Africa
212884       Asia
276852       Asia
36710      Africa
195513    Oceania
226168     Europe
199074     Africa
Name: continent, dtype: category
Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [29]:
# draw 10 random rows whose continent is missing
df_copy.loc[df_copy['continent'].isna()].sample(10)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
88578,OWID_EUR,,Europe,2022-10-09,230099816.0,193076.0,275993.143,1952773.0,517.0,715.286,...,,,,,,744807800.0,,,,
88396,OWID_EUR,,Europe,2022-04-10,184860574.0,426109.0,529382.714,1826630.0,1197.0,1469.714,...,,,,,,744807800.0,,,,
263237,OWID_SAM,,South America,2020-10-15,8824805.0,38808.0,49087.429,333642.0,1120.0,1355.714,...,,,,,,436816700.0,,,,
203350,OWID_NAM,,North America,2022-12-06,117380642.0,3775.0,51820.714,1536035.0,18.0,318.429,...,,,,,,600323700.0,,,,
15221,OWID_ASI,,Asia,2020-07-13,2972365.0,56510.0,54942.0,70386.0,1352.0,1148.286,...,,,,,,4721383000.0,,,,
15440,OWID_ASI,,Asia,2021-02-17,24223499.0,68105.0,64868.0,389139.0,838.0,858.857,...,,,,,,4721383000.0,,,,
160257,OWID_LMC,,Lower middle income,2020-04-06,80928.0,4569.0,4634.571,4743.0,271.0,244.571,...,,,,,,3432097000.0,,,,
263249,OWID_SAM,,South America,2020-10-27,9441337.0,41688.0,50875.143,348723.0,912.0,1186.857,...,,,,,,436816700.0,,,,
203438,OWID_NAM,,North America,2023-03-04,122820239.0,240873.0,38359.429,1583160.0,2374.0,360.286,...,,,,,,600323700.0,,,,
15833,OWID_ASI,,Asia,2022-03-17,130404440.0,947192.0,871940.571,1384496.0,1743.0,1742.143,...,,,,,,4721383000.0,,,,


In [30]:
# select distinct values from the location column
# (the captured output lists aggregate entries such as 'Africa' and 'World'
# alongside individual countries)
df_copy['location'].unique()

['Afghanistan', 'Africa', 'Albania', 'Algeria', 'American Samoa', ..., 'Western Sahara', 'World', 'Yemen', 'Zambia', 'Zimbabwe']
Length: 255
Categories (255, object): ['Afghanistan', 'Africa', 'Albania', 'Algeria', ..., 'World', 'Yemen', 'Zambia', 'Zimbabwe']

In [31]:
# select distinct values from the continent column
# (NaN shows up among the values because some rows have no continent)
df_copy['continent'].unique()

['Asia', NaN, 'Europe', 'Africa', 'Oceania', 'North America', 'South America']
Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [32]:
# select distinct values from the tests_units column
# (NaN shows up among the values because many rows have no tests_units entry)
df_copy['tests_units'].unique()

[NaN, 'tests performed', 'units unclear', 'samples tested', 'people tested']
Categories (4, object): ['people tested', 'samples tested', 'tests performed', 'units unclear']

In [None]:
# drop the tests_units and continent columns (the two category columns that contain nulls)
# Rebinding the result instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# for columns that have already been removed.
df_copy = df_copy.drop(columns=['tests_units', 'continent'], errors='ignore')

In [35]:
# display the frame to confirm that tests_units and continent are gone
df_copy

Unnamed: 0,iso_code,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Afghanistan,2020-01-03,,0.0,,,0.0,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Afghanistan,2020-01-04,,0.0,,,0.0,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Afghanistan,2020-01-05,,0.0,,,0.0,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Afghanistan,2020-01-06,,0.0,,,0.0,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Afghanistan,2020-01-07,,0.0,,,0.0,,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316650,ZWE,Zimbabwe,2023-06-03,265139.0,32.0,21.000,5695.0,0.0,0.429,16245.726,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316651,ZWE,Zimbabwe,2023-06-04,265139.0,0.0,16.571,5695.0,0.0,0.286,16245.726,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316652,ZWE,Zimbabwe,2023-06-05,265139.0,0.0,15.571,5695.0,0.0,0.000,16245.726,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316653,ZWE,Zimbabwe,2023-06-06,265139.0,0.0,15.571,5695.0,0.0,0.000,16245.726,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [None]:
# convert any remaining object columns to category
# (repeats the earlier conversion; a no-op when no object columns are left)
for col in df_copy.select_dtypes(include=['object']).columns:
    df_copy[col] = df_copy[col].astype('category')

# show all object columns (expected to be empty after the conversion)
# display() is used because only the last bare expression of a cell is rendered
display(df_copy.select_dtypes(include=['object']).columns)

# show all category columns
display(df_copy.select_dtypes(include=['category']).columns)


In [36]:
# show data types
# NOTE(review): `date` is a category here (it was converted along with the other
# object columns), not a datetime — confirm that this is intended for the benchmark
df_copy.dtypes


iso_code                                   category
location                                   category
date                                       category
total_cases                                 float64
new_cases                                   float64
                                             ...   
population                                  float64
excess_mortality_cumulative_absolute        float64
excess_mortality_cumulative                 float64
excess_mortality                            float64
excess_mortality_cumulative_per_million     float64
Length: 65, dtype: object

In [38]:
# increase dataset to 1 million rows (sampling with replacement) and reset index
# random_state pins the draw so every run benchmarks the same data.
# NOTE: this cell rebinds df_copy, so re-running it resamples the already
# resampled frame; re-run from the top of the notebook to start from the source data.
df_copy = df_copy.sample(n=1_000_000, replace=True, random_state=42).reset_index(drop=True)
df_copy

Unnamed: 0,iso_code,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,TCA,Turks and Caicos Islands,2022-01-07,3562.0,0.0,39.857,26.0,0.0,0.000,77898.788,...,,,,80.22,,45726.0,,,,
1,JPN,Japan,2020-12-07,162067.0,1969.0,2186.714,2335.0,20.0,30.857,1307.501,...,33.7,,13.05,84.63,0.919,123951696.0,,,,
2,PHL,Philippines,2021-09-06,2103296.0,22380.0,18180.857,34337.0,103.0,143.857,18201.056,...,40.8,78.463,1.00,71.23,0.718,115559008.0,,,,
3,CAN,Canada,2022-11-13,4380999.0,0.0,2627.857,46537.0,0.0,49.143,113927.332,...,16.6,,2.50,82.43,0.929,38454328.0,44146.23,5.13,7.94,1148.0172
4,BMU,Bermuda,2022-11-11,18428.0,0.0,0.000,149.0,0.0,0.000,287009.205,...,,,,82.59,,64207.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,NZL,New Zealand,2020-06-17,1156.0,0.0,0.286,22.0,0.0,0.000,222.938,...,17.2,,2.61,82.29,0.931,5185289.0,,,,
999996,POL,Poland,2021-01-19,1446149.0,4835.0,6836.714,33777.0,291.0,300.286,36283.307,...,33.1,,6.62,78.73,0.880,39857144.0,,,,
999997,MRT,Mauritania,2023-01-21,63435.0,0.0,0.000,997.0,0.0,0.000,13393.802,...,,15.950,,64.92,0.546,4736146.0,,,,
999998,PER,Peru,2022-10-03,4145048.0,632.0,596.286,216578.0,3.0,15.000,121735.629,...,,,1.60,76.74,0.777,34049588.0,,,,


In [40]:
# add a "Sample Number" column holding each row's index value
# (the index was reset in the previous cell, so it runs from 0 upwards)
df_copy["Sample Number"] = df_copy.index
df_copy

Unnamed: 0,iso_code,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million,Sample Number
0,TCA,Turks and Caicos Islands,2022-01-07,3562.0,0.0,39.857,26.0,0.0,0.000,77898.788,...,,,80.22,,45726.0,,,,,0
1,JPN,Japan,2020-12-07,162067.0,1969.0,2186.714,2335.0,20.0,30.857,1307.501,...,,13.05,84.63,0.919,123951696.0,,,,,1
2,PHL,Philippines,2021-09-06,2103296.0,22380.0,18180.857,34337.0,103.0,143.857,18201.056,...,78.463,1.00,71.23,0.718,115559008.0,,,,,2
3,CAN,Canada,2022-11-13,4380999.0,0.0,2627.857,46537.0,0.0,49.143,113927.332,...,,2.50,82.43,0.929,38454328.0,44146.23,5.13,7.94,1148.0172,3
4,BMU,Bermuda,2022-11-11,18428.0,0.0,0.000,149.0,0.0,0.000,287009.205,...,,,82.59,,64207.0,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,NZL,New Zealand,2020-06-17,1156.0,0.0,0.286,22.0,0.0,0.000,222.938,...,,2.61,82.29,0.931,5185289.0,,,,,999995
999996,POL,Poland,2021-01-19,1446149.0,4835.0,6836.714,33777.0,291.0,300.286,36283.307,...,,6.62,78.73,0.880,39857144.0,,,,,999996
999997,MRT,Mauritania,2023-01-21,63435.0,0.0,0.000,997.0,0.0,0.000,13393.802,...,15.950,,64.92,0.546,4736146.0,,,,,999997
999998,PER,Peru,2022-10-03,4145048.0,632.0,596.286,216578.0,3.0,15.000,121735.629,...,,1.60,76.74,0.777,34049588.0,,,,,999998


In [41]:
# count the missing values in every column before filling
df_copy.isna().sum()


iso_code                                        0
location                                        0
date                                            0
total_cases                                114876
new_cases                                   27827
                                            ...  
excess_mortality_cumulative_absolute       965261
excess_mortality_cumulative                965261
excess_mortality                           965261
excess_mortality_cumulative_per_million    965261
Sample Number                                   0
Length: 66, dtype: int64

In [42]:
# Replace missing values with 0 in every non-category column, using one
# fillna call with a per-column mapping; category columns are left untouched.
non_category_cols = df_copy.select_dtypes(exclude=['category']).columns
df_copy = df_copy.fillna({name: 0 for name in non_category_cols})

In [43]:
# re-count the missing values; every column should now report 0
df_copy.isna().sum()

iso_code                                   0
location                                   0
date                                       0
total_cases                                0
new_cases                                  0
                                          ..
excess_mortality_cumulative_absolute       0
excess_mortality_cumulative                0
excess_mortality                           0
excess_mortality_cumulative_per_million    0
Sample Number                              0
Length: 66, dtype: int64

In [44]:
# display the prepared frame (missing numeric values now filled with 0)
df_copy

Unnamed: 0,iso_code,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,...,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million,Sample Number
0,TCA,Turks and Caicos Islands,2022-01-07,3562.0,0.0,39.857,26.0,0.0,0.000,77898.788,...,0.000,0.00,80.22,0.000,45726.0,0.00,0.00,0.00,0.0000,0
1,JPN,Japan,2020-12-07,162067.0,1969.0,2186.714,2335.0,20.0,30.857,1307.501,...,0.000,13.05,84.63,0.919,123951696.0,0.00,0.00,0.00,0.0000,1
2,PHL,Philippines,2021-09-06,2103296.0,22380.0,18180.857,34337.0,103.0,143.857,18201.056,...,78.463,1.00,71.23,0.718,115559008.0,0.00,0.00,0.00,0.0000,2
3,CAN,Canada,2022-11-13,4380999.0,0.0,2627.857,46537.0,0.0,49.143,113927.332,...,0.000,2.50,82.43,0.929,38454328.0,44146.23,5.13,7.94,1148.0172,3
4,BMU,Bermuda,2022-11-11,18428.0,0.0,0.000,149.0,0.0,0.000,287009.205,...,0.000,0.00,82.59,0.000,64207.0,0.00,0.00,0.00,0.0000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,NZL,New Zealand,2020-06-17,1156.0,0.0,0.286,22.0,0.0,0.000,222.938,...,0.000,2.61,82.29,0.931,5185289.0,0.00,0.00,0.00,0.0000,999995
999996,POL,Poland,2021-01-19,1446149.0,4835.0,6836.714,33777.0,291.0,300.286,36283.307,...,0.000,6.62,78.73,0.880,39857144.0,0.00,0.00,0.00,0.0000,999996
999997,MRT,Mauritania,2023-01-21,63435.0,0.0,0.000,997.0,0.0,0.000,13393.802,...,15.950,0.00,64.92,0.546,4736146.0,0.00,0.00,0.00,0.0000,999997
999998,PER,Peru,2022-10-03,4145048.0,632.0,596.286,216578.0,3.0,15.000,121735.629,...,0.000,1.60,76.74,0.777,34049588.0,0.00,0.00,0.00,0.0000,999998


#### 1.2 Write to disk

In [45]:
# write to csv
# No `index` argument is passed, so pandas' default (index=True) also writes the
# row index as an extra leading column; the target directory ../data is
# presumably expected to exist already.
df_copy.to_csv("../data/covid-data.csv")

In [46]:
# write to parquet
# Write the prepared frame (df_copy), not the raw download (df), so that the
# Parquet file holds the same 1-million-row data as the CSV and Arrow files
# and the format comparison is like-for-like.
df_copy.to_parquet("../data/covid-data.parquet")

In [None]:
# write to arrow
# convert from pandas to arrow
table = pa.Table.from_pandas(df_copy)
# write to arrow file using the Arrow IPC file format
# (pq.write_table would produce a Parquet file despite the .arrow extension,
# which would make the Arrow-vs-Parquet comparison meaningless)
with pa.ipc.new_file('../data/covid-data.arrow', table.schema) as writer:
    writer.write_table(table)