### Compare performance of Arrow, CSV and Parquet

In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np
import os
import psutil

##### 1. Load and prepare data

In [11]:
# Read covid data from github
df = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316650,ZWE,Africa,Zimbabwe,2023-06-03,265139.0,32.0,21.000,5695.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316651,ZWE,Africa,Zimbabwe,2023-06-04,265139.0,0.0,16.571,5695.0,0.0,0.286,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316652,ZWE,Africa,Zimbabwe,2023-06-05,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316653,ZWE,Africa,Zimbabwe,2023-06-06,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [16]:
df_copy = df.copy()
df_copy

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316650,ZWE,Africa,Zimbabwe,2023-06-03,265139.0,32.0,21.000,5695.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316651,ZWE,Africa,Zimbabwe,2023-06-04,265139.0,0.0,16.571,5695.0,0.0,0.286,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316652,ZWE,Africa,Zimbabwe,2023-06-05,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
316653,ZWE,Africa,Zimbabwe,2023-06-06,265139.0,0.0,15.571,5695.0,0.0,0.000,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [24]:
# show all object columns
df_copy.select_dtypes(include=['object']).columns

Index(['iso_code', 'continent', 'location', 'date', 'tests_units'], dtype='object')

In [25]:
# convert object columns to category
for col in df_copy.select_dtypes(include=['object']).columns:
    df_copy[col] = df_copy[col].astype('category')

# show all object columns
df_copy.select_dtypes(include=['object']).columns

Index([], dtype='object')

In [26]:
# show all category columns
df_copy.select_dtypes(include=['category']).columns

Index(['iso_code', 'continent', 'location', 'date', 'tests_units'], dtype='object')

In [None]:
# convert object columns to category
for col in df_copy.select_dtypes(include=['object']).columns:
    df_copy[col] = df_copy[col].astype('category')

# show all object columns
df_copy.select_dtypes(include=['object']).columns

# show all category columns


In [23]:
# show data types
df_copy.dtypes


iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                 float64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object

In [3]:
# increase dataset to 1 million rows and reset index
df = df.sample(n=1000000, replace=True).reset_index(drop=True)
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,HUN,Europe,Hungary,2022-01-12,1308877.0,7883.0,5556.857,40083.0,67.0,69.143,...,34.8,,7.02,76.88,0.854,9967304.0,,,,
1,LTU,Europe,Lithuania,2020-07-10,1796.0,7.0,5.143,62.0,0.0,0.000,...,38.0,,6.56,75.93,0.882,2750058.0,,,,
2,EST,Europe,Estonia,2023-01-01,612338.0,44.0,76.000,2853.0,0.0,2.286,...,39.3,,4.69,78.74,0.892,1326064.0,4739.699,10.03,30.86,3583.1782
3,HTI,North America,Haiti,2021-04-27,13101.0,5.0,12.000,254.0,0.0,0.000,...,23.1,22.863,0.70,64.00,0.510,11585003.0,,,,
4,PAN,North America,Panama,2022-08-05,954369.0,0.0,842.286,8425.0,0.0,1.857,...,9.9,,2.30,78.51,0.815,4408582.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,JPN,Asia,Japan,2020-03-16,814.0,34.0,46.429,27.0,5.0,2.429,...,33.7,,13.05,84.63,0.919,123951696.0,,,,
999996,AND,Europe,Andorra,2020-03-19,39.0,0.0,5.429,,0.0,0.000,...,37.8,,,83.73,0.868,79843.0,,,,
999997,BHS,North America,Bahamas,2022-11-08,37419.0,0.0,4.714,833.0,0.0,0.000,...,20.4,,2.90,73.92,0.814,409989.0,,,,
999998,LVA,Europe,Latvia,2022-09-16,913371.0,1369.0,1066.714,5969.0,4.0,1.714,...,51.0,,5.57,75.29,0.866,1850654.0,,,,


In [4]:
# update sample number column with index
df["Sample Number"] = df.index
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million,Sample Number
0,HUN,Europe,Hungary,2022-01-12,1308877.0,7883.0,5556.857,40083.0,67.0,69.143,...,,7.02,76.88,0.854,9967304.0,,,,,0
1,LTU,Europe,Lithuania,2020-07-10,1796.0,7.0,5.143,62.0,0.0,0.000,...,,6.56,75.93,0.882,2750058.0,,,,,1
2,EST,Europe,Estonia,2023-01-01,612338.0,44.0,76.000,2853.0,0.0,2.286,...,,4.69,78.74,0.892,1326064.0,4739.699,10.03,30.86,3583.1782,2
3,HTI,North America,Haiti,2021-04-27,13101.0,5.0,12.000,254.0,0.0,0.000,...,22.863,0.70,64.00,0.510,11585003.0,,,,,3
4,PAN,North America,Panama,2022-08-05,954369.0,0.0,842.286,8425.0,0.0,1.857,...,,2.30,78.51,0.815,4408582.0,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,JPN,Asia,Japan,2020-03-16,814.0,34.0,46.429,27.0,5.0,2.429,...,,13.05,84.63,0.919,123951696.0,,,,,999995
999996,AND,Europe,Andorra,2020-03-19,39.0,0.0,5.429,,0.0,0.000,...,,,83.73,0.868,79843.0,,,,,999996
999997,BHS,North America,Bahamas,2022-11-08,37419.0,0.0,4.714,833.0,0.0,0.000,...,,2.90,73.92,0.814,409989.0,,,,,999997
999998,LVA,Europe,Latvia,2022-09-16,913371.0,1369.0,1066.714,5969.0,4.0,1.714,...,,5.57,75.29,0.866,1850654.0,,,,,999998


In [5]:
# show missing values in dataset
df.isnull().sum()


iso_code                                        0
continent                                   47254
location                                        0
date                                            0
total_cases                                113780
                                            ...  
excess_mortality_cumulative_absolute       965193
excess_mortality_cumulative                965193
excess_mortality                           965193
excess_mortality_cumulative_per_million    965193
Sample Number                                   0
Length: 68, dtype: int64

In [6]:
# fill missing values with 0
df = df.fillna(0)

In [7]:
# show missing values in dataset
df.isnull().sum()

iso_code                                   0
continent                                  0
location                                   0
date                                       0
total_cases                                0
                                          ..
excess_mortality_cumulative_absolute       0
excess_mortality_cumulative                0
excess_mortality                           0
excess_mortality_cumulative_per_million    0
Sample Number                              0
Length: 68, dtype: int64

In [8]:
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million,Sample Number
0,HUN,Europe,Hungary,2022-01-12,1308877.0,7883.0,5556.857,40083.0,67.0,69.143,...,0.000,7.02,76.88,0.854,9967304.0,0.000,0.00,0.00,0.0000,0
1,LTU,Europe,Lithuania,2020-07-10,1796.0,7.0,5.143,62.0,0.0,0.000,...,0.000,6.56,75.93,0.882,2750058.0,0.000,0.00,0.00,0.0000,1
2,EST,Europe,Estonia,2023-01-01,612338.0,44.0,76.000,2853.0,0.0,2.286,...,0.000,4.69,78.74,0.892,1326064.0,4739.699,10.03,30.86,3583.1782,2
3,HTI,North America,Haiti,2021-04-27,13101.0,5.0,12.000,254.0,0.0,0.000,...,22.863,0.70,64.00,0.510,11585003.0,0.000,0.00,0.00,0.0000,3
4,PAN,North America,Panama,2022-08-05,954369.0,0.0,842.286,8425.0,0.0,1.857,...,0.000,2.30,78.51,0.815,4408582.0,0.000,0.00,0.00,0.0000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,JPN,Asia,Japan,2020-03-16,814.0,34.0,46.429,27.0,5.0,2.429,...,0.000,13.05,84.63,0.919,123951696.0,0.000,0.00,0.00,0.0000,999995
999996,AND,Europe,Andorra,2020-03-19,39.0,0.0,5.429,0.0,0.0,0.000,...,0.000,0.00,83.73,0.868,79843.0,0.000,0.00,0.00,0.0000,999996
999997,BHS,North America,Bahamas,2022-11-08,37419.0,0.0,4.714,833.0,0.0,0.000,...,0.000,2.90,73.92,0.814,409989.0,0.000,0.00,0.00,0.0000,999997
999998,LVA,Europe,Latvia,2022-09-16,913371.0,1369.0,1066.714,5969.0,4.0,1.714,...,0.000,5.57,75.29,0.866,1850654.0,0.000,0.00,0.00,0.0000,999998


#### 1.2 Write to disk

In [9]:
# write to csv
df.to_csv("../data/covid-data.csv")

In [10]:
# write to parquet
df.to_parquet("../data/covid-data.parquet")

ArrowTypeError: ("Expected bytes, got a 'int' object", 'Conversion failed for column continent with type object')