Some more information on the tarfile library can be found here:
https://docs.python.org/3/library/tarfile.html

Information on using pyarrow to manipulate parquet files
https://arrow.apache.org/docs/python/parquet.html#reading-parquet-and-memory-mapping

Information on reading parquet files with pandas
https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html

In [1]:
import tarfile
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
#Load tar.gz file into Python

tar = tarfile.open("hitdata7days_0.tar.gz","r:gz")

In [3]:
#Extract all parquet files into computer memory

tar.extractall()

In [21]:
tar = tarfile.open("hitdata7days_0.tar.gz","r:gz")

In [22]:
#Extracting filename from TarInfo object

tn = tar.next()
tn.name

'hitdata7days/visitday=10'

In [23]:
#Check if TarInfo is a file or a directory

print(tn.isreg())
print(tn.isdir())

False
True


In [24]:
#Reading info from parquet file
tn = tar.next()
pq.read_schema(tn.name)

hitdatahistorymkey: int64
filename: string
linenumber: int32
brandcode: string
visitoridhigh: decimal128(20, 0)
visitoridlow: decimal128(20, 0)
visitnumber: decimal128(10, 0)
visitdatetime: timestamp[ns]
visitdate: date32[day]
visitmonth: int32
fiscalyear: int16
fiscalmonthnumber: int16
fiscalweeknumber: int16
hit_time_gmt: decimal128(11, 0)
service: string
acceptlanguage: string
eventlist: string
homepage: string
ip: string
pageevent: decimal128(3, 0)
pageeventvar1: string
pageeventvar2: string
pagetype: string
pageurl: string
pagename: string
productlist: string
userserver: string
channel: string
sitesection: string
category: string
subcategory: string
subcategory2: string
pagecategory: string
searchterms: string
searchresults: string
refinementtype: string
refinementattribute: string
myaccountengagement: string
formanalysis: string
emailsubscriptionadd: string
emailsubscriptionremove: string
linklocation: string
navigationlinks: string
searchtype: string
businessunit: string
categor

In [25]:
#Read in parquet file as pandas DataFrame

df = pd.read_parquet(tn.name)

# Visitor proportion by country

In [28]:
%%time
#Collect columns from all parquet files into a pandas DataFrame

columns = ['evar23','geocountry']
dfs = []
for member in tar:
    if member.isreg():
        df_temp = pd.read_parquet(member.name,columns = columns)
        dfs.append(df_temp)

df_all = pd.concat(dfs)

CPU times: user 15.5 s, sys: 760 ms, total: 16.2 s
Wall time: 16.2 s


In [29]:
%%time
#Don't do this! Collect all dataframes into list and then concatenate all at once!

columns = ['evar23','geocountry']
dfs = pd.DataFrame()
for member in tar:
    if member.isreg():
        df_temp = pd.read_parquet(member.name,columns = columns)
        dfs = pd.concat([dfs,df_temp])

CPU times: user 2.46 s, sys: 422 ms, total: 2.88 s
Wall time: 2.74 s


In [31]:
print("Number of site visits without user ID")
print(np.sum(df_all.evar23.isnull()))

Number of site visits without user ID
1604973


In [32]:
print("Number of site visits total")
print(len(df_all))

Number of site visits total
6458609


In [33]:
print("Proportion of site visits without user ID")
print(np.sum(df_all.evar23.isnull())/len(df_all))

Proportion of site visits without user ID
0.24850134138790567


In [34]:
print("Number of distinct user IDs")
print(len(df_all.evar23.unique())-1)

Number of distinct user IDs
142243


In [35]:
print("Number of distinct countries visited from")
print(len(df_all.geocountry.unique())-1)

Number of distinct countries visited from
44


In [36]:
#Which countries visited

print(df_all.geocountry.unique())

['usa' 'pri' 'mex' 'vir' 'bra' 'can' 'bhr' 'jam' 'ind' 'phl' 'col' 'nic'
 'isr' 'bhs' 'gtm' 'cym' 'dza' 'gum' 'chl' 'sen' 'bmu' 'pak' 'khm' 'tto'
 'grd' 'che' 'pan' 'tza' 'sxm' 'guy' 'gha' 'mrt' 'tca' 'brb' 'nor' 'sle'
 'jpn' 'gmb' 'bes' 'mnp' 'qat' 'are' 'tun' 'sgp' 'atg']


In [46]:
df_subset = df_all.drop_duplicates(subset=['evar23'])
df_subset = df_subset.iloc[1: , :]
df_subset.value_counts('geocountry') / len(df_subset.index)

# Statistics of types of users

In [51]:
%%time
#Collect columns from all parquet files into a pandas DataFrame

columns = ['evar23','checkoutthankyouflag', 'newvisit', 'dailyvisitor', 'hourlyvisitor', 'monthlyvisitor', 'yearlyvisitor']
dfs = []
for member in tar:
    if member.isreg():
        df_temp = pd.read_parquet(member.name,columns = columns)
        dfs.append(df_temp)

df_all = pd.concat(dfs)

CPU times: user 9.14 s, sys: 5.02 s, total: 14.2 s
Wall time: 15 s


In [55]:
df_subset = df_all.drop_duplicates(subset=['evar23'])
df_subset = df_subset.iloc[1: , :]

In [61]:
print("Proportion of unique users that checked out")
print(df_subset.value_counts('checkoutthankyouflag')[1] / len(df_subset.index))

Proportion of unique users that checked out
0.005835085030546318


In [63]:
print("Proportion of new visitors")
print(df_subset.value_counts('newvisit')[1] / len(df_subset.index))

print("Proportion of hourly visitors")
print(df_subset.value_counts('dailyvisitor')[1] / len(df_subset.index))

print("Proportion of daily visitors")
print(df_subset.value_counts('hourlyvisitor')[1] / len(df_subset.index))

print("Proportion of monthly visitors")
print(df_subset.value_counts('monthlyvisitor')[1] / len(df_subset.index))

print("Proportion of yearly visitors")
print(df_subset.value_counts('yearlyvisitor')[1] / len(df_subset.index))

Proportion of new visitors
0.1397327109242634
Proportion of hourly visitors
0.12440682494041887
Proportion of daily visitors
0.14440077894870046
Proportion of monthly visitors
0.01583909225761549
Proportion of yearly visitors
0.008309723501332227
