# Merge
This notebook merges the archive files into a single de-duplicated dataset and saves it as a gzip-compressed CSV file to be uploaded to Amazon S3.

In [1]:
import os
from glob import glob
import pandas as pd
from tqdm import tqdm

from appvoc.infrastructure.file.io import IOService

In [2]:
# Size threshold compared against os.path.getsize() in get_filenames; only
# files strictly larger than this many bytes are merged.
MINFILESIZE = 1e8
# Glob patterns locating the archived app-data and review files.
APPDATA_FILES = "data/appvoc/archive/appdata*.pkl"
REVIEW_FILES = "data/appvoc/archive/reviews*.pkl"
# Output locations for the merged app data and its gzip-compressed CSV export.
APPDATA_DESTINATION = "data/appdata/appdata.pkl"
APPDATA_GZIP = "data/appdata/appdata.csv.gzip"
# Output locations for reviews (the REVIEW* names are not referenced by the
# cells visible in this notebook — presumably used by a reviews run; confirm).
REVIEWS_DESTINATION = "data/reviews/reviews.pkl"
REVIEWS_GZIP = "data/reviews/reviews.tsv.gzip"
print(MINFILESIZE)

100000000.0


In [3]:
def get_filenames(wildcard, min_size=None):
    """Return the sorted paths matching a glob pattern that exceed a size threshold.

    Args:
        wildcard: Glob pattern handed to ``glob``.
        min_size: Exclusive lower bound on the file size in bytes. When
            ``None`` (the default) the notebook-level ``MINFILESIZE`` is
            used, so existing single-argument calls behave as before.

    Returns:
        list: Paths whose size is strictly greater than the threshold, in
        ascending sort order. Empty when nothing matches or qualifies.
    """
    # Resolve the default lazily so a later change to MINFILESIZE is honoured.
    threshold = MINFILESIZE if min_size is None else min_size
    return sorted(
        path for path in glob(wildcard) if os.path.getsize(path) > threshold
    )

In [4]:
class Merger:
    """Merge several archive files into one de-duplicated DataFrame and save it.

    Every call starts from an empty frame, so a single instance can be reused
    for unrelated datasets without rows from an earlier call leaking into a
    later one. The result of the most recent call is kept on ``self._df``.
    """

    # Columns kept from each file when the caller does not pass ``columns``.
    DEFAULT_COLUMNS = (
        "id",
        "name",
        "description",
        "category_id",
        "category",
        "price",
        "developer_id",
        "developer",
        "rating",
        "ratings",
        "released",
    )

    def __init__(self) -> None:
        # Result of the most recent merge; empty until the first call.
        self._df = pd.DataFrame()

    def __call__(self, destination, filelist, key = 'id', columns=None) -> None:
        """Merge the files in ``filelist`` and write the result to ``destination``.

        Args:
            destination: Path handed to ``IOService.write`` for the merged frame.
            filelist: Iterable of paths, each loaded with ``IOService.read``.
            key: Column used to detect duplicates; for a repeated key the row
                from the file that appears latest in ``filelist`` is kept.
            columns: Columns to retain from every file. ``None`` (the default)
                selects ``DEFAULT_COLUMNS``.

        Raises:
            KeyError: If a loaded frame lacks one of the selected columns.
        """
        selected = list(self.DEFAULT_COLUMNS if columns is None else columns)
        # Start fresh on every call instead of extending the previous result.
        merged = pd.DataFrame()
        for file in tqdm(filelist):
            newdf = IOService.read(file)[selected]
            # De-duplicate after each file so the working frame stays bounded
            # by the number of distinct keys rather than the total row count.
            merged = pd.concat([merged, newdf], axis=0).drop_duplicates(
                subset=[key], keep="last"
            )
        self._df = merged
        IOService.write(filepath=destination, data = self._df)
merger = Merger()

In [5]:
def to_gzip(source, destination, sep=","):
    """Export the dataset stored at ``source`` as a gzip-compressed delimited file.

    Args:
        source: Path loaded through ``IOService.read``.
        destination: Path of the compressed text file to write.
        sep: Field delimiter for the output; defaults to a comma.
    """
    # The index is omitted so only the data columns reach the output file.
    IOService.read(source).to_csv(
        destination,
        sep=sep,
        index=False,
        compression="gzip",
    )

In [6]:
# Collect the app-data archive files that pass the size filter, merge them
# into APPDATA_DESTINATION, then export that merged file as a gzip CSV.
filelist = get_filenames(APPDATA_FILES)
merger(destination=APPDATA_DESTINATION, filelist=filelist)
to_gzip(source=APPDATA_DESTINATION, destination=APPDATA_GZIP)


100%|██████████| 47/47 [01:32<00:00,  1.97s/it]


In [7]:
# Reload the merged app data from disk and inspect its schema and first rows
# as a sanity check on what was written.
df = IOService.read(APPDATA_DESTINATION)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 475132 entries, 12340 to 678136
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            475132 non-null  int64  
 1   name          475132 non-null  object 
 2   description   475132 non-null  object 
 3   category_id   475132 non-null  int64  
 4   category      475132 non-null  object 
 5   price         475132 non-null  float64
 6   developer_id  475132 non-null  int64  
 7   developer     475132 non-null  object 
 8   rating        475132 non-null  float64
 9   ratings       475132 non-null  int64  
 10  released      475132 non-null  object 
dtypes: float64(2), int64(4), object(5)
memory usage: 43.5+ MB


Unnamed: 0,id,name,description,category_id,category,price,developer_id,developer,rating,ratings,released
12340,6446790238,Mood Tracker :,Understanding and managing your emotions is cr...,6013,Health & Fitness,0.0,1436021751,CARECLINIC SOFTWARE INC.,5.0,6,2023-04-18 07:00:00
13007,6447916914,Empire App Virtual,"With the Empire App, you can:\n\n-join our gym...",6013,Health & Fitness,0.0,1603858924,Body Evolution Warfighter LLC,0.0,0,2023-04-21 07:00:00
13256,6448082493,CrossFit 926,For members of CrossFit 926 to reserve their p...,6013,Health & Fitness,0.0,688595778,PushPress,0.0,0,2023-04-21 07:00:00
13821,6447812886,Shoreline CrossFit,For members of Shoreline CrossFit to reserve t...,6013,Health & Fitness,0.0,688595778,PushPress,0.0,0,2023-04-20 07:00:00
14192,1631374974,myAxonics,Find relief from your bladder and bowel contro...,6013,Health & Fitness,0.0,1631374976,Axonics Modulation Technologies,0.0,0,2023-04-21 07:00:00
