# Setup

In [2]:
XET_REPO = "xet_repo"
XET_REMOTE = "https://xethub.com/xdssio/xethub-git" # Change
S3_BUCKET = "s3://versioning-article/s3/" # Change

In [29]:
import numpy as np
import pandas as pd
import os
import os.path
import subprocess
import logging
import time
from tqdm import tqdm

np.random.seed(1)


class NumericDataGenerator:

    def __init__(self, cols: int=10):
        self.cols = cols
        self.columns = [f"col{i}"  for i in range(self.cols)]

    def generate(self, rows: int = None):
        data = np.random.rand(rows, self.cols)
        return pd.DataFrame(data, columns=self.columns)
    
    def append(self, filename: str, rows: int):
        data = np.random.rand(rows, self.cols)
        df = pd.DataFrame(data, columns=self.columns)
        if os.path.exists(filename) and filename.endswith('.parquet'):
            former = pd.read_parquet(filename)
            df = pd.concat([former, df])
            df.to_parquet(filename, engine='pyarrow')
        else:
            df.to_csv(filename, mode='a', header=False, index=False)
            
    def export(self, df, filepath):
        if filepath.endswith('.parquet'):
            df.to_parquet(filepath, engine='pyarrow')
        else:
            df.to_csv(filepath, index=False)

def run(command:str, cwd:str):
    out = subprocess.run(command, shell=True, capture_output=True, cwd=cwd).stdout
    if out:
        out = out.decode()
    print(out)
    return out



def git_upload(filename: str, cwd:str):
    filename = os.path.basename(filepath)
    command = f"""                                
            git add {filename}
            git commit -m "upload"
            git push
            """
    return run(command, cwd)


def s3_upload(filepath: str, cwd:str):
    command = f"aws s3 cp {filepath} {S3_BUCKET}{os.path.basename(filepath)}"
    return run(command, cwd)

# Experiment

In [30]:
start_rows = 6000000 # 1G
n_rows_add = 6000 # 1MB
start_rows = 100
n_rows_add = 1

columns = 10
local_path = os.getcwd()
filename = 'data.txt'
xet_repo_path = os.path.join(local_path, XET_REPO)
filepath = os.path.join(xet_repo_path, filename)
iterations = 2

generator = NumericDataGenerator(cols=columns)
df = generator.generate(start_rows)
generator.export(df, filepath)

In [32]:
for iteration in tqdm(range(iterations)):
    start_time = time.time()
    git_upload(filepath, XET_REPO)
    print(f"gitxet upload time: {time.time()-start_time}")
    
    start_time = time.time()
    s3_upload(filepath, XET_REPO)
    print(f"S3 upload time: {time.time()-start_time}")
    
    generator.append(filename, n_rows_add)
    

  0%|                                                                                                                                         | 0/2 [00:00<?, ?it/s]

gitxet upload time - 9.5367431640625e-07


 50%|████████████████████████████████████████████████████████████████▌                                                                | 1/2 [00:01<00:01,  1.75s/it]

Completed 18.9 KiB/18.9 KiB (16.2 KiB/s) with 1 file(s) remainingupload: ./data.txt to s3://versioning-article/s3/data.txt        

S3 upload time - 1.7506687641143799
gitxet upload time - 4.0531158447265625e-06


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.74s/it]

Completed 18.9 KiB/18.9 KiB (17.2 KiB/s) with 1 file(s) remainingupload: ./data.txt to s3://versioning-article/s3/data.txt        

S3 upload time - 1.724302053451538



