# Setup

In [1]:
XET_REPO = "xet_repo"
XET_REMOTE = "https://xethub.com/xdssio/xethub-git" # Change
S3_BUCKET = "s3://versioning-article/s3/" # Change

In [35]:
import numpy as np
import pandas as pd
import os
import os.path
import subprocess
import logging
import time
from tqdm import tqdm
from xetrack import Tracker
import pyxet

np.random.seed(1)


class NumericDataGenerator:

    def __init__(self, cols: int=10):
        self.cols = cols
        self.columns = [f"col{i}"  for i in range(self.cols)]

    def generate(self, rows: int = None):
        data = np.random.rand(rows, self.cols)
        return pd.DataFrame(data, columns=self.columns)
    
    def append(self, filename: str, rows: int):
        data = np.random.rand(rows, self.cols)
        df = pd.DataFrame(data, columns=self.columns)
        if os.path.exists(filename) and filename.endswith('.parquet'):
            former = pd.read_parquet(filename)
            df = pd.concat([former, df])
            df.to_parquet(filename, engine='pyarrow')
        else:
            df.to_csv(filename, mode='a', header=False, index=False)
            
    def export(self, df, filepath):
        if filepath.endswith('.parquet'):
            df.to_parquet(filepath, engine='pyarrow')
        else:
            df.to_csv(filepath, index=False)

def run(command:str, cwd:str):
    out = subprocess.run(command, shell=True, capture_output=True, cwd=cwd).stdout
    if out:
        out = out.decode()
    print(out)
    return out



def git_commit(filename: str, cwd:str):
    filename = os.path.basename(filepath)
    command = f"""                                
            git add {filename}
            git commit -m "upload"
            """
    return run(command, cwd)


def git_push(filename: str, cwd:str):
    filename = os.path.basename(filepath)
    command = f"""                                            
            git push
            """
    return run(command, cwd)



def s3_upload(filepath: str, cwd:str):
    command = f"aws s3 cp {filepath} {S3_BUCKET}{os.path.basename(filepath)}"
    return run(command, cwd)

gitxet_version = subprocess.run("git xet --version", shell=True, capture_output=True).stdout.decode('utf-8')

# Experiment

In [32]:
start_rows = 6000000  # ~1G
n_rows_add = 6000  # ~1MB
columns = 10
iterations = 10

In [37]:
start_rows = 1000000
n_rows_add = 10000
columns = 10
iterations = 2

filename = 'data.txt'
xet_repo_path = os.path.join(os.getcwd(), XET_REPO)
filepath = os.path.join(xet_repo_path, filename)


generator = NumericDataGenerator(cols=columns)
df = generator.generate(start_rows)
generator.export(df, filepath)

tracker = Tracker('stats.db', 
                  verbose=False, params={'workflow': 'numeric',
                              'n_rows_add': n_rows_add,
                              'start_rows': start_rows,
                              'columns': columns,
                              'filepath': filepath,
                              'pyxet': pyxet.__version__,
                              'gitxet': gitxet_version,
                              'merged': True,
                              'file_bytes': os.path.getsize(filepath),                                
                              })


for iteration in tqdm(range(iterations)):
    start_time = time.time()
    tracker.track(git_commit, args=[filepath, XET_REPO], params={'step':iteration, 'tech':'xethub'})    
    print(f"gitxet commit time: {time.time()-start_time}")
    
    start_time = time.time()
    tracker.track(git_push, args=[filepath, XET_REPO], params={'step':iteration, 'tech':'xethub'})    
    print(f"gitxet push time: {time.time()-start_time}")
    
    start_time = time.time()
    tracker.track(s3_upload, args=[filepath, XET_REPO], params={'step':iteration, 'tech':'s3'})    
    print(f"S3 upload time: {time.time()-start_time}")
    
    generator.append(filepath, n_rows_add)
    

  0%|                                                                                                                                         | 0/2 [00:00<?, ?it/s]

[main 3cdc06f] upload
 1 file changed, 3 insertions(+), 1101 deletions(-)

gitxet commit time: 4.170979976654053
b''
gitxet push time: 21.742537021636963


 50%|████████████████████████████████████████████████████████████████▌                                                                | 1/2 [00:42<00:42, 42.11s/it]

Completed 256.0 KiB/183.8 MiB (82.0 KiB/s) with 1 file(s) remainingCompleted 512.0 KiB/183.8 MiB (143.5 KiB/s) with 1 file(s) remainingCompleted 768.0 KiB/183.8 MiB (215.1 KiB/s) with 1 file(s) remainingCompleted 1.0 MiB/183.8 MiB (283.5 KiB/s) with 1 file(s) remaining  Completed 1.2 MiB/183.8 MiB (353.6 KiB/s) with 1 file(s) remaining  Completed 1.5 MiB/183.8 MiB (418.9 KiB/s) with 1 file(s) remaining  Completed 1.8 MiB/183.8 MiB (485.5 KiB/s) with 1 file(s) remaining  Completed 2.0 MiB/183.8 MiB (554.2 KiB/s) with 1 file(s) remaining  Completed 2.2 MiB/183.8 MiB (623.1 KiB/s) with 1 file(s) remaining  Completed 2.5 MiB/183.8 MiB (689.0 KiB/s) with 1 file(s) remaining  Completed 2.8 MiB/183.8 MiB (595.7 KiB/s) with 1 file(s) remaining  Completed 3.0 MiB/183.8 MiB (600.5 KiB/s) with 1 file(s) remaining  Completed 3.2 MiB/183.8 MiB (647.9 KiB/s) with 1 file(s) remaining  Completed 3.5 MiB/183.8 MiB (695.9 KiB/s) with 1 file(s) remaining  Completed 3.8 MiB/183.8 MiB (738.0 

[main cc5202d] upload
 1 file changed, 2 insertions(+), 2 deletions(-)

gitxet commit time: 9.113176107406616
b''
gitxet push time: 20.932862997055054


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:28<00:00, 44.22s/it]

Completed 256.0 KiB/185.6 MiB (84.3 KiB/s) with 1 file(s) remainingCompleted 512.0 KiB/185.6 MiB (144.1 KiB/s) with 1 file(s) remainingCompleted 768.0 KiB/185.6 MiB (215.0 KiB/s) with 1 file(s) remainingCompleted 1.0 MiB/185.6 MiB (286.4 KiB/s) with 1 file(s) remaining  Completed 1.2 MiB/185.6 MiB (356.6 KiB/s) with 1 file(s) remaining  Completed 1.5 MiB/185.6 MiB (425.3 KiB/s) with 1 file(s) remaining  Completed 1.8 MiB/185.6 MiB (481.8 KiB/s) with 1 file(s) remaining  Completed 2.0 MiB/185.6 MiB (549.6 KiB/s) with 1 file(s) remaining  Completed 2.2 MiB/185.6 MiB (618.2 KiB/s) with 1 file(s) remaining  Completed 2.5 MiB/185.6 MiB (686.9 KiB/s) with 1 file(s) remaining  Completed 2.8 MiB/185.6 MiB (614.0 KiB/s) with 1 file(s) remaining  Completed 3.0 MiB/185.6 MiB (603.4 KiB/s) with 1 file(s) remaining  Completed 3.2 MiB/185.6 MiB (651.4 KiB/s) with 1 file(s) remaining  Completed 3.5 MiB/185.6 MiB (699.7 KiB/s) with 1 file(s) remaining  Completed 3.8 MiB/185.6 MiB (746.1 




In [38]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe"


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

results = tracker.to_df()
results['name'] = results['name'].str.replace('_upload', '').str.replace('_',' ')
results['mb/s'] = results['file_bytes']/results['time']

print(f"Steps: {results['step'].max()+1}")
print(f"Data size: {len(df)}")
print("\nTime per tech - lower is better")
px.bar(results, x='name', y='time', color='tech').show()

Steps: 2
Data size: 1000000

Time per tech - lower is better


# Manuel

In [None]:
!(cd xet_repo && git add . && git commit -m "deleting" && git push)

In [41]:
start_rows = 6000000
n_rows_add = 60000
columns = 10

filename = 'data.txt'
xet_repo_path = os.path.join(os.getcwd(), XET_REPO)
filepath = os.path.join(xet_repo_path, filename)


generator = NumericDataGenerator(cols=columns)
df = generator.generate(start_rows)
generator.export(df, filepath)

In [44]:
generator.append(filepath, n_rows_add)

In [45]:
!(cd xet_repo && git add . && git commit -m "uploading")

git-xet 0.11.0 filter started



7[1F[1GXet: Deduplicating data blocks: 13.34 KiB | 13.34 KiB/s.87[1F[1GXet: Deduplicating data blocks: 6.30 MiB | 6.30 MiB/s.  87[1F[1GXet: Deduplicating data blocks: 12.89 MiB | 12.89 MiB/s.87[1F[1GXet: Deduplicating data blocks: 19.90 MiB | 19.90 MiB/s.87[1F[1GXet: Deduplicating data blocks: 27.29 MiB | 27.29 MiB/s.87[1F[1GXet: Deduplicating data blocks: 34.64 MiB | 34.64 MiB/s.87[1F[1GXet: Deduplicating data blocks: 41.20 MiB | 41.20 MiB/s.87[1F[1GXet: Deduplicating data blocks: 47.72 MiB | 47.72 MiB/s.87[1F[1GXet: Deduplicating data blocks: 52.31 MiB | 26.15 MiB/s.87[1F[1GXet: Deduplicating data blocks: 54.68 MiB | 27.34 MiB/s.87[1F[1GXet: Deduplicating data blocks: 56.96 MiB | 28.48 MiB/s.87[1F[1GXet: Deduplicating data blocks: 59.30 MiB | 29.65 MiB/s.87[1F[1GXet: Deduplicating data blocks: 61.38 MiB | 20.46 MiB/s.87[1F[1GXet: Deduplicating data blocks: 63.77 MiB | 21.26 MiB/s.87[1F[1GXet: Deduplicating data blocks: 66.11 

7[1F[1GXet: Deduplicating data blocks: 758.69 MiB | 25.29 MiB/s.87[1F[1GXet: Deduplicating data blocks: 765.37 MiB | 25.51 MiB/s.87[1F[1GXet: Deduplicating data blocks: 771.90 MiB | 25.73 MiB/s.87[1F[1GXet: Deduplicating data blocks: 778.70 MiB | 25.96 MiB/s.87[1F[1GXet: Deduplicating data blocks: 785.20 MiB | 25.33 MiB/s.87[1F[1GXet: Deduplicating data blocks: 791.74 MiB | 25.54 MiB/s.87[1F[1GXet: Deduplicating data blocks: 798.47 MiB | 25.76 MiB/s.87[1F[1GXet: Deduplicating data blocks: 804.77 MiB | 25.96 MiB/s.87[1F[1GXet: Deduplicating data blocks: 811.51 MiB | 25.36 MiB/s.87[1F[1GXet: Deduplicating data blocks: 818.11 MiB | 25.57 MiB/s.87[1F[1GXet: Deduplicating data blocks: 824.79 MiB | 25.77 MiB/s.87[1F[1GXet: Deduplicating data blocks: 831.49 MiB | 25.98 MiB/s.87[1F[1GXet: Deduplicating data blocks: 838.11 MiB | 25.40 MiB/s.87[1F[1GXet: Deduplicating data blocks: 844.77 MiB | 25.60 MiB/s.87[1F[1GXet: Deduplicating data 

7[1F[1GXet: Deduplicating data blocks: 1.06 GiB | 18.05 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.06 GiB | 18.08 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.06 GiB | 18.11 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.06 GiB | 18.14 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.06 GiB | 17.87 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.07 GiB | 17.89 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.07 GiB | 17.92 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.07 GiB | 17.95 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.07 GiB | 17.69 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.07 GiB | 17.72 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.07 GiB | 17.74 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.08 GiB | 17.77 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.08 GiB | 17.52 MiB/s.   87[1F[1GXet: Deduplicating data blocks: 1.08 GiB | 17.56 MiB/s.   87[1F[1GXet: Dedup

In [46]:
!(cd xet_repo && GIT_TRACE=2 GIT_CURL_VERBOSE=2 GIT_TRACE_PERFORMANCE=2 GIT_TRACE_PACK_ACCESS=2 GIT_TRACE_PACKET=2 GIT_TRACE_PACKFILE=2 GIT_TRACE_SETUP=2 GIT_TRACE_SHALLOW=2 git push)

00:57:55.137805 trace.c:315             setup: git_dir: .git
00:57:55.138031 trace.c:316             setup: git_common_dir: .git
00:57:55.138037 trace.c:317             setup: worktree: /Users/yonatanalexander/development/xethub/versioning-article/notebooks/xet_repo
00:57:55.138039 trace.c:318             setup: cwd: /Users/yonatanalexander/development/xethub/versioning-article/notebooks/xet_repo
00:57:55.138041 trace.c:319             setup: prefix: (null)
00:57:55.138043 git.c:462               trace: built-in: git push
00:57:55.138690 run-command.c:661       trace: run_command: GIT_DIR=.git git remote-https origin https://xdssio:zGzchrjQr9C5-DMp1jWNkA@xethub.com/xdssio/xethub-git.git
00:57:55.147118 git.c:748               trace: exec: git-remote-https origin https://xdssio:zGzchrjQr9C5-DMp1jWNkA@xethub.com/xdssio/xethub-git.git
00:57:55.147323 run-command.c:661       trace: run_command: git-remote-https origin https://xdssio:zGzchrjQr9C5-DMp1jWNkA@xethub.com/xdssio/xethub-git.git
0

00:57:56.530773 http.c:764              <= Recv header, 0000000013 bytes (0x0000000d)
00:57:56.530803 http.c:776              <= Recv header: HTTP/2 200
00:57:56.530809 http.c:764              <= Recv header, 0000000037 bytes (0x00000025)
00:57:56.530812 http.c:776              <= Recv header: date: Tue, 22 Aug 2023 22:57:56 GMT
00:57:56.530817 http.c:764              <= Recv header, 0000000060 bytes (0x0000003c)
00:57:56.530820 http.c:776              <= Recv header: content-type: application/x-git-receive-pack-advertisement
00:57:56.530827 http.c:764              <= Recv header, 0000000021 bytes (0x00000015)
00:57:56.530830 http.c:776              <= Recv header: content-length: 660
00:57:56.530834 http.c:764              <= Recv header, 0000000031 bytes (0x0000001f)
00:57:56.530837 http.c:776              <= Recv header: server: nginx/1.18.0 (Ubuntu)
00:57:56.530841 http.c:764              <= Recv header, 0000000053 bytes (0x00000035)
00:57:56.530844 http.c:776              <= Recv 

00:58:16.963343 http.c:764              <= Recv header, 0000000013 bytes (0x0000000d)
00:58:16.963376 http.c:776              <= Recv header: HTTP/2 200
00:58:16.963383 http.c:764              <= Recv header, 0000000037 bytes (0x00000025)
00:58:16.963387 http.c:776              <= Recv header: date: Tue, 22 Aug 2023 22:58:16 GMT
00:58:16.963392 http.c:764              <= Recv header, 0000000053 bytes (0x00000035)
00:58:16.963396 http.c:776              <= Recv header: content-type: application/x-git-receive-pack-result
00:58:16.963404 http.c:764              <= Recv header, 0000000021 bytes (0x00000015)
00:58:16.963408 http.c:776              <= Recv header: content-length: 113
00:58:16.963412 http.c:764              <= Recv header, 0000000031 bytes (0x0000001f)
00:58:16.963416 http.c:776              <= Recv header: server: nginx/1.18.0 (Ubuntu)
00:58:16.963420 http.c:764              <= Recv header, 0000000039 bytes (0x00000027)
00:58:16.963424 http.c:776              <= Recv header:

In [47]:
generator.append(filepath, n_rows_add)

In [51]:
st = time.time()
command = f"""                                
        git add {filename}
        git commit -m "upload"
        """
out = subprocess.run(command, shell=True, capture_output=True, cwd=XET_REPO).stdout
print(out)
print(time.time()-st)

b'[main 0ecc779] upload\n 1 file changed, 2 insertions(+), 2 deletions(-)\n'


In [52]:
st = time.time()
command = f"""                                        
        GIT_TRACE=2 GIT_CURL_VERBOSE=2 GIT_TRACE_PERFORMANCE=2 GIT_TRACE_PACK_ACCESS=2 GIT_TRACE_PACKET=2 GIT_TRACE_PACKFILE=2 GIT_TRACE_SETUP=2 GIT_TRACE_SHALLOW=2 git push
        """
out = subprocess.run(command, shell=True, capture_output=True, cwd=XET_REPO).stdout
print(out)
print(time.time()-st)

b''
26.02121114730835


In [53]:
generator.append(filepath, n_rows_add)

In [54]:
st = time.time()
command = f"""                                
        git add {filename}
        git commit -m "upload"
        """
out = subprocess.run(command, shell=True, capture_output=True, cwd=XET_REPO).stdout
print(out)
print(time.time()-st)

b'[main 09a0699] upload\n 1 file changed, 2 insertions(+), 2 deletions(-)\n'
86.66036415100098


In [55]:
st = time.time()
command = f"""                                        
        git push
        """
out = subprocess.run(command, shell=True, capture_output=True, cwd=XET_REPO).stdout
print(out)
print(time.time()-st)

b''
24.28293490409851
