# Assignment 1: Data Versioning and Differential Privacy
## Peter Ye

In [1]:
import os

import lakefs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Tool 1 - LakeFS

### Task 1

In [2]:
# Load the raw athletes dataset from the working directory; this frame is "version 1".
datav1 = pd.read_csv('athletes.csv')

In [3]:
# Preview the first 10 rows of the raw data.
datav1.head(10)

Unnamed: 0,athlete_id,name,region,team,affiliate,gender,age,height,weight,fran,...,snatch,deadlift,backsq,pullups,eat,train,background,experience,schedule,howlong
0,2554.0,Pj Ablang,South West,Double Edge,Double Edge CrossFit,Male,24.0,70.0,166.0,,...,,400.0,305.0,,,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|I r...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|
1,3517.0,Derek Abdella,,,,Male,42.0,70.0,190.0,,...,,,,,,I have a coach who determines my programming|I...,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 2x a week|,4+ years|
2,4691.0,,,,,,,,,,...,,,,,,,,,,
3,5164.0,Abo Brandon,Southern California,LAX CrossFit,LAX CrossFit,Male,40.0,67.0,,211.0,...,200.0,375.0,325.0,25.0,I eat 1-3 full cheat meals per week|,I workout mostly at a CrossFit Affiliate|I hav...,I played youth or high school level sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|,4+ years|
4,5286.0,Bryce Abbey,,,,Male,32.0,65.0,149.0,206.0,...,150.0,,325.0,50.0,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I inc...,I played college sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I strictly s...,1-2 years|
5,6491.0,Jason Abney,,,,Male,37.0,73.0,230.0,,...,200.0,435.0,414.0,,I eat strict Paleo|,I workout mostly at a CrossFit Affiliate|I rec...,I played youth or high school level sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I typically ...,4+ years|
6,6610.0,Anthony Abeel,South Central,Get Lifted,Get Lifted CrossFit,Male,21.0,72.0,175.0,,...,0.0,0.0,0.0,0.0,I eat quality foods but don't measure the amou...,I workout mostly at a CrossFit Affiliate|I hav...,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|
7,7268.0,,,,,,,,,,...,,,,,,,,,,
8,7463.0,Tye Abell,,,,Male,30.0,72.0,175.0,,...,0.0,0.0,0.0,0.0,,,,,,
9,8242.0,Ryan Achilles,,,,Male,40.0,68.0,177.0,205.0,...,185.0,365.0,365.0,,I eat quality foods but don't measure the amount|,I workout mostly at a CrossFit Affiliate|I rec...,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I typically ...,2-4 years|


In [4]:
# List every column available in the raw data.
datav1.columns

Index(['athlete_id', 'name', 'region', 'team', 'affiliate', 'gender', 'age',
       'height', 'weight', 'fran', 'helen', 'grace', 'filthy50', 'fgonebad',
       'run400', 'run5k', 'candj', 'snatch', 'deadlift', 'backsq', 'pullups',
       'eat', 'train', 'background', 'experience', 'schedule', 'howlong'],
      dtype='object')

* CandJ (Clean and Jerk): Maximum weight the athlete can lift in a clean and jerk movement.
* Snatch: Maximum weight the athlete can lift in the snatch movement.
* Deadlift: Maximum weight the athlete can deadlift.
* Backsq (Back Squat): Maximum weight the athlete can back squat.

### Task 2

In [5]:
# Build datav2 from datav1: keep only rows that have a value in every
# required column, then discard the columns that are not used afterwards.
required_columns = ['region', 'age', 'weight', 'height', 'howlong', 'gender', 'eat',
                    'train', 'background', 'experience', 'schedule',
                    'deadlift', 'candj', 'snatch', 'backsq']
unused_columns = ['affiliate', 'team', 'name', 'athlete_id', 'fran', 'helen', 'grace',
                  'filthy50', 'fgonebad', 'run400', 'run5k', 'pullups', 'train']
datav2 = datav1.dropna(subset=required_columns).drop(columns=unused_columns)

In [6]:
# Remove outliers: each filter keeps only rows whose value lies in a plausible range.
datav2 = datav2[datav2['weight'] < 1500]
datav2 = datav2[datav2['gender'] != '--']
datav2 = datav2[datav2['age'] >= 18]
datav2 = datav2[(datav2['height'] < 96) & (datav2['height'] > 48)]

# The deadlift ceiling depends on gender: 636 for 'Female' rows, 1105 otherwise.
# The previous condition OR-ed the female limit with the general one, so the
# 636 limit never excluded anything; building a per-row cap applies it properly.
deadlift_cap = np.where(datav2['gender'] == 'Female', 636, 1105)
datav2 = datav2[(datav2['deadlift'] > 0) & (datav2['deadlift'] <= deadlift_cap)]

datav2 = datav2[(datav2['candj'] > 0) & (datav2['candj'] <= 395)]
datav2 = datav2[(datav2['snatch'] > 0) & (datav2['snatch'] <= 496)]
datav2 = datav2[(datav2['backsq'] > 0) & (datav2['backsq'] <= 1069)]

In [7]:
# Clean survey data: an answer that is exactly 'Decline to answer|' is turned
# into NaN, after which rows missing any survey answer are dropped.
decline_dict = {'Decline to answer|': np.nan}
survey_columns = ['background', 'experience', 'schedule', 'howlong', 'eat']
datav2 = datav2.replace(decline_dict).dropna(subset=survey_columns)

In [8]:
# Preview the first 10 rows of the cleaned data.
datav2.head(10)

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong
21,Southern California,Male,30.0,71.0,200.0,235.0,175.0,385.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 1x a week|I ty...,1-2 years|
22,Africa,Male,28.0,70.0,176.0,187.0,134.0,335.0,254.0,I eat 1-3 full cheat meals per week|,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 1x a week|,2-4 years|
27,North East,Male,35.0,68.0,225.0,285.0,205.0,440.0,405.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I typically rest 4 or more days per month|,2-4 years|
50,North Central,Male,36.0,71.0,199.0,267.0,212.0,485.0,390.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,1-2 years|
60,North East,Male,36.0,64.0,155.0,245.0,180.0,415.0,385.0,I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 2x a week|I st...,4+ years|
78,North Central,Male,29.0,69.0,190.0,275.0,205.0,485.0,475.0,I eat quality foods but don't measure the amou...,I played college sports|I regularly play recre...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a week|,6-12 months|
101,South East,Male,44.0,69.0,168.0,245.0,185.0,435.0,405.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I usually only do 1 workout a day|I typically ...,4+ years|
102,South Central,Male,24.0,68.0,180.0,315.0,240.0,475.0,405.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|
103,South Central,Male,26.0,70.0,197.0,325.0,250.0,500.0,425.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|,4+ years|
110,South West,Male,31.0,67.0,150.0,230.0,175.0,410.0,330.0,I eat quality foods but don't measure the amount|,I played college sports|,I began CrossFit by trying it alone (without a...,I usually only do 1 workout a day|I do multipl...,2-4 years|


### Task 3

In [9]:
# Add a 'Total Lift' column (candj + snatch + deadlift + backsq) to both
# versions. Plain `+` is used, so a missing lift makes the total NaN.
for frame in (datav1, datav2):
    frame['Total Lift'] = (
        frame['candj'] + frame['snatch'] + frame['deadlift'] + frame['backsq']
    )

In [10]:
# Fraction of rows assigned to the training set.
train_ratio = 0.8

# Both versions are split with the same ratio and the same fixed seed.
split_options = dict(train_size=train_ratio, random_state=42)
train_datav1, test_datav1 = train_test_split(datav1, **split_options)
train_datav2, test_datav2 = train_test_split(datav2, **split_options)

### Task 4

In [11]:
# Write each version of the dataset to a CSV file in the working directory.
for csv_name, frame in (("athletes_datav1.csv", datav1),
                        ("athletes_datav2.csv", datav2)):
    frame.to_csv(csv_name)

In [12]:
# Establish connection to lakeFS (local server running on Docker as example).
# Endpoint and credentials are taken from the LAKECTL_* environment variables
# so that real keys never have to be committed with the notebook. The fallback
# values are the ones this notebook originally used for its local example
# server (NOTE(review): they look like the lakeFS quickstart sample keys —
# confirm they are not real credentials before sharing).
from lakefs.client import Client

clt = Client(
    host=os.environ.get("LAKECTL_SERVER_ENDPOINT_URL", "127.0.0.1:8000"),
    username=os.environ.get("LAKECTL_CREDENTIALS_ACCESS_KEY_ID", "AKIAIOSFOLQUICKSTART"),
    password=os.environ.get(
        "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY",
        "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
    ),
)

In [14]:
# Create the "athletes" repository; exist_ok=True is passed so the call is
# meant to be repeatable when the repository is already there.
repo = lakefs.Repository("athletes", client=clt).create(
    storage_namespace="local://athletes_data_storage",
    exist_ok=True,
)

In [15]:
# List current repositories.
# The loop variable is deliberately NOT called `repo`: that name holds the
# "athletes" repository handle used by the later branch/merge cells, and
# reusing it here would rebind it to whichever repository is listed last.
print("Listing repositories:")
for listed_repo in lakefs.repositories(client=clt):
    print(listed_repo)

Listing repositories:
{'id': 'athletes', 'creation_date': 1729153245, 'default_branch': 'main', 'storage_namespace': 'local://athletes_data_storage'}


In [16]:
# Upload the locally saved datav1 CSV to the "main" branch. The file's real
# bytes are written (previously a placeholder string was stored instead of
# the dataset).
main_branch = lakefs.Repository("athletes", client=clt).branch("main")
with open("athletes_datav1.csv", "rb") as local_csv:
    with main_branch.object("athletes_datav1.csv").writer(mode="wb") as f:
        f.write(local_csv.read())

# Commit the change
main_branch.commit("added my datav1 athletes_datav1.csv")

Reference(repository="athletes", id="cf90f3a1242909d325665fb8d337ec284ef02bc6047fabb9aa2f9db44d874b1a")

In [17]:
# Create a branch called "datav2" that starts from "main".
# exist_ok=True lets the cell be re-run after the branch has been created,
# matching the exist_ok=True used when creating the repository.
repo.branch('datav2').create(source_reference='main', exist_ok=True)

Branch(repository="athletes", id="datav2")

In [18]:
# Upload the locally saved datav2 CSV to the "datav2" branch. The file's real
# bytes are written (previously a placeholder string was stored instead of
# the dataset).
datav2_branch = lakefs.Repository("athletes", client=clt).branch("datav2")
with open("athletes_datav2.csv", "rb") as local_csv:
    with datav2_branch.object("athletes_datav2.csv").writer(mode="wb") as f:
        f.write(local_csv.read())

# Commit the change
datav2_branch.commit("added my datav2 athletes_datav2.csv")

Reference(repository="athletes", id="b6174e420e9408138c52bee5f570667382e89a920e48d4cf329a87e3941afe7f")

In [19]:
# Merge the "datav2" branch into the "main" branch.
# `src` is the branch being merged; `dst` is the branch that receives it.
src = repo.branch('datav2')
dst = repo.branch('main')
# Merge only when diffing main against datav2 yields at least one entry.
# NOTE(review): presumably this avoids an error on a merge with no changes —
# confirm against the lakeFS SDK documentation.
if any(dst.diff(src)):
    src.merge_into(dst)

In [20]:
# List the objects currently stored in the "main" branch by printing the path
# of every entry returned by `branch.objects()`.
branch = repo.branch('main')
for entry in branch.objects():
    print(entry.path)

athletes_datav1.csv
athletes_datav2.csv


### Task 5