In [None]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# End-to-End Experiment Machine Learning - Dataset Versioning

In this notebook, we will use **Data Version Control (DVC)** as a dedicated version control tool for our datasets.

## Outline

* Setting up DVC
* Data versioning
* Data and model access

## Constants & Hyperparameters

Let's define some constants and hyperparameters that we'll use throughout this demo.

In [None]:
# Directory layout, resolved relative to the current working directory.
DATA_ROOT_DIR = Path("..", "data")
DATA_RAW_DIR = DATA_ROOT_DIR.joinpath("raw")
DATA_SPLIT_DIR = DATA_ROOT_DIR.joinpath("split")

# Input CSV files located in the raw-data directory.
PHISHING_DATA_PATH = DATA_RAW_DIR.joinpath("phishing.csv")
AIRBNB_DATA_PATH = DATA_RAW_DIR.joinpath("airbnb.csv")

## Dataset

In [None]:
# Load the phishing CSV into a DataFrame.
phishing_data = pd.read_csv(filepath_or_buffer=PHISHING_DATA_PATH)

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
phishing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11350 entries, 0 to 11349
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   target        11350 non-null  int64  
 1   month         11350 non-null  int64  
 2   scrape_date   11350 non-null  object 
 3   ext           11350 non-null  object 
 4   urlLength     11350 non-null  int64  
 5   numDigits     11350 non-null  int64  
 6   numParams     11350 non-null  int64  
 7   num_%20       11350 non-null  int64  
 8   num_@         11350 non-null  int64  
 9   entropy       11350 non-null  float64
 10  has_ip        11350 non-null  int64  
 11  hasHttp       11350 non-null  bool   
 12  hasHttps      11350 non-null  bool   
 13  urlIsLive     11350 non-null  bool   
 14  dsr           11350 non-null  int64  
 15  dse           11350 non-null  int64  
 16  bodyLength    11350 non-null  int64  
 17  numTitles     11350 non-null  int64  
 18  numImages     11350 non-nu

In [None]:
# Preview 10 randomly drawn rows; the fixed seed keeps the preview reproducible.
phishing_data.sample(n=10, random_state=11)

Unnamed: 0,target,month,scrape_date,ext,urlLength,numDigits,numParams,num_%20,num_@,entropy,...,dse,bodyLength,numTitles,numImages,numLinks,specialChars,scriptLength,sbr,bscr,sscr
9646,0,11,2019-11-20,other,89,8,1,0,0,-4.506572,...,0,169,2,0,0,29,0,0.0,0.171598,0.0
10622,0,12,2019-12-22,country,139,6,1,0,0,-4.504071,...,0,5205,5,1,1,986,2606,0.630816,0.213374,2.956389
1919,0,3,2019-03-04,country,137,6,0,0,0,-4.403196,...,0,4617,5,0,1,983,2911,0.630496,0.212909,2.961343
7137,0,8,2019-08-28,html,96,0,0,0,0,-4.141039,...,163,37,1,0,0,4,0,0.0,0.108108,0.0
3244,0,4,2019-04-17,com,92,4,0,0,0,-4.434921,...,48,15937,11,49,278,4124,7644,0.479639,0.258769,1.85354
83,0,1,2019-01-03,com,104,11,0,0,0,-4.671098,...,372,82401,12,28,69,21093,58723,0.712649,0.25598,2.227203
2561,0,3,2019-03-26,country,100,8,0,0,0,-4.607091,...,0,153173,1,0,191,32803,5406,0.035293,0.214157,0.164802
10202,0,12,2019-12-08,html,97,7,0,0,0,-4.280045,...,0,5767,10,16,119,1962,3284,0.474292,0.283362,1.673802
1301,0,2,2019-02-13,net,157,7,1,0,0,-4.816776,...,1177,48791,38,29,880,11736,20718,0.424627,0.240536,1.765337
6502,0,8,2019-08-05,com,105,0,0,0,0,-4.227846,...,32,0,0,0,0,0,0,0.0,0.0,0.0


## Split Dataset

In [None]:
# Hold out 20% of the rows as the test set (seed 11).
# `target` is the label; `scrape_date` is left out of the feature columns.
X_train, X_test, y_train, y_test = train_test_split(
    phishing_data.drop(["target", "scrape_date"], axis="columns"),
    phishing_data.loc[:, "target"],
    test_size=0.2,
    random_state=11,
)

In [None]:
# Re-attach the labels so each split is a single frame with `target` as the last column.
phishing_train = pd.concat([X_train, y_train], axis="columns")
phishing_test = pd.concat([X_test, y_test], axis="columns")

# Report the number of rows in each split.
print(f"Train size: {len(phishing_train)}")
print(f"Test size: {len(phishing_test)}")

Train size: 9080
Test size: 2270


## Save Split Dataset

In [None]:
# Create the output directory together with any missing parents.
# `exist_ok=True` makes the cell safe to re-run and removes the
# check-then-create race of a separate `exists()` test.
DATA_SPLIT_DIR.mkdir(parents=True, exist_ok=True)

# Sanity check: the path must be a directory (not just "exist") before
# anything is written into it.
assert DATA_SPLIT_DIR.is_dir()

In [None]:
# Write both splits as CSV files into the split directory, without the row index.
phishing_train.to_csv(DATA_SPLIT_DIR.joinpath("phishing_train.csv"), index=False)
phishing_test.to_csv(DATA_SPLIT_DIR.joinpath("phishing_test.csv"), index=False)

## Track Data

* To track the dataset, use the `dvc add` command. You can run it from a terminal with the command below:

    ```bash
    dvc add ../data/split/phishing_train.csv ../data/split/phishing_test.csv
    ```

    or from a Jupyter cell with the command:

    ```ipython
    !dvc add ../data/split/phishing_train.csv ../data/split/phishing_test.csv
    ```

    You should see output similar to the image below:
    ![](img/tracked_data_result.png)

* After that, there should be more files in the repository, in particular in the same directory as the tracked data.

    ![](img/dvc_files.png)

* Finally, to track the `.dvc` files of the tracked data with Git, we just add all the files in `data/split` and commit them.

    ![](img/add_dvc_files.png)

## Next Version

Let's split the dataset with a different seed to demonstrate how multiple versions are tracked.

In [None]:
# Same 80/20 split as before, but with seed 111 so the rows land in
# different partitions and the saved files change.
X_train, X_test, y_train, y_test = train_test_split(
    phishing_data.drop(["target", "scrape_date"], axis="columns"),
    phishing_data.loc[:, "target"],
    test_size=0.2,
    random_state=111,
)

In [None]:
# Rebuild the train/test frames from the new split, with `target` as the last column.
phishing_train = pd.concat([X_train, y_train], axis="columns")
phishing_test = pd.concat([X_test, y_test], axis="columns")

# Report the number of rows in each split.
print(f"Train size: {len(phishing_train)}")
print(f"Test size: {len(phishing_test)}")

Train size: 9080
Test size: 2270


In [None]:
# Overwrite the previously saved CSV files with the new split (row index omitted).
phishing_train.to_csv(DATA_SPLIT_DIR.joinpath("phishing_train.csv"), index=False)
phishing_test.to_csv(DATA_SPLIT_DIR.joinpath("phishing_test.csv"), index=False)

The resulting split now differs from the previous one because of the different `random_state`. DVC will detect this as a **modification**. Similar to `git status`, we can check the status of the DVC-tracked files with the `dvc status` command.

![dvc_status](img/dvc_status.png)

To track the new version, you need to:
* `dvc add` those new versions
* add a remote storage for the new versions and set it as the default remote storage
* `git add` and `git commit` the changes
* `dvc push` the new versions to the remote storage

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8f5c09b4-3349-4c4e-9128-93e08a4345f5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>