## Setup

In [2]:
import os

In [5]:
!dir

 Volume in drive C is Windows-SSD
 Volume Serial Number is 0480-145C

 Directory of c:\Users\Vaishali Agarwal\OneDrive\Documents\CMI\Applied ML\Assignment_02\data_version_control

20-02-2024  23:47    <DIR>          .
20-02-2024  20:17    <DIR>          ..
20-02-2024  20:09    <DIR>          .dvc
20-02-2024  20:09               142 .dvcignore
20-02-2024  20:36    <DIR>          data
20-02-2024  23:47               234 prepare.ipynb
               2 File(s)            376 bytes
               4 Dir(s)  171,271,278,592 bytes free


In [11]:
!dvc init --subdir
!git init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>
Reinitialized existing Git repository in C:/Users/Vaishali Agarwal/OneDrive/Documents/CMI/Applied ML/Assignment_02/data_version_control/.git/


In [14]:
!git status
!git commit -m "DVC initialized"

On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	new file:   .dvc/.gitignore
	new file:   .dvc/config
	new file:   .dvcignore

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/config

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	data/
	prepare.ipynb

[master (root-commit) 49fe1d0] DVC initialized
 3 files changed, 6 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore


#### Setting Google Drive as remote storage

In [81]:
!dvc remote add -d remote_storage gdrive://10erQZzAuN836exH6vS-VsMSXd0dJytwt -f

Setting 'remote_storage' as a default remote.


In [33]:
!git add .dvc/config
!git status
!git commit -m "Updated remote storage"

On branch master
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	modified:   .dvc/config

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	data/raw/emails.csv
	prepare.ipynb

[master 9469b5b] Updated remote storage
 1 file changed, 1 insertion(+), 1 deletion(-)


## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import string
import re

from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet", quiet = True)
nltk.download("stopwords", quiet = True)

## Pre-process data

In [12]:
def preprocess(data, max_length = 20000):
    """Clean the "text" column of an e-mail data frame and return the cleaned frame.

    Steps, in order: drop rows whose text has ``max_length`` or more characters,
    lower-case, strip punctuation, remove the first stand-alone "subject" and
    "re" words, strip digits, tokenize on whitespace, drop English stopwords,
    lemmatize with WordNet, drop one-character tokens and re-join the tokens
    with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named "text". The frame is not modified.
    max_length : int, default 20000
        Rows whose text is this long or longer are dropped. Rows with a
        missing text are dropped as well.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and the cleaned "text" column;
        all other columns are carried over unchanged.
    """
    ## keep only the texts shorter than max_length; the boolean mask builds a new
    ## frame, so no helper "length" column is written into the caller's frame
    ## (a missing text has length NaN, fails the comparison and is dropped)
    data = data[data["text"].str.len() < max_length].reset_index(drop = True)

    ## convert the text into lower case
    text = data["text"].str.lower()

    ## remove the punctuations from the text column
    remove_punc = str.maketrans("", "", string.punctuation)
    text = text.apply(lambda x: x.translate(remove_punc))

    ## remove the first occurrence of the words "subject" and "re";
    ## \b restricts the match to whole words, so the "re" inside words such as
    ## "great" or "free" is left alone
    text = text.apply(lambda x: re.sub(r"\bsubject\b", "", x, count = 1))
    text = text.apply(lambda x: re.sub(r"\bre\b", "", x, count = 1))

    ## remove numbers from the text column
    text = text.apply(lambda x: re.sub(r"\d+", "", x))

    ## a set gives constant-time membership tests for the stopword filter
    stopwords = set(nltk.corpus.stopwords.words("english"))
    tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()

    def clean_tokens(x):
        ## split on whitespace, drop stopwords, lemmatize what is left,
        ## then drop single character words and join the tokens into a string
        tokens = [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(x) if word not in stopwords]
        return " ".join(word for word in tokens if len(word) > 1)

    return data.assign(text = text.apply(clean_tokens))

In [13]:
def train_val_test(data, seed, test_size = 0.1, val_size = 0.11):
    """Split ``data`` into train, validation and test subsets.

    The test subset is split off first; the validation subset is then split
    off from the rows that remain. ``val_size`` is therefore a fraction of the
    non-test rows: with the defaults, 0.9 * 0.11 = 0.099 ~ 0.1 of all rows.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Data to split; passed straight to ``train_test_split``.
    seed : int
        Passed as ``random_state`` to both splits.
    test_size : float, default 0.1
        ``test_size`` of the first split (test vs. the rest).
    val_size : float, default 0.11
        ``test_size`` of the second split (validation vs. train).

    Returns
    -------
    tuple
        ``(train, val, test)``.
    """
    remaining, test = train_test_split(data, test_size = test_size, random_state = seed)
    train, val = train_test_split(remaining, test_size = val_size, random_state = seed)
    return train, val, test

In [41]:
## read the raw e-mails from disk
data = pd.read_csv("./data/raw/emails.csv")

## run the cleaning pipeline; note that raw_data holds the pre-processed frame
raw_data = preprocess(data)

## train / validation / test split, made with random seed 64
train, val, test = train_val_test(raw_data, seed = 64)

#### Save the data

In [42]:
## make sure the output folders exist; to_csv does not create missing
## directories and raises an OSError instead
os.makedirs("./data/raw", exist_ok = True)
os.makedirs("./data/prepared", exist_ok = True)

## save the pre-processed data in the raw folder
raw_data.to_csv("./data/raw/raw_data.csv", header = True, index = False)

## save the split data in the prepared folder
train.to_csv("./data/prepared/train.csv", header = True, index = False)
val.to_csv("./data/prepared/validation.csv", header = True, index = False)
test.to_csv("./data/prepared/test.csv", header = True, index = False)

## Track data versions with dvc

In [4]:
## remember the starting directory so a later cell can switch back to it,
## then move into the raw-data folder for the dvc/git shell commands that follow
current_dir = os.getcwd()
os.chdir("./data/raw")

### For raw data

In [48]:
!dvc add raw_data.csv


To track the changes with git, run:

	git add raw_data.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [49]:
!git status
!git add raw_data.csv.dvc .gitignore
!git commit -m "updates raw data"

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../
	../../prepare.ipynb

nothing added to commit but untracked files present (use "git add" to track)
[master 9242445] updates raw data
 2 files changed, 6 insertions(+)
 create mode 100644 data/raw/.gitignore
 create mode 100644 data/raw/raw_data.csv.dvc


### For split data

In [5]:
## go back to the saved starting directory, then into the folder that holds
## the train / validation / test files
os.chdir(current_dir)
os.chdir("./data/prepared")

In [58]:
!dvc add train.csv validation.csv test.csv


To track the changes with git, run:

	git add .gitignore validation.csv.dvc test.csv.dvc train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [59]:
!git status
!git add train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore
!git commit -m "initialize data versioning with dvc"

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	./
	../raw/emails.csv
	../../prepare.ipynb

nothing added to commit but untracked files present (use "git add" to track)
[master 0070648] initialize data versioning with dvc
 4 files changed, 18 insertions(+)
 create mode 100644 data/prepared/.gitignore
 create mode 100644 data/prepared/test.csv.dvc
 create mode 100644 data/prepared/train.csv.dvc
 create mode 100644 data/prepared/validation.csv.dvc


In [82]:
!dvc push

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

Authentication successful.
4 files pushed




### Data versions with different random seed

In [84]:
## split the same pre-processed data again, this time with random seed 74
train, val, test = train_val_test(raw_data, 74)

In [86]:
## overwrite the three split files; the paths are relative to the current
## working directory, which the earlier os.chdir cell set to data/prepared
for frame, path in [(train, "./train.csv"), (val, "./validation.csv"), (test, "./test.csv")]:
    frame.to_csv(path, header = True, index = False)

In [87]:
## add updated data splits to DVC
!dvc add train.csv validation.csv test.csv


To track the changes with git, run:

	git add validation.csv.dvc test.csv.dvc train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true


⠋ Checking graph



In [88]:
!git status
!git add train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore
!git commit -m "updated data splits with new random seed"

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   ../../.dvc/config
	modified:   test.csv.dvc
	modified:   train.csv.dvc
	modified:   validation.csv.dvc

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../raw/emails.csv
	../../prepare.ipynb

no changes added to commit (use "git add" and/or "git commit -a")
[master 2327bbc] updated data splits with new random seed
 3 files changed, 6 insertions(+), 6 deletions(-)


In [89]:
## push the new data split version to remote storage
!dvc push

3 files pushed


## Checkout the versions

In [14]:
!git log

commit 2327bbc7ee40810db990f0f4b7751acbcc4862ee
Author: vaishaliag08 <vaishaliagarwal2008@gmail.com>
Date:   Wed Feb 21 19:29:56 2024 +0530

    updated data splits with new random seed

commit 007064838b060a1860b9418f6dded7280ce3607e
Author: vaishaliag08 <vaishaliagarwal2008@gmail.com>
Date:   Wed Feb 21 17:31:11 2024 +0530

    initialize data versioning with dvc

commit 9242445560b0d4cf7319adffb27f610543b42270
Author: vaishaliag08 <vaishaliagarwal2008@gmail.com>
Date:   Wed Feb 21 17:24:13 2024 +0530

    updates raw data

commit cdc86d9b7de5ca58e671de1411081626b6ef66cb
Author: vaishaliag08 <vaishaliagarwal2008@gmail.com>
Date:   Wed Feb 21 15:48:20 2024 +0530

    Updated remote storage

commit 49fe1d00f8f206d5682092ee5be9c1b3995233ed
Author: vaishaliag08 <vaishaliagarwal2008@gmail.com>
Date:   Wed Feb 21 15:44:48 2024 +0530

    DVC initialized


#### First split

In [15]:
!git checkout 007064838b060a1860b9418f6dded7280ce3607e train.csv.dvc validation.csv.dvc test.csv.dvc

Updated 0 paths from 6ea6557


In [16]:
!dvc checkout

In [17]:
## load the split files that dvc checkout restored into the working directory
train, val, test = (
    pd.read_csv(path) for path in ("./train.csv", "./validation.csv", "./test.csv")
)

In [25]:
## print the class balance of every split; value_counts is computed once per
## split and .get(label, 0) reports 0 instead of raising a KeyError when a
## class is absent from a split
for title, split in [("Train distribution:", train), ("Validation distribution:", val), ("Test distribution:", test)]:
    counts = split["spam"].value_counts()
    print(title)
    print("Number of 0s: {}".format(counts.get(0, 0)))
    print("Number of 1s: {}".format(counts.get(1, 0)))
    print("-"*32)

Train distribution:
Number of 0s: 3487
Number of 1s: 1095
--------------------------------
Validation distribution:
Number of 0s: 439
Number of 1s: 128
--------------------------------
Test distribution:
Number of 0s: 430
Number of 1s: 143
--------------------------------


#### Second split

In [26]:
!git checkout 2327bbc7ee40810db990f0f4b7751acbcc4862ee train.csv.dvc validation.csv.dvc test.csv.dvc

Updated 3 paths from ac98dfe


In [27]:
!dvc checkout

M       train.csv
M       test.csv
M       validation.csv


In [28]:
## load the split files that dvc checkout restored into the working directory
train, val, test = (
    pd.read_csv(path) for path in ("./train.csv", "./validation.csv", "./test.csv")
)

In [29]:
## print the class balance of every split; value_counts is computed once per
## split and .get(label, 0) reports 0 instead of raising a KeyError when a
## class is absent from a split
for title, split in [("Train distribution:", train), ("Validation distribution:", val), ("Test distribution:", test)]:
    counts = split["spam"].value_counts()
    print(title)
    print("Number of 0s: {}".format(counts.get(0, 0)))
    print("Number of 1s: {}".format(counts.get(1, 0)))
    print("-"*32)

Train distribution:
Number of 0s: 3479
Number of 1s: 1103
--------------------------------
Validation distribution:
Number of 0s: 438
Number of 1s: 129
--------------------------------
Test distribution:
Number of 0s: 439
Number of 1s: 134
--------------------------------


In [30]:
!git add 
!git commit

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   ../../.dvc/config

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../raw/emails.csv
	../../prepare.ipynb

no changes added to commit (use "git add" and/or "git commit -a")
