#### Init DVC

In [1]:
!dvc init --no-scm

Initialized DVC repository.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


#### Prepare the data

In [2]:
# Import libraries
import string
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Preprocess before loading into data frame
# Characters kept in the cleaned text. Everything else is dropped, including
# ',' and '"', which is why the rows below can be written without CSV quoting.
validchars = string.ascii_letters + string.digits + '\t .!?:()'

# Label prefix used in the raw file -> numeric label written to the CSV.
label_codes = {'spam': 1, 'ham': 0}


def preprocess_line(line):
    """Convert one raw ``label<TAB>message`` line into a ``text,label`` CSV row.

    Ellipses are blanked, emoticons / numbers / currency signs are replaced by
    the placeholder words ``emoji`` / ``number`` / ``currency``, characters
    outside ``validchars`` are removed and whitespace runs are collapsed.

    Returns the newline-terminated row, or ``None`` when the line does not
    start with a known label followed by a tab.
    """
    line = line.replace('...', ' ').replace('..', ' ')
    for emoticon in (':)', ':(', ':-)', ':-('):
        line = line.replace(emoticon, ' emoji ')
    line = re.sub(r'[+-]?((\d+\.?\d*)|(\.\d+))', ' number ', line)
    line = re.sub(r'[$£]+', ' currency ', line)
    line = ''.join(c for c in line if c in validchars)

    # Split on the FIRST tab only, so a message that itself contains
    # "spam<TAB>" or "ham<TAB>" is still parsed, and require the prefix to be
    # exactly a known label (not merely to start with those letters).
    label, tab, text = line.partition('\t')
    if not tab or label not in label_codes:
        return None

    text = re.sub(r'\s+', ' ', text).strip()
    return f"{text},{label_codes[label]}\n"


with open('data//data.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

processed_lines = []
for line in lines:
    row = preprocess_line(line)
    if row is None:
        # Report and skip: writing an unlabelled, newline-less line into the
        # output would merge it with the following row and corrupt the CSV.
        print(f"Error: {line.strip()}")
    else:
        processed_lines.append(row)

with open('data//processed_data.csv', 'w', encoding='utf-8') as f:
    f.write("text,label\n")
    f.writelines(processed_lines)

In [3]:
# Read the data
# Loads the preprocessed CSV (header row: text,label) into a DataFrame.
df = pd.read_csv('data//processed_data.csv')

In [4]:
# Print the frame's shape, then display its last 10 rows
print(df.shape)
df.tail(10)

(5574, 2)


Unnamed: 0,text,label
5564,Ok lor Sony ericsson salesman I ask shuhui the...,0
5565,Ard number like dat lor.,0
5566,Why dont you wait til at least wednesday to se...,0
5567,Huh y lei,0
5568,REMINDER FROM O number : To get number pounds ...,1
5569,This is the number nd time we have tried numbe...,1
5570,Will b going to esplanade fr home?,0
5571,Pity was in mood for that. So any other sugges...,0
5572,The guy did some bitching but I acted like id ...,0
5573,Rofl. Its true to its name,0


In [5]:
# There are duplicated rows, which could end up in more than one split and
# leak between train/validate/test, so they are dropped in the next cell.
# Print the number of fully duplicated rows, then summarise `text` per label.
print(df.duplicated().sum())
df.groupby('label').describe()

454


Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4827,4511,Sorry Ill call later,30
1,747,609,PRIVATE! Your number Account Statement for sho...,4


In [6]:
# Remove exact duplicate rows and renumber the index from 0,
# then show the new shape and the last 10 rows.
df = df.drop_duplicates(ignore_index=True)
print(df.shape)
df.tail(10)

(5120, 2)


Unnamed: 0,text,label
5110,Ok lor Sony ericsson salesman I ask shuhui the...,0
5111,Ard number like dat lor.,0
5112,Why dont you wait til at least wednesday to se...,0
5113,Huh y lei,0
5114,REMINDER FROM O number : To get number pounds ...,1
5115,This is the number nd time we have tried numbe...,1
5116,Will b going to esplanade fr home?,0
5117,Pity was in mood for that. So any other sugges...,0
5118,The guy did some bitching but I acted like id ...,0
5119,Rofl. Its true to its name,0


In [7]:
# Per-label summary after de-duplication (count and unique should now match)
df.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4511,4511,Go until jurong point crazy Available only in ...,1
1,609,609,Free entry in number a wkly comp to win FA Cup...,1


In [8]:
# Train, validate and test split: 80% of the rows go to train, and the
# remaining 20% is halved into validate and test (10% each).
# random_state=1 fixes the shuffle so this first version is repeatable.
train, remainder = train_test_split(df, train_size=0.8, random_state=1)
validate, test = train_test_split(remainder, train_size=0.5, random_state=1)

In [9]:
# Renumber every split from 0, discarding the row labels inherited from `df`
train = train.reset_index(drop=True)
validate = validate.reset_index(drop=True)
test = test.reset_index(drop=True)

In [10]:
# Write each split to its CSV under split-data/, without the index column
for path, frame in {
    'split-data//train.csv': train,
    'split-data//validate.csv': validate,
    'split-data//test.csv': test,
}.items():
    frame.to_csv(path, index=False)

#### Add the first split to DVC

In [11]:
# Register each split CSV (first version, random_state=1) with DVC
!dvc add split-data//train.csv
!dvc add split-data//validate.csv
!dvc add split-data//test.csv

\u280b Checking graph

\u280b Checking graph

\u280b Checking graph



In [12]:
# Check that the workspace files match what DVC has recorded
!dvc status

Data and pipelines are up to date.


#### Redo the split with a different random state

In [14]:
# Train, validate and test split: same 80% / 10% / 10% proportions as the
# first version, but with random_state=2 so rows land in different splits.
train, remainder = train_test_split(df, train_size=0.8, random_state=2)
validate, test = train_test_split(remainder, train_size=0.5, random_state=2)

In [15]:
# Renumber every split from 0, discarding the row labels inherited from `df`
train = train.reset_index(drop=True)
validate = validate.reset_index(drop=True)
test = test.reset_index(drop=True)

In [16]:
# Overwrite the split CSVs under split-data/ with the second version,
# again without the index column
for path, frame in {
    'split-data//train.csv': train,
    'split-data//validate.csv': validate,
    'split-data//test.csv': test,
}.items():
    frame.to_csv(path, index=False)

#### Add the second split to DVC

In [17]:
# Re-register the overwritten split CSVs (second version, random_state=2) with DVC
!dvc add split-data//train.csv
!dvc add split-data//validate.csv
!dvc add split-data//test.csv

\u280b Checking graph

\u280b Checking graph

\u280b Checking graph



In [18]:
# Check that the workspace files match what DVC has recorded
!dvc status

Data and pipelines are up to date.


#### Revert to previous version

In [22]:
# Move Git to the commit before the current one (detached HEAD), then ask DVC
# to bring the workspace data files in line with that commit.
# NOTE(review): HEAD~1 is relative, so re-running this cell steps back one
# more commit each time; the cell is not idempotent.
!git checkout HEAD~1
!dvc checkout

Note: switching to 'HEAD~1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 07be10f First version commit


#### Distribution of the label in the first version

In [27]:
# Report how many ham (0) and spam (1) rows each split of the first version has.
print("First version:\n")
for split_name in ["train", "validate", "test"]:
    # Use a dedicated name so the notebook-wide `df` (the full de-duplicated
    # dataset) is not silently replaced by the last split that was read.
    split_df = pd.read_csv(f'split-data//{split_name}.csv')
    label_counts = split_df['label'].value_counts()
    # .get(..., 0) keeps the report working if a split holds only one class.
    # The message names the split actually being reported.
    print(f'Distribution of (0,1) in {split_name} set: '
          f'({label_counts.get(0, 0)},{label_counts.get(1, 0)})')

First version:

Distribution of (0,1) in train set: (3611,485)
Distribution of (0,1) in train set: (446,66)
Distribution of (0,1) in train set: (454,58)


#### Stash local changes, then return to the main (second) commit — DVC had modified `.dvc/tmp/lock`, which caused Git to block the checkout

In [30]:
# Park local modifications so Git allows the branch switch, return to main,
# ask DVC to bring the data files in line with main, then try to re-apply the
# parked modifications.
# NOTE(review): in the recorded output `git stash pop` aborts ("Your local
# changes ... would be overwritten") because .dvc/tmp/lock had been modified
# again, presumably by `dvc checkout`, and the stash entry is left behind.
# Presumably .dvc/tmp/ should be git-ignored rather than stashed; confirm.
!git stash
!git checkout main
!dvc checkout
!git stash pop

Saved working directory and index state WIP on (no branch): 07be10f First version commit
Your branch is up to date with 'origin/main'.


Previous HEAD position was 07be10f First version commit
Switched to branch 'main'


On branch main

error: Your local changes to the following files would be overwritten by merge:
	AppliedML/Assignment 2/.dvc/tmp/lock
Please commit your changes or stash them before you merge.
Aborting



Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .dvc/tmp/lock

no changes added to commit (use "git add" and/or "git commit -a")
The stash entry is kept in case you need it again.


#### Distribution of the label in the second version

In [31]:
# Report how many ham (0) and spam (1) rows each split of the second version has.
print("Second version:\n")
for split_name in ["train", "validate", "test"]:
    # Use a dedicated name so the notebook-wide `df` (the full de-duplicated
    # dataset) is not silently replaced by the last split that was read.
    split_df = pd.read_csv(f'split-data//{split_name}.csv')
    label_counts = split_df['label'].value_counts()
    # .get(..., 0) keeps the report working if a split holds only one class.
    # The message names the split actually being reported.
    print(f'Distribution of (0,1) in {split_name} set: '
          f'({label_counts.get(0, 0)},{label_counts.get(1, 0)})')

Second version:

Distribution of (0,1) in train set: (3609,487)
Distribution of (0,1) in train set: (449,63)
Distribution of (0,1) in train set: (453,59)
