In [1]:
!dvc init -f

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

# Decouple compute and storage

In [3]:
!dvc remote add -d amlstorage gdrive://1zm71tPRrjEGLVeyPUIaxJ8ywhNaEtAk5

Setting 'amlstorage' as a default remote.
[0m

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download

# Download necessary NLTK datasets
download('punkt')
download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ujandasgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ujandasgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading Data

In [5]:
def load_data(file_path):
    return pd.read_csv(file_path)

# Preprocessing the Data

In [6]:
def preprocess_text(text):
    """
    Convert text to lowercase, remove non-alphabetic characters,
    and remove stopwords.

    Parameters:
    - text (str): The email text to preprocess.

    Returns:
    - str: The preprocessed email text.
    """
    # Convert text to lowercase
    text = text.lower()
    
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Re-join tokens into a single string
    filtered_text = ' '.join(filtered_tokens)
    
    return filtered_text

In [7]:
def preprocess_data(data):
    # Apply text preprocessing to the 'text' column 
    data['text'] = data['text'].apply(preprocess_text)
    return data

# Splitting the Data

In [8]:
def split_data(data, test_size=0.2, validation_size=0.25):
    # Splitting data into train and temp data (which will be further split into validation and test)
    train_data, temp_data = train_test_split(data, test_size=test_size, random_state=42)
    # Adjusting validation size based on the new size of temp_data
    validation_size_adjusted = validation_size / (1 - test_size)
    validation_data, test_data = train_test_split(temp_data, test_size=validation_size_adjusted, random_state=42)
    
    return train_data, validation_data, test_data

# Storing the Splits

In [10]:
def store_splits(train_data, validation_data, test_data, train_path='train.csv', validation_path='validation.csv', test_path='test.csv'):
    train_data.to_csv(train_path, index=False)
    validation_data.to_csv(validation_path, index=False)
    test_data.to_csv(test_path, index=False)

In [12]:
data = load_data('emails.csv')

In [15]:
data.to_csv('raw_data.csv', index=False)

In [16]:
preprocessed_data = preprocess_data(data)
train_data, validation_data, test_data = split_data(preprocessed_data)
store_splits(train_data, validation_data, test_data)

In [17]:
!dvc add raw_data.csv train.csv validation.csv test.csv

[?25l[32m⠋[0m Checking graph                                       core[39m>
  0% Adding...|                       | raw_data.csv |0/4 [00:00<?,     ?file/s]
![A
Collecting files and computing hashes in raw_data.csv |0.00 [00:00,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/Users/ujandasgupta/Desktop/Applied ML/assignment-2/.dvc[A
                                                                                [A
![A
  0%|          |Adding raw_data.csv to cache          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
 25% Adding...|█████▌                | train.csv |1/4 [00:00<00:01,  2.27file/s][A
![A
Collecting files and computing hashes in train.csv    |0.00 [00:00,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/Users/ujandasgupta/Desktop/

In [18]:
!git add .dvc/config

In [19]:
!git commit -m "Configuring Google Drive as DVC remote storage for AML"

[main (root-commit) fb5e502] Configuring Google Drive as DVC remote storage for AML
 3 files changed, 12 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore


In [20]:
!git add raw_data.csv.dvc train.csv.dvc validation.csv.dvc test.csv.dvc .gitignore

In [21]:
!git commit -m "Tracking data with DVC"

[main 0d90cfe] Tracking data with DVC
 5 files changed, 24 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 raw_data.csv.dvc
 create mode 100644 test.csv.dvc
 create mode 100644 train.csv.dvc
 create mode 100644 validation.csv.dvc


In [22]:
!dvc config core.autostage true

[0m

In [23]:
!dvc push

Collecting                                           |4.00 [00:00, 1.10kentry/s]
Pushing
![A
Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

Authentication successful.

                                                                                [A
![A
  0% Checking cache in '/Users/ujandasgupta/Desktop/Applied ML/assignment-2/.dvc[A
                                                                                [A
![A
  0%|          |Pushing to gdrive                     0/4 [00:00<?,     ?file/s][A
  0%|          |Pushing to gdrive                     0/4 [00:00<?,     ?file/s][A

![A[A

  0%|          |/Users/ujandasgupta/Desktop/Appl0.0

##  Update the splits with a different random seed

In [24]:
def split_data(data, test_size=0.2, validation_size=0.25):
    # Splitting data into train and temp data (which will be further split into validation and test)
    train_data, temp_data = train_test_split(data, test_size=test_size, random_state=25)
    # Adjusting validation size based on the new size of temp_data
    validation_size_adjusted = validation_size / (1 - test_size)
    validation_data, test_data = train_test_split(temp_data, test_size=validation_size_adjusted, random_state=42)
    
    return train_data, validation_data, test_data

## Save the updated splits

In [25]:
train_data, validation_data, test_data = split_data(preprocessed_data)
store_splits(train_data, validation_data, test_data)

### Add the updated datasets to DVC

In [26]:
!dvc add train.csv validation.csv test.csv

[?25l[32m⠋[0m Checking graph                                       core[39m>
  0% Adding...|                          | train.csv |0/3 [00:00<?,     ?file/s]
![A
Collecting files and computing hashes in train.csv    |0.00 [00:00,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/Users/ujandasgupta/Desktop/Applied ML/assignment-2/.dvc[A
                                                                                [A
![A
  0%|          |Adding train.csv to cache             0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /Users/ujandasgupta/Deskt0/1 [00:00<?,    ?files/s][A
  0% Adding...|                     | validation.csv |0/3 [00:00<?,     ?file/s][A
![A
Collecting files and computing hashes in validation.csv |0.00 [00:00,     ?file/[A
                                                         

In [27]:
!git add train.csv.dvc validation.csv.dvc test.csv.dvc


In [28]:
!git commit -m "Updating data splits with new random seed"

[main eacb60c] Updating data splits with new random seed
 3 files changed, 6 insertions(+), 6 deletions(-)


In [29]:
!dvc push

Collecting                                           |4.00 [00:00, 1.01kentry/s]
Pushing
![A
  0% Checking cache in '1zm71tPRrjEGLVeyPUIaxJ8ywhNaEtAk5/files/md5'| |0/? [00:0[A
  0% Querying cache in '1zm71tPRrjEGLVeyPUIaxJ8ywhNaEtAk5/files/md5'| |1/256 [00[A
                                                                                [A
![A
  0% Checking cache in '/Users/ujandasgupta/Desktop/Applied ML/assignment-2/.dvc[A
                                                                                [A
![A
  0%|          |Pushing to gdrive                     0/3 [00:00<?,     ?file/s][A
  0%|          |Pushing to gdrive                     0/3 [00:00<?,     ?file/s][A

![A[A

  0%|          |/Users/ujandasgupta/Desktop/Appl0.00/317k [00:00<?,        ?B/s][A[A

  3%|▎         |/Users/ujandasgupta/Desktop8.00k/317k [00:01<01:10,    4.47kB/s][A[A

                                                                                [A[A
 33%|███▎      |Pushing to gdrive  

In [30]:
!git log

[33mcommit eacb60c059e0635e0196f5f5fe81e5ac6129da1b[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m)[m
Author: Ujan Dasgupta <ujandasgupta@gmail.com>
Date:   Mon Feb 19 15:16:04 2024 +0530

    Updating data splits with new random seed

[33mcommit 0d90cfee9769685ae54d3b905ef235818dd8dd71[m
Author: Ujan Dasgupta <ujandasgupta@gmail.com>
Date:   Mon Feb 19 15:12:30 2024 +0530

    Tracking data with DVC

[33mcommit fb5e502842bcc0aaf32697cbe4c3486ecb3d6d86[m
Author: Ujan Dasgupta <ujandasgupta@gmail.com>
Date:   Mon Feb 19 15:11:52 2024 +0530

    Configuring Google Drive as DVC remote storage for AML


## <first_version_hash> = 0d90cfee9769685ae54d3b905ef235818dd8dd71

In [32]:
!git checkout 0d90cfee9769685ae54d3b905ef235818dd8dd71 train.csv.dvc validation.csv.dvc test.csv.dvc

Updated 3 paths from a6865c1


In [33]:
!dvc checkout

Building workspace index                              |4.00 [00:00,  475entry/s]
Comparing indexes                                    |5.00 [00:00, 6.92kentry/s]
Applying changes                                      |3.00 [00:00, 2.49kfile/s]
[33mM[0m       validation.csv
[33mM[0m       train.csv
[33mM[0m       test.csv
[0m

## Printing Distribution of the original data split

In [34]:
# List of data splits
splits = ['train', 'validation', 'test']

# Loop through each split and print the distribution
for split in splits:
    # Load the dataset from CSV file
    df = pd.read_csv(f'{split}.csv')
    # Get the distribution of the 'spam' column
    distribution = df['spam'].value_counts()
    # Print the distribution for the current split
    print(f"Distribution in {split}.csv: \n{distribution}\n")

Distribution in train.csv: 
spam
0    3504
1    1078
Name: count, dtype: int64

Distribution in validation.csv: 
spam
0    589
1    198
Name: count, dtype: int64

Distribution in test.csv: 
spam
0    267
1     92
Name: count, dtype: int64



## <updated_version_hash> = eacb60c059e0635e0196f5f5fe81e5ac6129da1b

In [35]:
!git checkout eacb60c059e0635e0196f5f5fe81e5ac6129da1b train.csv.dvc validation.csv.dvc test.csv.dvc

Updated 3 paths from 59b3e61


In [36]:
!dvc checkout

Building workspace index                              |4.00 [00:00,  456entry/s]
Comparing indexes                                    |5.00 [00:00, 7.40kentry/s]
Applying changes                                      |3.00 [00:00, 2.46kfile/s]
[33mM[0m       train.csv
[33mM[0m       test.csv
[33mM[0m       validation.csv
[0m

## Printing the distribution of the Updated data split

In [37]:
# List of data splits
splits = ['train', 'validation', 'test']

# Loop through each split and print the distribution
for split in splits:
    # Load the dataset from CSV file
    df = pd.read_csv(f'{split}.csv')
    # Get the distribution of the 'spam' column
    distribution = df['spam'].value_counts()
    # Print the distribution for the current split
    print(f"Distribution in {split}.csv: \n{distribution}\n")

Distribution in train.csv: 
spam
0    3495
1    1087
Name: count, dtype: int64

Distribution in validation.csv: 
spam
0    582
1    205
Name: count, dtype: int64

Distribution in test.csv: 
spam
0    283
1     76
Name: count, dtype: int64

