## Import required packages

In [40]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset, load_from_disk, load_dataset_builder, get_dataset_split_names, ClassLabel,Dataset,DatasetDict
from sklearn.model_selection import train_test_split

## Login to Hugging Face Hub
Log in so that the prepared dataset can be pushed to your Hugging Face Hub account.

In [2]:
# Show the Hugging Face login widget so this session is authenticated before pushing to the Hub
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load and Explore dataset

In [7]:
# Fetch the spam-detection dataset from the Hugging Face Hub
dataset_name = "Deysi/spam-detection-dataset"
dataset = load_dataset(dataset_name)

# Persist a local copy, then read that copy back from the same directory
local_dataset_dir = './datasets/spam-detection-dataset'
dataset.save_to_disk(local_dataset_dir)
dataset_loaded = load_from_disk(local_dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/8175 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2725 [00:00<?, ? examples/s]

In [9]:
# Get the dataset builder, which exposes the dataset's metadata via `.info`
ds_builder = load_dataset_builder(dataset_name)

In [10]:
# Inspect the dataset description from the builder metadata (an empty string for this dataset)
ds_builder.info.description

''

In [11]:
# Inspect the feature schema: both 'text' and 'label' are reported as plain string values
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None)}

In [12]:
# List the split names available for this dataset ('train' and 'test')
get_dataset_split_names(dataset_name)

['train', 'test']

In [13]:
# Display the DatasetDict summary: splits, column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8175
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2725
    })
})

In [14]:
# Show the feature type of the label column (a plain string Value at this stage)
dataset["train"].features["label"]

Value(dtype='string', id=None)

In [15]:
# Show the text of the first three training examples
dataset['train'][:3]['text']

['hey I am looking for Xray baggage datasets can you provide me with the same ',
 '"Get rich quick! Make millions in just days with our new and revolutionary system! Don\'t miss out on this amazing opportunity!"\n\n',
 "URGENT MESSAGE: YOU WON'T BELIEVE WHAT WE HAVE TO OFFER!!!\n\nHey you! Yeah, you with the eyes reading this right now. Do you want to be the coolest cat on the block? Do you want to get all the likes, hearts and emojis? Do you want to be ~*POPULAR*~? Well, we have the solution for you.\n\nIntroducing our brand new feature that will blow your mind and your feed! We can't give away too many details, but let's just say it involves puppies, unicorns, and a drum kit.\n\nBut wait, there's more! Sign up"]

In [16]:
# Show the labels of the same first three training examples
dataset['train'][:3]['label']

['not_spam', 'spam', 'spam']

### Transform dataset to dataframe

In [17]:
# Convert each split to a pandas DataFrame.
# `to_pandas()` returns a new frame and leaves `dataset` untouched, whereas
# `dataset.set_format(type="pandas")` switches the output format of the shared
# `dataset` object, so earlier cells that index it would return pandas objects
# instead of Python lists when re-run.
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()
print(df_train.shape)
print(df_train.head())
print(df_test.shape)
print(df_test.head())

(8175, 2)
                                                text     label
0  hey I am looking for Xray baggage datasets can...  not_spam
1  "Get rich quick! Make millions in just days wi...      spam
2  URGENT MESSAGE: YOU WON'T BELIEVE WHAT WE HAVE...      spam
3  [Google AI Blog: Contributing Data to Deepfake...  not_spam
4  Trying to see if anyone already has timestamps...  not_spam
(2725, 2)
                                                text     label
0   Deezer.com 10,406,168 Artist DB\n\nWe have sc...  not_spam
1  🚨 ATTENTION ALL USERS! 🚨\n\n🆘 Are you looking ...      spam
2  I'm working on a stats project to test some of...  not_spam
3  [[Sorry, I cannot generate inappropriate or sp...      spam
4  L@@k at these Unbelievable diet pills that can...      spam


In [18]:
# Keep the original string label in 'label_name' so it stays available
# after 'label' is converted to integer ids.
df_train['label_name'] = df_train['label']
df_train.head()

Unnamed: 0,text,label,label_name
0,hey I am looking for Xray baggage datasets can...,not_spam,not_spam
1,"""Get rich quick! Make millions in just days wi...",spam,spam
2,URGENT MESSAGE: YOU WON'T BELIEVE WHAT WE HAVE...,spam,spam
3,[Google AI Blog: Contributing Data to Deepfake...,not_spam,not_spam
4,Trying to see if anyone already has timestamps...,not_spam,not_spam


In [19]:
# Keep the original string label in 'label_name' so it stays available
# after 'label' is converted to integer ids.
df_test['label_name'] = df_test['label']
df_test.head()

Unnamed: 0,text,label,label_name
0,"Deezer.com 10,406,168 Artist DB\n\nWe have sc...",not_spam,not_spam
1,🚨 ATTENTION ALL USERS! 🚨\n\n🆘 Are you looking ...,spam,spam
2,I'm working on a stats project to test some of...,not_spam,not_spam
3,"[[Sorry, I cannot generate inappropriate or sp...",spam,spam
4,L@@k at these Unbelievable diet pills that can...,spam,spam


In [20]:
# Mapping from string label to integer class id
label_to_id = {'not_spam': 0, 'spam': 1}

# Derive the integer 'label' from the preserved string column 'label_name'
# rather than from 'label' itself: the cell can then be re-run safely, because
# its input column is never overwritten. An unknown label still raises KeyError.
df_train['label'] = df_train['label_name'].map(lambda name: label_to_id[name])
df_test['label'] = df_test['label_name'].map(lambda name: label_to_id[name])

In [21]:
# Check the training frame: 'label' now holds integer ids, 'label_name' the original strings
df_train.head()

Unnamed: 0,text,label,label_name
0,hey I am looking for Xray baggage datasets can...,0,not_spam
1,"""Get rich quick! Make millions in just days wi...",1,spam
2,URGENT MESSAGE: YOU WON'T BELIEVE WHAT WE HAVE...,1,spam
3,[Google AI Blog: Contributing Data to Deepfake...,0,not_spam
4,Trying to see if anyone already has timestamps...,0,not_spam


In [22]:
# Check the test frame: 'label' now holds integer ids, 'label_name' the original strings
df_test.head()

Unnamed: 0,text,label,label_name
0,"Deezer.com 10,406,168 Artist DB\n\nWe have sc...",0,not_spam
1,🚨 ATTENTION ALL USERS! 🚨\n\n🆘 Are you looking ...,1,spam
2,I'm working on a stats project to test some of...,0,not_spam
3,"[[Sorry, I cannot generate inappropriate or sp...",1,spam
4,L@@k at these Unbelievable diet pills that can...,1,spam


## Create validation dataset
Hold out 50% of the original test split as a validation set; the remaining 50% becomes the new test set.

In [23]:
# Split df_test in half: the first returned part becomes the validation set and the
# second the new test set. random_state=42 fixes the shuffle; no `stratify` argument
# is passed, so the split is not stratified by label.
df_validation, df_test_new = train_test_split(df_test, test_size=0.5, random_state=42)

In [24]:
# Peek at the validation rows (the index labels shown are no longer sequential after the split)
df_validation.head()

Unnamed: 0,text,label,label_name
2509,I was looking for images of diagnosis report w...,0,not_spam
2291,I know there aren't many cases but would like ...,0,not_spam
1112,I'm looking for data to see the share democrat...,0,not_spam
465,Get rich quick!!🤑🤑🤑 \n\nOur network is the BES...,1,spam
1223,I really like the way Craigslist does cities a...,0,not_spam


In [25]:
# Size of the validation frame (rows, columns)
df_validation.shape

(1362, 3)

In [26]:
# Size of the new test frame (rows, columns)
df_test_new.shape

(1363, 3)

## Convert the train, validation, and test DataFrames to Dataset objects

In [28]:
def _frame_to_dataset(frame):
    """Build a Dataset from the 'text' and 'label' columns of `frame`.

    Any other column (such as 'label_name') is left out of the result.
    """
    return Dataset.from_dict(
        {column: list(frame[column]) for column in ('text', 'label')}
    )


train_ds = _frame_to_dataset(df_train)
test_ds = _frame_to_dataset(df_test_new)
validation_ds = _frame_to_dataset(df_validation)

In [29]:
# Summary of the training Dataset: column names and row count
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 8175
})

In [30]:
# Summary of the test Dataset: column names and row count
test_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 1363
})

In [31]:
# Summary of the validation Dataset: column names and row count
validation_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 1362
})

In [32]:
# The label feature is a plain int64 Value at this point (not yet a ClassLabel)
train_ds.features["label"]

Value(dtype='int64', id=None)

## Create ClassLabel for Dataset object

In [33]:
# Define the ClassLabel feature; the position of each name is its integer id
# (0 -> 'not_spam', 1 -> 'spam'). The class count is taken from `names`.
labels = ['not_spam', 'spam']
ClassLabels = ClassLabel(names=labels)

In [34]:
# Cast the integer 'label' column of every split to the ClassLabel feature,
# processing the splits in the order train, validation, test.
train_ds, validation_ds, test_ds = [
    split.cast_column('label', ClassLabels)
    for split in (train_ds, validation_ds, test_ds)
]

Casting the dataset:   0%|          | 0/8175 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1362 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1363 [00:00<?, ? examples/s]

In [35]:
# Summary of the training Dataset after the cast (same columns and row count)
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 8175
})

In [36]:
# Class names now attached to the training label feature
train_ds.features['label'].names

['not_spam', 'spam']

In [37]:
# Individual labels are still returned as integer ids (0 for the first row)
train_ds[0]['label']

0

In [38]:
# Class names attached to the validation label feature
validation_ds.features['label'].names

['not_spam', 'spam']

In [39]:
# Class names attached to the test label feature
test_ds.features['label'].names

['not_spam', 'spam']

## Combine the splits into a DatasetDict

In [41]:
# Bundle the three splits into a single DatasetDict keyed by split name
splits = {
    "train": train_ds,
    "validation": validation_ds,
    "test": test_ds,
}
new_spam_dataset = DatasetDict(splits)

In [42]:
# Summary of the combined DatasetDict: train, validation and test splits with row counts
new_spam_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8175
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1362
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1363
    })
})

In [43]:
# Confirm that the training label feature in the combined dataset is the ClassLabel
new_spam_dataset['train'].features['label']

ClassLabel(names=['not_spam', 'spam'], id=None)

## Push dataset to hub

In [101]:
# Upload all splits to the Hub repository below; the call is kept as the cell's
# last expression so the returned commit info is displayed.
hub_repo_id = "tanquangduong/spam-detection-dataset-splits"
new_spam_dataset.push_to_hub(hub_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/tanquangduong/spam-detection-dataset-splits/commit/48af8e87c005142a5a7bf03ea1cdf9bb8ee89add', commit_message='Upload dataset', commit_description='', oid='48af8e87c005142a5a7bf03ea1cdf9bb8ee89add', pr_url=None, pr_revision=None, pr_num=None)