<a href="https://colab.research.google.com/github/vibekrana/SPAM/blob/main/Spam_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries from the NLTK toolkit
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Downloading required NLTK datasets
nltk.download('punkt')  # Downloads tokenizer models for sentence and word tokenization
nltk.download('punkt_tab')  # Optional: Additional support for tokenization
nltk.download('stopwords')  # Downloads predefined stopword lists for various languages

# Downloading the dataset from Kaggle using Kaggle CLI
# Here, we download a dataset containing spam emails
!kaggle datasets download -d abdallahwagih/spam-emails  # Downloads the spam emails dataset
!unzip spam-emails.zip  # Extracts the downloaded dataset

# Importing pandas for data manipulation
import pandas as pd

# Loading the dataset into a pandas DataFrame
# The CSV file contains columns like 'Message' (email text) and 'Label' (spam/not spam indicator)
df = pd.read_csv("spam.csv")

# Step to clean the text data
# - Removing punctuation, special characters, and multiple spaces
# - Preparing the data for tokenization and further processing
import re  # Regular expressions module for text cleaning

cleaned = []  # List to store cleaned text
for text in df['Message']:  # Iterating over each message in the 'Message' column
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation and special characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replacing multiple spaces with a single space
    cleaned_data = cleaned_text.strip()  # Removing leading and trailing spaces
    cleaned.append(cleaned_data)  # Appending the cleaned text to the list

# Tokenizing the cleaned text into words
# Each cleaned text is split into a list of individual words for further analysis
tokens = [word_tokenize(x) for x in cleaned]

# Removing stopwords from the tokenized words
# Stopwords are common words (e.g., "the", "is") that do not contribute much to analysis
stop = set(stopwords.words('english'))  # Fetching the list of English stopwords
stop_token = []  # List to store tokens after removing stopwords
for k in range(len(df['Message'])):  # Iterating through tokenized text
    p = [i for i in tokens[k] if i not in stop]  # Filtering out stopwords
    stop_token.append(p)  # Adding filtered tokens to the list

# Applying stemming to reduce words to their root form
# This helps group similar words (e.g., "running", "runner" -> "run")
from nltk.stem import PorterStemmer
ps = PorterStemmer()  # Initializing the Porter Stemmer
stemedata = []  # List to store stemmed data
for message in stop_token:  # Iterating over tokens after stopword removal
    st = [ps.stem(word) for word in message]  # Applying stemming to each word
    stemedata.append(st)  # Adding the stemmed tokens to the list

# Summary of steps:
# 1. Necessary libraries and NLTK datasets are imported/downloaded.
# 2. A spam email dataset is downloaded from Kaggle and loaded into a pandas DataFrame.
# 3. The 'Message' column is cleaned by removing punctuation, special characters, and extra spaces.
# 4. The cleaned text is tokenized into individual words.
# 5. Stopwords are removed from the tokenized words to reduce noise.
# 6. Stemming is applied to group similar words to their root form.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
Downloading spam-emails.zip to /content
  0% 0.00/207k [00:00<?, ?B/s]
100% 207k/207k [00:00<00:00, 11.2MB/s]
Archive:  spam-emails.zip
  inflating: spam.csv                


In [6]:
#apply pos tags on stop_token
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger_eng') #package which will help in doing pos tagging
pos_tokens= [pos_tag(message) for message in stop_token]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [7]:
pos_tokens

[[('Go', 'VB'),
  ('jurong', 'JJ'),
  ('point', 'NN'),
  ('crazy', 'NN'),
  ('Available', 'NNP'),
  ('bugis', 'NN'),
  ('n', 'RB'),
  ('great', 'JJ'),
  ('world', 'NN'),
  ('la', 'NN'),
  ('e', 'VBP'),
  ('buffet', 'JJ'),
  ('Cine', 'NNP'),
  ('got', 'VBD'),
  ('amore', 'RB'),
  ('wat', 'JJ')],
 [('Ok', 'NNP'),
  ('lar', 'JJ'),
  ('Joking', 'NNP'),
  ('wif', 'NN'),
  ('u', 'NN'),
  ('oni', 'NN')],
 [('Free', 'JJ'),
  ('entry', 'NN'),
  ('2', 'CD'),
  ('wkly', 'JJ'),
  ('comp', 'NN'),
  ('win', 'VBP'),
  ('FA', 'NNP'),
  ('Cup', 'NNP'),
  ('final', 'JJ'),
  ('tkts', 'NN'),
  ('21st', 'CD'),
  ('May', 'NNP'),
  ('2005', 'CD'),
  ('Text', 'NNP'),
  ('FA', 'NNP'),
  ('87121', 'CD'),
  ('receive', 'JJ'),
  ('entry', 'NN'),
  ('questionstd', 'NN'),
  ('txt', 'NN'),
  ('rateTCs', 'NN'),
  ('apply', 'VBP'),
  ('08452810075over18s', 'CD')],
 [('U', 'JJ'),
  ('dun', 'NNS'),
  ('say', 'VBP'),
  ('early', 'JJ'),
  ('hor', 'NN'),
  ('U', 'NNP'),
  ('c', 'NN'),
  ('already', 'RB'),
  ('say', 'VB')],

In [8]:
#now lets apply lemmeatization on the data
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
lm = WordNetLemmatizer()

In [18]:
pos_tokens[0][0][0] #first 0 for message second is for a single word third is for getting word from pos tag

'Go'

In [19]:
lm.lemmatize(pos_tokens[0][0][0],'v')

'Go'

In [12]:
# Function to convert nltk's POS tags to WordNet's POS tags
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ  # Adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # Verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # Noun
    elif tag.startswith('R'):
        return wordnet.ADV  # Adverb
    else:
        return wordnet.NOUN  # Default to noun if

In [13]:
get_wordnet_pos('VB')

'v'

In [20]:
pos_tokens[0][0][1]

'VB'

In [21]:
get_wordnet_pos(pos_tokens[0][0][1]) #replace this code with 'v'

'v'

In [23]:
#lemmetizing data using for loop as x as message and y as singe word
empty = []
for x in range(len(pos_tokens)):
  lem = [lm.lemmatize(pos_tokens[x][y][0],get_wordnet_pos(pos_tokens[x][y][1])) for y in range(len(pos_tokens[x]))] #x is the message and y is one single word
  empty.append(lem)


In [33]:
pos_tokens[9]

[('Had', 'NNP'),
 ('mobile', 'CC'),
 ('11', 'CD'),
 ('months', 'NNS'),
 ('U', 'NNP'),
 ('R', 'NNP'),
 ('entitled', 'VBD'),
 ('Update', 'NNP'),
 ('latest', 'JJS'),
 ('colour', 'NN'),
 ('mobiles', 'NNS'),
 ('camera', 'VBP'),
 ('Free', 'JJ'),
 ('Call', 'PDT'),
 ('The', 'DT'),
 ('Mobile', 'NNP'),
 ('Update', 'NNP'),
 ('Co', 'NNP'),
 ('FREE', 'NNP'),
 ('08002986030', 'CD')]

In [32]:
empty[9]

['Had',
 'mobile',
 '11',
 'month',
 'U',
 'R',
 'entitle',
 'Update',
 'late',
 'colour',
 'mobile',
 'camera',
 'Free',
 'Call',
 'The',
 'Mobile',
 'Update',
 'Co',
 'FREE',
 '08002986030']