In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
df = pd.read_csv("C:/Users/Nagababu/Downloads/spamdata.csv.csv", encoding="latin-1")


In [55]:
# Keep only useful columns
# Renaming columns
df = df.rename(columns={'v1':'label', 'v2':'text'})
print(df)

     label                                               text  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   
...    ...                                                ...   
5567  spam  This is the 2nd time we have tried 2 contact u...   
5568   ham              Will Ì_ b going to esplanade fr home?   
5569   ham  Pity, * was in mood for that. So...any other s...   
5570   ham  The guy did some bitching but I acted like i'd...   
5571   ham                         Rofl. Its true to its name   

                                             clean_text  \
0     go until jurong point crazy available only in ...   
1                               ok lar joking wif u oni   
2     free entry in a wkly comp to win fa 

In [56]:
# 2. Handle Missing Values
# -------------------------------
df.dropna(inplace=True)
print(df)

     label                                               text  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   
...    ...                                                ...   
5567  spam  This is the 2nd time we have tried 2 contact u...   
5568   ham              Will Ì_ b going to esplanade fr home?   
5569   ham  Pity, * was in mood for that. So...any other s...   
5570   ham  The guy did some bitching but I acted like i'd...   
5571   ham                         Rofl. Its true to its name   

                                             clean_text  \
0     go until jurong point crazy available only in ...   
1                               ok lar joking wif u oni   
2     free entry in a wkly comp to win fa 

In [35]:
# 3. Text Cleaning Function
# -------------------------------
def clean_text(text):
    text = text.lower()  # lowercasing
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df['clean_text'] = df['text'].apply(clean_text)
print(df['clean_text'])

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in a wkly comp to win fa cup final ...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the nd time we have tried contact u u ...
5568                 will ì_ b going to esplanade fr home
5569    pity was in mood for that soany other suggestions
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: clean_text, Length: 5572, dtype: object


In [33]:
# 4. Tokenization
# -------------------------------
df['tokens'] = df['clean_text'].apply(word_tokenize)
print(df['tokens'])

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, a, wkly, comp, to, win, fa, ...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
                              ...                        
5567    [this, is, the, nd, time, we, have, tried, con...
5568        [will, ì_, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
Name: tokens, Length: 5572, dtype: object


In [33]:

def df():

# 4. Tokenization
     df['tokens'] = df['text'].apply(word_tokenize)

# 5. Initialize Stemmer
     stemmer = PorterStemmer()

# 6. Apply Stemming
     df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])

# 7. Print result
     print(df.head())
    

In [4]:
# 7. Lemmatization
# -------------------------------
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df['lemmatized'])


0       [Go, until, jurong, point, ,, crazy, .., Avail...
1                [Ok, lar, ..., Joking, wif, u, oni, ...]
2       [Free, entry, in, 2, a, wkly, comp, to, win, F...
3       [U, dun, say, so, early, hor, ..., U, c, alrea...
4       [Nah, I, do, n't, think, he, go, to, usf, ,, h...
                              ...                        
5567    [This, is, the, 2nd, time, we, have, tried, 2,...
5568     [Will, Ì_, b, going, to, esplanade, fr, home, ?]
5569    [Pity, ,, *, wa, in, mood, for, that, ., So, ....
5570    [The, guy, did, some, bitching, but, I, acted,...
5571                   [Rofl, ., Its, true, to, it, name]
Name: lemmatized, Length: 5572, dtype: object


In [5]:
# -------------------------------
# 8. Join Tokens Back to Text
# -------------------------------
df['final_text'] = df['lemmatized'].apply(lambda x: ' '.join(x))
print(df['final_text'])

0       Go until jurong point , crazy .. Available onl...
1                         Ok lar ... Joking wif u oni ...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor ... U c already then sa...
4       Nah I do n't think he go to usf , he life arou...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568               Will Ì_ b going to esplanade fr home ?
5569    Pity , * wa in mood for that . So ... any othe...
5570    The guy did some bitching but I acted like i '...
5571                           Rofl . Its true to it name
Name: final_text, Length: 5572, dtype: object


In [None]:
# -------------------------------
# 9. Bag of Words & TF-IDF
# -------------------------------
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
X_bow = cv.fit_transform(df['final_text'])

# TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['final_text'])

# -------------------------------
# Final Output
# -------------------------------
print("Shape of Bag of Words Matrix:", X_bow.shape)
print("Shape of TF-IDF Matrix:", X_tfidf.shape)


NameError: name 'CountVectorizer' is not defined

In [32]:

# Tokenization on 'message' column
df['tokens'] = df['message'].apply(lambda x: word_tokenize(str(x)))

# Show first 5 rows
print(df.head())

# Example tokenized message
print("Example tokenized message:", df['tokens'][0])


  label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                              tokens  \
0  [Go, until, jurong, point, ,, crazy, .., Avail...   
1           [Ok, lar, ..., Joking, wif, u, oni, ...]   
2  [Free, entry, in, 2, a, wkly, comp, to, win, F...   
3  [U, dun, say, so, early, hor, ..., U, c, alrea...   
4  [Nah, I, do, n't, think, he, goes, to, usf, ,,...   

                                          lemmatized  
0  [Go, until, jurong, point, ,, crazy, .., Avail...  
1           [Ok, lar, ..., Joking, wif, u, oni, ...]  
2  [Free, entry, in, 2, a, wkly, comp, to, win, F...  
3  [U, dun, say, so, early, hor, ..., U, c, alrea...  
4  [Nah, I, do,

In [2]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required resources (only first time)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv("spamdata.csv.csv", encoding="latin-1")

# Keep only the important columns (v1 = label, v2 = message)
df = df[['v1', 'v2']]
df.columns = ['label', 'message']  # rename columns

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove unwanted characters (URLs, numbers, punctuation, special chars)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # URLs
    text = re.sub(r"<.*?>", "", text)  # HTML tags
    text = re.sub(r"[^a-z\s]", "", text)  # keep only alphabets
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Apply stemming
    stemmed = [stemmer.stem(word) for word in tokens]
    
    # Apply lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    
    return lemmatized

# Apply preprocessing
df['processed_tokens'] = df['message'].apply(preprocess_text)

# Show results
print(df.head())

# Example: first processed message
print("Example processed tokens:", df['processed_tokens'][0])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nagababu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nagababu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nagababu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                    processed_tokens  
0  [go, jurong, point, crazi, avail, bugi, n, gre...  
1                       [ok, lar, joke, wif, u, oni]  
2  [free, entri, wkli, comp, win, fa, cup, final,...  
3      [u, dun, say, earli, hor, u, c, alreadi, say]  
4  [nah, dont, think, goe, usf, live, around, tho...  
Example processed tokens: ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']


In [38]:

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove unwanted characters (URLs, HTML tags, numbers, punctuation)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # URLs
    text = re.sub(r"<.*?>", "", text)  # HTML tags
    text = re.sub(r"[^a-z\s]", "", text)  # keep only alphabets
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Apply stemming
    stemmed = [stemmer.stem(word) for word in tokens]
    
    # Apply lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    
    return " ".join(lemmatized)  # return string (needed for vectorizers)

# Apply preprocessing
df['clean_text'] = df['message'].apply(preprocess_text)

# -----------------------------
# Bag of Words (Count Vectorizer)
# -----------------------------
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(df['clean_text'])

# -----------------------------
# TF-IDF Vectorization
# -----------------------------
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])

# Show shapes of matrices
print("Bag of Words shape:", X_bow.shape)
print("TF-IDF shape:", X_tfidf.shape)

# Example features
print("Sample BoW features:", count_vectorizer.get_feature_names_out()[:20])
print("Sample TF-IDF features:", tfidf_vectorizer.get_feature_names_out()[:20])


Bag of Words shape: (5572, 6963)
TF-IDF shape: (5572, 6963)
Sample BoW features: ['aa' 'aah' 'aaniy' 'aaooooright' 'aathilov' 'aathiwher' 'ab' 'abbey'
 'abdomen' 'abeg' 'abel' 'aberdeen' 'abi' 'abil' 'abiola' 'abj' 'abl'
 'abnorm' 'abouta' 'abroad']
Sample TF-IDF features: ['aa' 'aah' 'aaniy' 'aaooooright' 'aathilov' 'aathiwher' 'ab' 'abbey'
 'abdomen' 'abeg' 'abel' 'aberdeen' 'abi' 'abil' 'abiola' 'abj' 'abl'
 'abnorm' 'abouta' 'abroad']


In [3]:

# Tokenization on 'message' column
df['tokens'] = df['message'].apply(lambda x: word_tokenize(str(x)))

# Show first 5 rows
print(df.head())

# Example tokenized message
print("Example tokenized message:", df['tokens'][0])


  label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                    processed_tokens  \
0  [go, jurong, point, crazi, avail, bugi, n, gre...   
1                       [ok, lar, joke, wif, u, oni]   
2  [free, entri, wkli, comp, win, fa, cup, final,...   
3      [u, dun, say, earli, hor, u, c, alreadi, say]   
4  [nah, dont, think, goe, usf, live, around, tho...   

                                              tokens  
0  [Go, until, jurong, point, ,, crazy, .., Avail...  
1           [Ok, lar, ..., Joking, wif, u, oni, ...]  
2  [Free, entry, in, 2, a, wkly, comp, to, win, F...  
3  [U, dun, say, so, early, hor, ..., U, c, alrea...  
4  [Nah, I, do,