In [1]:
# Import necessary libraries
import csv

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


In [2]:

# Fetch the NLTK data packages used by this notebook
for _package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_package)
# Kept as the final bare expression so the cell still echoes this call's return value
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load the dataset
# NOTE(review): absolute, machine-specific path -- adjust it to where the file lives locally.
file_path = "C:/Users/DELL/Desktop/sms+spam+collection/SMSSpamCollection"  # Path to your uploaded file

# The file is read as tab-separated text with no header row: column 0 becomes
# 'label' and column 1 becomes 'message'.
# Quote processing is switched off because the messages are free text, not
# CSV-quoted fields: with the default setting, a message containing an unbalanced
# double quote makes the parser treat the following lines as part of the same
# field and silently merge rows.
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [4]:
# Display the first few rows of the dataset.
# The cell ends on the bare expression (instead of print) so the notebook renders
# the DataFrame as a formatted table rather than plain text.
data.head()


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:

# Preprocessing functions

# Cache for the NLTK helpers. It is filled on the first call to preprocess_text so
# that the stop-word list is loaded (and the stemmer/lemmatizer constructed) once,
# not once per message.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop_words, stemmer, lemmatizer) triple, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return (
        _PREPROCESS_CACHE['stop_words'],
        _PREPROCESS_CACHE['stemmer'],
        _PREPROCESS_CACHE['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize one message into a space-separated string of processed tokens.

    Steps, in order: lowercase, tokenize with ``word_tokenize``, drop English
    stop words, Porter-stem each remaining token, then pass the stemmed token
    through the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example ``None`` or a NaN
        coming from a missing field) is treated as an empty message.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for non-string input.
    """
    # A missing value has no .lower(); treat it as an empty document instead of crashing.
    if not isinstance(text, str):
        return ""

    stop_words, stemmer, lemmatizer = _preprocess_resources()

    # Lowercase first so the comparison against the stop-word list matches.
    tokens = word_tokenize(text.lower())

    # Stop-word filtering, stemming and lemmatization in one pass. The lemmatizer
    # runs on the already-stemmed token, the same order as the step-by-step version.
    processed = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in tokens
        if word not in stop_words
    ]

    # Join tokens back into a single string
    return " ".join(processed)


In [7]:

# Run every raw message through preprocess_text and store the result in a new
# 'cleaned_message' column next to the original text
data['cleaned_message'] = data['message'].map(preprocess_text)



In [8]:
# Preview the first five rows, now including the 'cleaned_message' column
data.iloc[:5]

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point , crazi .. avail bugi n great ..."
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah n't think goe usf , live around though"


In [10]:
# 1. Bag of Words (BOW)
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(data['cleaned_message'])

# Wrap the sparse count matrix in a sparse-backed DataFrame rather than calling
# .toarray(): a dense copy stores every cell of the documents x vocabulary matrix
# explicitly, including all the zeros.
X_bow_df = pd.DataFrame.sparse.from_spmatrix(
    X_bow,
    columns=bow_vectorizer.get_feature_names_out(),
)

# Display the first few rows of the resulting feature matrix
print("Bag of Words (BOW) Features:")
print(X_bow_df.head())





Bag of Words (BOW) Features:
   00  000  000pe  008704050406  0089  0121  01223585236  01223585334  \
0   0    0      0             0     0     0            0            0   
1   0    0      0             0     0     0            0            0   
2   0    0      0             0     0     0            0            0   
3   0    0      0             0     0     0            0            0   
4   0    0      0             0     0     0            0            0   

   0125698789  02  ...  zhong  zindgi  zoe  zogtoriu  zoom  zouk  zyada  èn  \
0           0   0  ...      0       0    0         0     0     0      0   0   
1           0   0  ...      0       0    0         0     0     0      0   0   
2           0   0  ...      0       0    0         0     0     0      0   0   
3           0   0  ...      0       0    0         0     0     0      0   0   
4           0   0  ...      0       0    0         0     0     0      0   0   

   ú1  〨ud  
0   0    0  
1   0    0  
2   0    0  
3   0

In [12]:

# 2. TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['cleaned_message'])

# Wrap the sparse TF-IDF matrix in a sparse-backed DataFrame rather than calling
# .toarray(): a dense copy stores every cell of the documents x vocabulary matrix
# explicitly, including all the zeros.
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Display the first few rows of the resulting feature matrix
print("\nTF-IDF Features:")
print(X_tfidf_df.head())



TF-IDF Features:
    00  000  000pe  008704050406  0089  0121  01223585236  01223585334  \
0  0.0  0.0    0.0           0.0   0.0   0.0          0.0          0.0   
1  0.0  0.0    0.0           0.0   0.0   0.0          0.0          0.0   
2  0.0  0.0    0.0           0.0   0.0   0.0          0.0          0.0   
3  0.0  0.0    0.0           0.0   0.0   0.0          0.0          0.0   
4  0.0  0.0    0.0           0.0   0.0   0.0          0.0          0.0   

   0125698789   02  ...  zhong  zindgi  zoe  zogtoriu  zoom  zouk  zyada   èn  \
0         0.0  0.0  ...    0.0     0.0  0.0       0.0   0.0   0.0    0.0  0.0   
1         0.0  0.0  ...    0.0     0.0  0.0       0.0   0.0   0.0    0.0  0.0   
2         0.0  0.0  ...    0.0     0.0  0.0       0.0   0.0   0.0    0.0  0.0   
3         0.0  0.0  ...    0.0     0.0  0.0       0.0   0.0   0.0    0.0  0.0   
4         0.0  0.0  ...    0.0     0.0  0.0       0.0   0.0   0.0    0.0  0.0   

    ú1  〨ud  
0  0.0  0.0  
1  0.0  0.0  
2  0.0  

In [13]:
# 3. N-grams (ngram_range=(1, 3): uni-grams, bi-grams and tri-grams)
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))
X_ngram = ngram_vectorizer.fit_transform(data['cleaned_message'])

# Wrap the sparse count matrix in a sparse-backed DataFrame rather than calling
# .toarray(): with 1- to 3-grams there is one column per distinct n-gram, and a
# dense copy would store every cell of that matrix explicitly, including all the zeros.
X_ngram_df = pd.DataFrame.sparse.from_spmatrix(
    X_ngram,
    columns=ngram_vectorizer.get_feature_names_out(),
)

# Display the first few rows of the resulting feature matrix
print("\nN-grams Features:")
print(X_ngram_df.head())

          

                                         


N-grams Features:
   00  00 easter  00 easter prize  00 per  00 sub  00 sub 16  000  000 bonu  \
0   0          0                0       0       0          0    0         0   
1   0          0                0       0       0          0    0         0   
2   0          0                0       0       0          0    0         0   
3   0          0                0       0       0          0    0         0   
4   0          0                0       0       0          0    0         0   

   000 bonu caller  000 cash  ...  zouk nichol paris  zyada  zyada kisi  \
0                0         0  ...                  0      0           0   
1                0         0  ...                  0      0           0   
2                0         0  ...                  0      0           0   
3                0         0  ...                  0      0           0   
4                0         0  ...                  0      0           0   

   zyada kisi ko  èn  ú1  ú1 20  ú1 20 poboxox36504w45w