In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('./Dataset/spam.csv', encoding='latin1')
df2 = pd.read_csv('./Dataset/spam_dataset.csv', encoding='latin1')


In [3]:
df1.head()

Unnamed: 0,label,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df1.shape

(5572, 5)

In [5]:
df2.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
df2.shape

(5171, 4)

We have got 2 dataset of shape (5572,5) and (5171, 4):
- we will keep only the relevent column in both dataset which will be text and label and will drop rest columns.
- preprocess text: lower, stopword remove, steming, only alpha-numeric character.

In [7]:
# Droping irrelevent columns
df1.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df2.drop(columns=['Unnamed: 0', 'label_num'], inplace=True)

In [9]:
df1.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df2.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [14]:
df1.isnull().sum()
df2.isnull().sum()

label    0
text     0
dtype: int64

# Combining both dataset

In [18]:
combined_dataset = pd.concat([df1, df2], ignore_index=True)

In [19]:
combined_dataset.shape

(10743, 2)

In [20]:
combined_dataset.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocessing

In [16]:
from nltk.corpus import stopwords
import string
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [21]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
            
    text = " ".join(y)

    return text.replace("subject ", "")

In [22]:
combined_dataset['text'] = combined_dataset['text'].apply(transform_text)

# Map labels to numerical values

In [23]:

combined_dataset['label'] = combined_dataset['label'].map({'ham': 0, 'spam': 1})

In [24]:
combined_dataset.head(5)

Unnamed: 0,label,text
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


In [26]:
combined_dataset.dropna(subset=['label'], inplace=True)

In [27]:
combined_dataset.shape

(10743, 2)

# Building model and training

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [30]:
# Split data into features and labels
X = combined_dataset['text']
y = combined_dataset['label']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [32]:
# Train a Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [33]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Display evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9571893904141461
Precision: 0.9380733944954128
Recall: 0.8628691983122363
F1 Score: 0.8989010989010988


# Saving model and vectorizer for future use

In [34]:
import pickle
pickle.dump(vectorizer,open('./model/vectorizer.pkl','wb'))
pickle.dump(model,open('./model/model.pkl','wb'))

In [36]:
vectorizer = pickle.load(open('./model/vectorizer.pkl','rb'))
model = pickle.load(open('./model/model.pkl','rb'))

# Function to classify user input text
def classify_text(user_text):
    # Preprocess the input text
    processed_text = transform_text(user_text)
    
    # Vectorize the input text
    vectorized_text = vectorizer.transform([processed_text])
    
    # Predict the label
    prediction = model.predict(vectorized_text)
    
    # Return the result
    return 'spam' if prediction[0] == 1 else 'General (Non-Spam)'



In [37]:
# Example usage with user input
user_input = input("Enter a message to classify: ")
result = classify_text(user_input)
print(f'The message is classified as: {result}')

The message is classified as: General (Non-Spam)
