In [27]:
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
from tqdm import tqdm

# Load and preprocess the messages
messages = pd.read_csv('spam.csv', skiprows=1, names=['Label', 'Message'], usecols=[0, 1], encoding='ISO-8859-1')

# Initialize lemmatizer
lemma = WordNetLemmatizer()

# Preprocess corpus
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['Message'][i])  # Remove non-alphabet characters
    review = review.lower()  # Convert to lowercase
    review = review.split()  # Split the sentence into words
    review = [lemma.lemmatize(word) for word in review]  # Lemmatize words
    review = ' '.join(review)  # Join words back into a sentence
    corpus.append(review)

# Tokenize sentences and words
words = []
sent_tokens = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    sent_tokens.append(sent_token)
    for sent in sent_token:
        words.append(simple_preprocess(sent))  # Tokenize and preprocess words

# Train Word2Vec model on the words
model = gensim.models.Word2Vec(words, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec_model.pkl")


# Define a function to compute the average Word2Vec for a document
def avg_word2vec(doc):
    word_vectors = np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
    if np.any(np.isnan(word_vectors)):  # Check if there are any valid word vectors
        return np.zeros(100)
    else:
        return word_vectors  # Return a valid vector of size 100 

    
# Initialize the feature matrix and label list
X = []
y = []  # To store the labels corresponding to each feature vector

# Generate feature vectors and store labels
for i in range(len(words)):
    feature_vector = avg_word2vec(words[i])
    X.append(feature_vector)
    y.append(messages['Label'][i])  # Store the corresponding label (ham/spam)

# Convert X to a NumPy array
X = np.vstack(X)  # Stack the valid feature vectors into a 2D array
# Convert y to a NumPy array or a Pandas Series
y = np.array(y)
# Chnanging ham/spam to 0s and 1s
y=pd.get_dummies(y)
y=y.iloc[:,0].values
y=y.astype(int)

## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)

from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

print(classification_report(y_test,y_pred))





  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0.8429084380610413
              precision    recall  f1-score   support

           0       0.19      0.05      0.07       153
           1       0.86      0.97      0.91       961

    accuracy                           0.84      1114
   macro avg       0.53      0.51      0.49      1114
weighted avg       0.77      0.84      0.80      1114



In [28]:
messages['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [29]:
# Function to preprocess a single message
def preprocess_message(message, model):
    # Preprocess the text
    review = re.sub('[^a-zA-Z]', ' ', message)  # Remove non-alphabet characters
    review = review.lower()  # Convert to lowercase
    review = review.split()  # Split into words
    review = [lemma.lemmatize(word) for word in review]  # Lemmatize words
    
    # Tokenize and preprocess the sentence
    tokens = simple_preprocess(' '.join(review))
    
    # Compute the average Word2Vec vector
    feature_vector = avg_word2vec(tokens)
    return feature_vector

# Example: Load unseen data
unseen_messages = ["ham,Ok lar... Joking wif u oni", "Nah I don't think he goes to usf, he lives around here though", "Your account balance is low.","HI How are you"]

# Preprocess and compute feature vectors for unseen data
unseen_features = []
for message in unseen_messages:
    unseen_features.append(preprocess_message(message, model))

unseen_features = np.vstack(unseen_features)  # Convert to a 2D NumPy array

# Predict labels for unseen data
unseen_predictions = classifier.predict(unseen_features)

# Map predictions back to 'ham' or 'spam'
label_map = {0: "ham", 1: "spam"}
unseen_labels = [label_map[pred] for pred in unseen_predictions]

# Print predictions
for i, message in enumerate(unseen_messages):
    print(f"Message: '{message}' -> Predicted Label: {unseen_labels[i]}")


Message: 'ham,Ok lar... Joking wif u oni' -> Predicted Label: spam
Message: 'Nah I don't think he goes to usf, he lives around here though' -> Predicted Label: spam
Message: 'Your account balance is low.' -> Predicted Label: spam
Message: 'HI How are you' -> Predicted Label: spam


In [30]:
##Implementing with xgboost for better performance

In [33]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Assuming `X` and `y` are prepared

# Oversample class 0 using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the XGBoost model with tuned hyperparameters
xgb_classifier = XGBClassifier(
    scale_pos_weight=5,  # Manually set for better class 0 handling
    max_depth=6,
    learning_rate=0.1,
    n_estimators=500,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)
xgb_classifier.fit(X_train, y_train)
xgb_classifier.save_model("xgboost_model.json")


# Predictions
y_pred = xgb_classifier.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8900984966303784
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.86      0.88       942
           1       0.87      0.92      0.90       987

    accuracy                           0.89      1929
   macro avg       0.89      0.89      0.89      1929
weighted avg       0.89      0.89      0.89      1929



In [34]:
# Function to preprocess a single message
def preprocess_message(message, model):
    # Preprocess the text
    review = re.sub('[^a-zA-Z]', ' ', message)  # Remove non-alphabet characters
    review = review.lower()  # Convert to lowercase
    review = review.split()  # Split into words
    review = [lemma.lemmatize(word) for word in review]  # Lemmatize words
    
    # Tokenize and preprocess the sentence
    tokens = simple_preprocess(' '.join(review))
    
    # Compute the average Word2Vec vector
    feature_vector = avg_word2vec(tokens)
    return feature_vector

# Example: Load unseen data
unseen_messages = ["ham,Ok lar... Joking wif u oni",'Is that seriously how you spell his name', "Nah I don't think he goes to usf, he lives around here though", "Your account balance is low.","HI How are you"]

# Preprocess and compute feature vectors for unseen data
unseen_features = []
for message in unseen_messages:
    unseen_features.append(preprocess_message(message, model))

unseen_features = np.vstack(unseen_features)  # Convert to a 2D NumPy array

# Predict labels for unseen data
unseen_predictions = xgb_classifier.predict(unseen_features)

# Map predictions back to 'ham' or 'spam'
label_map = {0: "ham", 1: "spam"}
unseen_labels = [label_map[pred] for pred in unseen_predictions]

# Print predictions
for i, message in enumerate(unseen_messages):
    print(f"Message: '{message}' -> Predicted Label: {unseen_labels[i]}")


Message: 'ham,Ok lar... Joking wif u oni' -> Predicted Label: spam
Message: 'Is that seriously how you spell his name' -> Predicted Label: spam
Message: 'Nah I don't think he goes to usf, he lives around here though' -> Predicted Label: spam
Message: 'Your account balance is low.' -> Predicted Label: spam
Message: 'HI How are you' -> Predicted Label: spam
