In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
import string

In [2]:
import gensim.downloader as api

# Identifier of the pre-trained Google News Word2Vec model in gensim's downloader.
W2V_MODEL_NAME = "word2vec-google-news-300"

# Load the pre-trained word vectors; `model` is the embedding model used by
# the feature-extraction cells further down.
model = api.load(W2V_MODEL_NAME)



In [9]:
# Fetch the NLTK data packages needed for tokenization and stop-word removal.
# nltk.download() signals failure by returning False rather than raising, so
# every result is checked here instead of only displaying the last one.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    if not nltk.download(resource):
        raise RuntimeError(f"Could not download NLTK resource {resource!r}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
#function for text processing
_STOP_WORD_CACHE = {}


def _stop_words_for(language):
    """Return NLTK's stop words for `language` as a frozenset, loading them only once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def preprocess_message(message, stop_words=None):
    """Turn a raw message into a list of lowercase content tokens.

    The message is tokenized with NLTK's word_tokenize, tokens that are not
    purely alphabetic (punctuation, numbers, mixed tokens) are discarded, the
    remaining tokens are lowercased, and stop words are removed.

    Args:
        message: Text to process.
        stop_words: Optional container of lowercase words to drop. When
            omitted, NLTK's English stop words are used; they are read once
            and cached instead of being reloaded on every call.

    Returns:
        list: The surviving tokens, in their original order.
    """
    if stop_words is None:
        stop_words = _stop_words_for('english')
    # Filter on isalpha() before lowercasing, exactly as the tokens come out
    # of the tokenizer, then drop stop words from the lowercased stream.
    lowered = (token.lower() for token in word_tokenize(message) if token.isalpha())
    return [token for token in lowered if token not in stop_words]

In [11]:
#messages to word vectors
def message_to_vec(message, model):
    """Embed a message as the average of its in-vocabulary word vectors.

    The message is first reduced to tokens by preprocess_message; tokens that
    the embedding model does not contain are skipped.

    Args:
        message: Raw message text.
        model: Word-embedding lookup supporting `in`, indexing by word, and a
            `vector_size` attribute.

    Returns:
        The element-wise mean of the vectors that were found, or a zero
        vector of length model.vector_size when no token is in the vocabulary.
    """
    known_vectors = [model[token] for token in preprocess_message(message) if token in model]
    if not known_vectors:
        # Nothing recognisable in the message: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [12]:
# Read the raw dataset from spam.csv, decoding the file as latin-1.
df = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin-1")

In [13]:
# Keep only the label ('v1') and message ('v2') columns under readable names.
# rename() ignores labels that are not present, so, unlike slicing on
# 'v1'/'v2' and then assigning df.columns, this cell can be re-run on the
# already-renamed frame without raising a KeyError.
df = df.rename(columns={'v1': 'Label', 'v2': 'Message'})[['Label', 'Message']]

In [14]:
# Feature matrix: one message_to_vec() embedding per row of df['Message'].
message_vectors = [message_to_vec(text, model) for text in df['Message']]
X = np.array(message_vectors)

# Target vector: map the text labels to integers (ham = 0, spam = 1).
label_to_int = {'ham': 0, 'spam': 1}
y = df['Label'].map(label_to_int)

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [16]:
# Fit a logistic-regression classifier on the training embeddings, allowing
# the solver up to 1000 iterations.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)

In [17]:
# Predict on the held-out split and report accuracy as a percentage.
y_pred = classifier.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 94.17%


In [18]:
def predict_message_class(model, w2v_model, message):
    """Classify a single message as 'spam' or 'ham'.

    Args:
        model: Fitted classifier exposing predict(). Note that this is the
            classifier, not the word-embedding model.
        w2v_model: Word-embedding model handed to message_to_vec.
        message: Raw message text.

    Returns:
        str: 'spam' when the predicted label equals 1, otherwise 'ham'.
    """
    # Convert the message to a vector
    message_vec = message_to_vec(message, w2v_model)
    # predict() receives a one-row batch; take the single label out explicitly
    # instead of relying on the truth value of a one-element array.
    predicted_label = model.predict([message_vec])[0]
    return 'spam' if predicted_label == 1 else 'ham'

In [25]:
# Classify one hand-written example. Keyword arguments make explicit that
# `classifier` is the predictor and `model` is the word-embedding model.
test_message = "Message 'yes' to win!'"
predicted_class = predict_message_class(
    model=classifier,
    w2v_model=model,
    message=test_message,
)
print(f"Predicted class: {predicted_class}")

Predicted class: spam
