In [1]:
# Smoke test: confirms the kernel is alive and executing cells.
greeting = "Hello World"
print(greeting)

Hello World


In [5]:
import pandas as pd

# Load the labelled email corpus; the relative path is resolved against the
# current working directory.
csv_path = "combined_data.csv"
data = pd.read_csv(csv_path)

# Show the first rows as plain text to check which columns were parsed
head_rows = data.head()
print(head_rows)

   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...


In [7]:
# Count null entries per column so we know whether any rows need dropping or filling
missing_per_column = data.isna().sum()
print(missing_per_column)

label    0
text     0
dtype: int64


In [9]:
import nltk
from nltk.tokenize import word_tokenize

# One made-up sentence used to illustrate each preprocessing step in turn
text = "This is an example email for spam classification."

# Split the sentence into word and punctuation tokens.
# NOTE(review): word_tokenize relies on NLTK tokenizer data that this notebook
# never downloads — confirm it is installed in the target environment.
tokens = word_tokenize(text)

print(tokens)


['This', 'is', 'an', 'example', 'email', 'for', 'spam', 'classification', '.']


In [1]:
import nltk
from nltk.corpus import stopwords

# The stopword lists ship as a separate NLTK data package. Fetch it when it is
# missing so the lookup below does not raise LookupError on a machine that has
# never downloaded it (the notebook already does the same for `wordnet`).
nltk.download('stopwords', quiet=True)

# Get the list of English stopwords (a set gives O(1) membership checks)
stop_words = set(stopwords.words('english'))

# Remove stopwords from the tokens. The comparison is done on the lowercased
# word, but the token itself is kept with its original casing.
# `tokens` is produced by the tokenization cell, which must run before this one;
# running this cell first in a fresh kernel fails with NameError.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print(filtered_tokens)


NameError: name 'tokens' is not defined

In [13]:
from nltk.stem import PorterStemmer

# Porter stemmer: chops suffixes, so the results need not be dictionary words
stemmer = PorterStemmer()

# Reduce every stopword-filtered token to its stem
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

print(stemmed_tokens)


['exampl', 'email', 'spam', 'classif', '.']


In [15]:
# WordNet data backs the lemmatizer; make sure it is available locally
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance, reused by the preprocessing function further down
lemmatizer = WordNetLemmatizer()

# Map each stopword-filtered token to its lemma (default part of speech)
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

print(lemmatized_tokens)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...


['example', 'email', 'spam', 'classification', '.']


In [17]:
# Raw email bodies taken from the `text` column of the `data` DataFrame loaded earlier.
# NOTE(review): `texts` is not referenced by any later cell shown in this notebook;
# the preprocessing step reads data['text'] directly — confirm whether it is still needed.
texts = data['text'].values

# Function to preprocess text
def preprocess_text(text):
    """Normalise one raw email into a space-joined string of lemmas.

    Pipeline: tokenize -> lowercase -> drop English stopwords -> lemmatize.

    Tokens are lowercased before lemmatization so that the text used to train
    the model is prepared exactly like the text scored at prediction time (the
    `preprocess_text` defined later in the notebook for new emails lowercases
    too). Without this, capitalised words reach the lemmatizer in a different
    form during training than during inference.

    Relies on the module-level `word_tokenize`, `stop_words` and `lemmatizer`
    created in earlier cells.

    Args:
        text: the raw email body as a string.

    Returns:
        The cleaned text: lowercase lemmas separated by single spaces
        (an empty string when no token survives).
    """
    # Tokenize, then lowercase each token once so the stopword check and the
    # lemmatizer both see the same normalised form
    lowered = (token.lower() for token in word_tokenize(text))

    # Remove stopwords
    kept = (token for token in lowered if token not in stop_words)

    # Lemmatize the surviving tokens and join them back into a single string
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

# Clean every email and store the result next to the raw text in a new column
cleaned_series = data['text'].apply(preprocess_text)
data['cleaned_text'] = cleaned_series

# Raw vs. cleaned text for the first rows, as a quick sanity check
before_after = data[['text', 'cleaned_text']]
print(before_after.head())


                                                text  \
0  ounce feather bowl hummingbird opec moment ala...   
1  wulvob get your medircations online qnb ikud v...   
2   computer connection from cnn com wednesday es...   
3  university degree obtain a prosperous future m...   
4  thanks for all your answers guys i know i shou...   

                                        cleaned_text  
0  ounce feather bowl hummingbird opec moment ala...  
1  wulvob get medircations online qnb ikud viagra...  
2  computer connection cnn com wednesday escapenu...  
3  university degree obtain prosperous future mon...  
4  thanks answer guy know checked rsync manual wo...  


In [19]:
# Use cleaned text for vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): this fits the vectorizer on the *entire* corpus, i.e. before the
# train/test split done in the next cell, so the vocabulary and IDF weights
# learned here include rows that later end up in the test set.
# Neither result is consumed in the cells shown: `X` is rebound to
# data['cleaned_text'] before the split, and `vectorizer` is re-created and
# fitted on X_train only. This looks like a leftover exploratory step —
# confirm nothing else depends on it before removing.
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(data['cleaned_text'])

# The model in this notebook is trained on the split-aware TF-IDF matrices built
# in later cells, not on this `X`.


In [21]:
from sklearn.model_selection import train_test_split

# Inputs are the cleaned strings (still raw text at this point); target is the label column
X = data['cleaned_text']
y = data['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows landed in each partition
for split_name, split_rows in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size: {len(split_rows)}")


Training set size: 66758
Testing set size: 16690


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder whose vocabulary is capped at 3000 terms
vectorizer = TfidfVectorizer(max_features=3000)

# Learn vocabulary and IDF weights from the training split only, then encode it
X_train_tfidf = vectorizer.fit_transform(X_train)

# Encode the held-out split with the already-fitted vectorizer (no refit)
X_test_tfidf = vectorizer.transform(X_test)

# Both matrices should have the same number of columns
for split_label, matrix in (("training", X_train_tfidf), ("test", X_test_tfidf)):
    print(f"TF-IDF {split_label} set shape: {matrix.shape}")


TF-IDF training set shape: (66758, 3000)
TF-IDF test set shape: (16690, 3000)


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic regression with default hyperparameters, trained on the TF-IDF features
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred = model.predict(X_test_tfidf)

# Overall accuracy plus per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}", "Classification Report:", report, sep="\n")


Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      7938
           1       0.98      0.99      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690



In [63]:
# Define the preprocessing function
def preprocess_text(text):
    """Turn one raw email into a space-joined string of lowercase lemmas.

    Each token is lowercased, discarded if it is an English stopword, and
    otherwise lemmatized. Uses the module-level `word_tokenize`, `stop_words`
    and `lemmatizer` set up in earlier cells.

    NOTE(review): this rebinds the `preprocess_text` name defined in the
    training section; keep the two definitions in step so new emails are
    prepared the same way as the training data.
    """
    lemmas = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

# A benign, work-style message used to try the trained pipeline end to end
new_email = "Hello, just a reminder about our weekly team check-in tomorrow at 10 AM. See you then!"

# Run it through the same cleaning function before vectorizing
preprocessed_email = preprocess_text(new_email)

print(f"Preprocessed Email: {preprocessed_email}")


Preprocessed Email: hello , reminder weekly team check-in tomorrow 10 . see !


In [65]:
# The vectorizer expects an iterable of documents, so wrap the single email in a list
single_email_batch = [preprocessed_email]
new_email_tfidf = vectorizer.transform(single_email_batch)

# One row, and as many columns as the fitted vocabulary
print(f"TF-IDF Vector Shape: {new_email_tfidf.shape}")


TF-IDF Vector Shape: (1, 3000)


In [67]:
# Predict the label for the single vectorized email
prediction = model.predict(new_email_tfidf)

# Label 1 is reported as spam; any other label as not spam
is_spam = prediction[0] == 1
verdict = "The email is classified as SPAM." if is_spam else "The email is classified as NOT SPAM."
print(verdict)


The email is classified as NOT SPAM.


In [69]:
import pickle

# `vectorizer` and `model` were already fitted in the training cells above
# (test_size=0.2 / random_state=42 split, TfidfVectorizer(max_features=3000),
# default LogisticRegression). This cell used to repeat that split / fit / train
# sequence verbatim; the fitted objects are now saved as they are, so there is a
# single place where training is defined and no redundant refit of the corpus.

# Guard against stale kernel state (cells re-run out of order): the classifier
# must have been trained on features produced by this exact vectorizer.
n_model_features = model.coef_.shape[1]
n_vectorizer_features = len(vectorizer.vocabulary_)
if n_model_features != n_vectorizer_features:
    raise ValueError(
        f"model expects {n_model_features} features but vectorizer produces "
        f"{n_vectorizer_features}; re-run the vectorizing and training cells"
    )

# Save the model and vectorizer
# NOTE: pickle files execute code when loaded — only unpickle these from a
# location you trust.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
