In [2]:
# Importing necessary libraries
import pandas as pd

# Read the raw CSV into a DataFrame. The path is relative, so the file must sit
# in the working directory (or be replaced with a full path). The file is
# decoded as latin-1.
data = pd.read_csv(
    'textspamdata.csv',
    encoding='latin-1',
)

# Quick structural check of what was loaded: first rows, column names, dtypes.
for _heading, _value in (
    ("Dataset head:\n", data.head()),
    ("\nColumns:\n", data.columns),
    ("\nData types:\n", data.dtypes),
):
    print(_heading, _value)

Dataset head:
      v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Columns:
 Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

Data types:
 v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object


In [3]:
# Keep only the label and text columns, under descriptive names
# ('v1' -> 'label', 'v2' -> 'message').
#
# The rename happens *before* the column selection so that this cell can be
# re-run safely: rename() silently ignores keys that are no longer present,
# whereas selecting ['v1', 'v2'] a second time would raise KeyError once the
# columns have already been renamed.
# .copy() yields an independent frame so later column assignments on `data`
# do not trigger SettingWithCopyWarning.
data = (
    data
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .loc[:, ['label', 'message']]
    .copy()
)

# Verifying the changes
print("Cleaned dataset head:\n", data.head())
print("\nColumns after renaming:\n", data.columns)

Cleaned dataset head:
   label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Columns after renaming:
 Index(['label', 'message'], dtype='object')


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

# Replace the string labels with integer codes ('ham' -> 0, 'spam' -> 1).
# The fitted encoder is kept in `label_encoder` so the mapping stays available.
label_encoder = LabelEncoder()
data = data.assign(label=label_encoder.fit_transform(data['label']))

# Preprocessing the text: lowercase it and reduce it to space-separated word tokens
def preprocess_text(text):
    """Normalize a raw message before vectorization.

    The text is lowercased, every run of non-word characters (anything other
    than letters, digits and underscore) is replaced by a single space, and
    leading/trailing spaces are removed.

    Parameters
    ----------
    text : str or scalar
        The raw message. Missing values (None, NaN, pd.NA) are treated as an
        empty message; any other non-string scalar is converted with str().

    Returns
    -------
    str
        The cleaned message; "" for empty or missing input.
    """
    if not isinstance(text, str):
        # Empty CSV fields come back from pandas as NaN (a float), which has no
        # .lower(); map missing values to "" instead of raising AttributeError.
        text = "" if pd.isna(text) else str(text)
    # \W also matches whitespace, so a single pass both replaces punctuation
    # and collapses consecutive separators into one space.
    return re.sub(r'\W+', ' ', text.lower()).strip()

# Run the cleaning helper over every message, replacing the raw text.
data = data.assign(message=data['message'].apply(preprocess_text))

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Peek at the first few training messages and their labels
print("Sample preprocessed training messages:\n", X_train.head())
print("\nLabels:\n", y_train.head())

Sample preprocessed training messages:
 1978    no i m in the same boat still here at my moms ...
3989    bank of granite issues strong buy explosive pi...
3935        they r giving a second chance to rahul dengra
4078              o i played smash bros lt gt religiously
4086    private your 2003 account statement for 079737...
Name: message, dtype: object

Labels:
 1978    0
3989    1
3935    0
4078    0
4086    1
Name: label, dtype: int64


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

import joblib

# TF-IDF features with the vocabulary capped at 3000 terms to keep the
# dimensionality down.
vectorizer = TfidfVectorizer(max_features=3000)

# Fit on the training messages only, then reuse the fitted vectorizer to
# transform the test messages.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Confirm both matrices share the same number of feature columns
print("Shape of training data vectors:", X_train_vec.shape)
print("Shape of test data vectors:", X_test_vec.shape)

# Persist the fitted vectorizer so it can be reloaded later
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")

Shape of training data vectors: (4457, 3000)
Shape of test data vectors: (1115, 3000)


['tfidf_vectorizer.joblib']

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build and fit the Logistic Regression classifier in one step (fit() returns
# the estimator itself). max_iter is raised to 1000; increase it further if
# convergence issues arise.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score the held-out test set
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy, per-class metrics, and the confusion matrix
print("Accuracy of the Logistic Regression model:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Persist the trained model so it can be reloaded later
joblib.dump(model, "phishing_text_classifier.joblib")

Accuracy of the Logistic Regression model: 0.968609865470852

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.99      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
 [[964   1]
 [ 34 116]]


['phishing_text_classifier.joblib']