In [1]:
# Imports necessary libraries for data manipulation, machine learning, and evaluation
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Collects the tunable settings of the experiment in one place
DATA_PATH = 'spam_ham_dataset.csv'
MAX_FEATURES = 40  # maximum number of terms kept by the TF-IDF vectorizer
TEST_SIZE = 0.2    # fraction of the rows held out for evaluation
SEED = 42          # fixed seed so the split, the tree and the score repeat on every run

# Reads the dataset from a CSV file into a pandas DataFrame
df = pd.read_csv(DATA_PATH)

# Assigns the text content to the features variable 'X'
X = df['text']

# Assigns the numerical labels (0 for ham, 1 for spam) to the target variable 'y'
y = df['label_num']

# Splits the RAW text into training and testing sets before any fitting happens.
# Splitting first keeps the test emails completely unseen by the vectorizer;
# fitting TF-IDF on the full corpus would leak test-set vocabulary and IDF
# statistics into training and inflate the reported accuracy.
# 'stratify=y' keeps the ham/spam ratio the same in both parts.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# Initializes a TF-IDF Vectorizer to convert text data into a numerical format
# It is configured to consider a maximum of MAX_FEATURES features (words)
tuff = TfidfVectorizer(max_features=MAX_FEATURES)

# Learns the vocabulary and IDF weights from the training text only,
# then applies that same fitted transformation to the held-out text
X_train = tuff.fit_transform(X_train_text)
X_test = tuff.transform(X_test_text)

# Keeps the 'X_vectorized' name defined for any later cell that refers to it.
# It uses the already-fitted vectorizer (transform only), so nothing is re-fitted here.
X_vectorized = tuff.transform(X)

# Initializes the Decision Tree Classifier model with a fixed seed for repeatable trees
model = DecisionTreeClassifier(random_state=SEED)

# Trains the model using the training data
model.fit(X_train, y_train)

# Makes predictions on the test data
y_pred = model.predict(X_test)

# Calculates the accuracy of the model by comparing the true labels with the predicted labels
# (scikit-learn's documented argument order is y_true first, then y_pred).
# The result is printed to the console.
print(accuracy_score(y_test, y_pred))

array([0])

In [17]:
# Displays the loaded DataFrame as the cell's output; the rendering below is
# truncated by pandas to the first and last rows (see the '...' separator row)
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
