<a href="https://colab.research.google.com/github/tonysarre/extract/blob/main/FutureIntern_DS_05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [23]:
# Load the SMS spam dataset.
# NOTE(review): the path looks like a Google Drive mount inside Colab, so Drive
# presumably has to be mounted before this cell runs -- confirm, or point
# DATA_PATH at a local copy of the file.
DATA_PATH = "/content/drive/MyDrive/Future Intern/spam.csv"

# Source column name -> name used by the rest of the notebook
COLUMN_NAMES = {'v1': 'label', 'v2': 'message'}

# Parse only the two columns we need; any other columns in the CSV are
# skipped at read time instead of being loaded and then thrown away.
data = pd.read_csv(DATA_PATH, encoding='latin-1', usecols=list(COLUMN_NAMES))

# Rename by mapping (not by position) and select explicitly so the frame is
# always ordered (label, message) regardless of the column order in the file.
data = data.rename(columns=COLUMN_NAMES)[list(COLUMN_NAMES.values())]

# Display the first few rows to confirm
print(data.head())

# Check for NaN values in the dataset
print("Number of missing values in the dataset:")
print(data.isna().sum())

# Drop rows where there are NaN values in either 'label' or 'message' column
data = data.dropna()

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Number of missing values in the dataset:
label      0
message    0
dtype: int64


In [24]:
# Encode the 'label' column (ham = 0, spam = 1)
LABEL_ENCODING = {'ham': 0, 'spam': 1}

# Guard against re-running this cell: once the labels are already 0/1, mapping
# them through LABEL_ENCODING again would turn every label into NaN.
if not data['label'].isin(list(LABEL_ENCODING.values())).all():
    encoded_labels = data['label'].map(LABEL_ENCODING)

    # .map() yields NaN for any value that is not a key of the mapping.
    # Fail here with the offending values instead of letting NaN labels
    # reach the train/test split and model fitting.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values (expected 'ham'/'spam'): {unexpected}")

    data['label'] = encoded_labels.astype('int64')

# Ensure 'message' column contains string data
data['message'] = data['message'].astype(str)



In [25]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified by label -- consider whether
# stratify=data['label'] is wanted if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english')

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are projected onto that same fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Multinomial Naive Bayes classifier; fit() returns the fitted estimator itself
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test messages
y_pred = model.predict(X_test_tfidf)


In [26]:
# Score the test-set predictions with each metric, in the order they are reported below
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric on its own line, rounded to four decimal places
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)

Accuracy: 0.9668
Precision: 1.0000
Recall: 0.7533
F1 Score: 0.8593
