<a href="https://colab.research.google.com/github/vijayvenkatj/Ai-project/blob/main/python_proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Package installations**

In [5]:
!pip install --quiet scikit-learn torch
!pip install tensorflow scikit-learn pandas numpy




# **Loading Dataset**

In [6]:
import pandas as pd

spam_url = "https://raw.githubusercontent.com/vijayvenkatj/Ai-project/main/spam.csv"
phishing_url = "https://raw.githubusercontent.com/vijayvenkatj/Ai-project/main/Phishing_Email.csv"


spam = pd.read_csv(spam_url,encoding="latin-1")
phishing = pd.read_csv(phishing_url,encoding="latin-1")

phishing.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


# **Preprocessing dataset**

In [7]:
spam = spam.rename(columns={'v1': 'label', 'v2': 'text'})
spam = spam[['label', 'text']]
spam['label'] = spam['label'].replace({'ham': 'safe', 'spam': 'spam'})

phishing = phishing.rename(columns={'Email Type': 'label', 'Email Text': 'text'})
phishing = phishing[['label', 'text']]
phishing['label'] = phishing['label'].replace({'Phishing Email': 'spam', 'Safe Email': 'safe'})

combined = pd.concat([spam, phishing], ignore_index=True)
combined.head()

Unnamed: 0,label,text
0,safe,"Go until jurong point, crazy.. Available only ..."
1,safe,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,safe,U dun say so early hor... U c already then say...
4,safe,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
import re

def clean_text(text):
    text = re.sub(r'\d+', ' NUM ', text)

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', ' URL ', text)
    text = re.sub(r'\S+@\S+', ' EMAIL ', text)

    text = re.sub(r'[^a-zA-Z0-9@:/.\-_\s]', ' ', text)

    text = re.sub(r'\s+', ' ', text).strip()
    return text


combined['text'] = combined['text'].fillna('').astype(str)
combined['text'] = combined['text'].apply(clean_text)

combined.head()


Unnamed: 0,label,text
0,safe,go until jurong point crazy.. available only i...
1,safe,ok lar... joking wif u oni...
2,spam,free entry in num a wkly comp to win fa cup fi...
3,safe,u dun say so early hor... u c already then say...
4,safe,nah i don t think he goes to usf he lives arou...


# **Tokenising the data**

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(combined['text']).toarray()


combined['label'] = combined['label'].map({'safe': 0, 'spam': 1})

# **Training stage**

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, combined['label'], test_size=0.2, random_state=42
)


model = Sequential([
    Dense(256, activation='relu', input_dim=X_train.shape[1]),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])


model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=1)


y_pred = (model.predict(X_test) > 0.5).astype("int32")

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m545/545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.8616 - loss: 0.2894 - val_accuracy: 0.9499 - val_loss: 0.1084
Epoch 2/5
[1m545/545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9737 - loss: 0.0654 - val_accuracy: 0.9634 - val_loss: 0.1024
Epoch 3/5
[1m545/545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 20ms/step - accuracy: 0.9840 - loss: 0.0362 - val_accuracy: 0.9613 - val_loss: 0.1246
Epoch 4/5
[1m545/545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - accuracy: 0.9879 - loss: 0.0258 - val_accuracy: 0.9587 - val_loss: 0.1595
Epoch 5/5
[1m545/545[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.9882 - loss: 0.0210 - val_accuracy: 0.9628 - val_loss: 0.1661
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3252
 

# **Testing Phase**

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import pandas as pd

# --- Predict on test set ---
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()  # Convert probabilities → 0 or 1

# --- Evaluate performance ---
print("✅ Model Evaluation on Test Data")
print("--------------------------------")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["safe", "spam"]))

# --- Confusion Matrix ---
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual Safe", "Actual Spam"], columns=["Predicted Safe", "Predicted Spam"])
print(cm_df)

# --- Optional: Show Sample Predictions ---
sample_idx = np.random.choice(len(X_test), 10, replace=False)
sample_preds = y_pred[sample_idx]
sample_actual = y_test.iloc[sample_idx].values

sample_df = pd.DataFrame({
    "Actual": ["safe" if a == 0 else "spam" for a in sample_actual],
    "Predicted": ["safe" if p == 0 else "spam" for p in sample_preds]
})
print("\n📊 Sample Predictions:\n", sample_df)


[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
✅ Model Evaluation on Test Data
--------------------------------
Accuracy: 0.9608

Classification Report:
               precision    recall  f1-score   support

        safe       0.97      0.97      0.97      3252
        spam       0.94      0.94      0.94      1593

    accuracy                           0.96      4845
   macro avg       0.96      0.95      0.96      4845
weighted avg       0.96      0.96      0.96      4845


Confusion Matrix:
             Predicted Safe  Predicted Spam
Actual Safe            3163              89
Actual Spam             101            1492

📊 Sample Predictions:
   Actual Predicted
0   safe      safe
1   spam      spam
2   safe      safe
3   safe      spam
4   safe      safe
5   spam      spam
6   safe      safe
7   spam      spam
8   safe      safe
9   spam      spam


In [15]:
# Example: custom email samples
custom_emails = [
    "Congratulations! You've won a $500 Amazon gift card. Click here to claim your reward!",
    "Meeting rescheduled to 3 PM. Please confirm your attendance.",
    "Urgent! Your bank account has been suspended. Click here to verify immediately.",
    "Lunch tomorrow? I’ll bring the documents.",
    "You’ve been selected for a limited-time offer. Visit our website now!"
]

# --- Clean and vectorize custom text (same process used for training) ---
custom_emails_clean = [clean_text(email) for email in custom_emails]
custom_X = vectorizer.transform(custom_emails_clean).toarray()  # ANN expects dense arrays

# --- Predict probabilities ---
custom_probs = model.predict(custom_X)
custom_preds = (custom_probs > 0.5).astype(int).flatten()  # Convert to 0/1 labels

# --- Map labels back ---
label_map = {0: 'safe', 1: 'spam'}
predicted_labels = [label_map[p] for p in custom_preds]

# --- Display results ---
print("\n🔍 Custom Email Predictions")
print("---------------------------------")
for email, label, prob in zip(custom_emails, predicted_labels, custom_probs):
    print(f"📧 Email: {email}\n➡️ Predicted Label: {label.upper()})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step

🔍 Custom Email Predictions
---------------------------------
📧 Email: Congratulations! You've won a $500 Amazon gift card. Click here to claim your reward!
➡️ Predicted Label: SPAM)

📧 Email: Meeting rescheduled to 3 PM. Please confirm your attendance.
➡️ Predicted Label: SAFE)

📧 Email: Urgent! Your bank account has been suspended. Click here to verify immediately.
➡️ Predicted Label: SPAM)

📧 Email: Lunch tomorrow? I’ll bring the documents.
➡️ Predicted Label: SAFE)

📧 Email: You’ve been selected for a limited-time offer. Visit our website now!
➡️ Predicted Label: SPAM)

