<a href="https://colab.research.google.com/github/vikishan13/Email-spam-detection-using-Transfer-Learning/blob/main/E_mail_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
import pandas as pd
try:
    # Attempt to read with UTF-8 encoding
    data = pd.read_csv("/content/spam.csv")
except UnicodeDecodeError:
    # If UTF-8 fails, try 'latin-1' encoding
    data = pd.read_csv("/content/spam.csv", encoding='latin-1')

In [3]:
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
drop_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data.drop(drop_columns, axis = 1, inplace = True)

In [5]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
new_names = {'v1' : 'label', 'v2': 'text'}
data.rename(columns = new_names, inplace = True)

In [7]:
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
data['text']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [9]:
data['label']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [10]:
# let's encode the label column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['label'] = encoder.fit_transform(data['label'])

In [11]:
data

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [12]:
X = data['text'].to_list()
y = data['label']

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size  = 0.2, random_state = 42)

In [14]:
import tensorflow_hub as hub

In [15]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [16]:
xtrain = embed(xtrain)
xtest = embed(xtest)

In [17]:
xtrain

<tf.Tensor: shape=(4457, 512), dtype=float32, numpy=
array([[-0.02236211, -0.00179335,  0.04373107, ...,  0.00971716,
        -0.02651805, -0.01898406],
       [-0.03164499, -0.06666242, -0.0307229 , ..., -0.0243355 ,
        -0.05803042,  0.00491448],
       [ 0.00368232, -0.06883072, -0.07631648, ..., -0.05066898,
        -0.07880119, -0.07119669],
       ...,
       [-0.01383996, -0.02353746, -0.00426729, ..., -0.02782197,
        -0.03860996, -0.0537454 ],
       [ 0.00375236,  0.08156195, -0.00991514, ..., -0.04874335,
        -0.02668169, -0.00636178],
       [-0.01688972, -0.02840749,  0.07221662, ...,  0.00181214,
        -0.05104123,  0.03195838]], dtype=float32)>

In [18]:
xtest

<tf.Tensor: shape=(1115, 512), dtype=float32, numpy=
array([[ 0.04360111, -0.06851714, -0.05270006, ..., -0.03659743,
        -0.0447177 , -0.0639959 ],
       [ 0.01490014, -0.08595566, -0.01988588, ...,  0.01164868,
        -0.02205576, -0.02266758],
       [ 0.0539706 , -0.02322666, -0.05053682, ...,  0.04307747,
        -0.042799  , -0.03697591],
       ...,
       [-0.02263384, -0.03707375,  0.01714083, ..., -0.03907248,
        -0.02529422, -0.04534036],
       [ 0.00608291, -0.04348052, -0.04518567, ...,  0.07498675,
         0.09830128,  0.00835056],
       [-0.0510178 , -0.03854123,  0.03688184, ..., -0.04857623,
         0.01716468, -0.03620029]], dtype=float32)>

In [19]:
xtrain[1].shape

TensorShape([512])

In [20]:
model = models.Sequential()

In [21]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2

In [22]:
model.add(Dense(64, activation = 'relu', input_shape = (512, ), kernel_regularizer=l2(0.01)))

In [23]:
model.add(Dense(1, activation = 'sigmoid'))

In [24]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [25]:
xtrain.shape

TensorShape([4457, 512])

In [26]:
4457/32

139.28125

In [27]:
model.fit(xtrain, ytrain, epochs = 20, batch_size = 32, validation_data = (xtest, ytest))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7b7d00336b00>

In [28]:
predictions = model.predict(xtest)
predictions = (predictions > 0.5).astype(int)



In [29]:
accuracy_score(predictions, ytest)

0.9811659192825112

In [30]:
classification_report(predictions, ytest)

'              precision    recall  f1-score   support\n\n           0       0.99      0.98      0.99       976\n           1       0.89      0.96      0.93       139\n\n    accuracy                           0.98      1115\n   macro avg       0.94      0.97      0.96      1115\nweighted avg       0.98      0.98      0.98      1115\n'

In [31]:
confusion_matrix(predictions, ytest)

array([[960,  16],
       [  5, 134]])

In [32]:
user_input = input("Enter the text: ")

Enter the text: Get 50% discount on items for this pongal festival, offer is soon to be expired. Get the product quickly


In [33]:
# Convert the input text to embeddings using Universal Sentence Encoder
user_embedding = embed([user_input])

# Make a prediction using the trained model
prediction = model.predict(user_embedding)

# Convert the prediction to a binary value (0 or 1)
prediction_label = 1 if prediction[0][0] > 0.5 else 0

# Display the prediction
if prediction_label == 1:
    print("The input is predicted as spam.")
else:
    print("The input is predicted as not spam.")

The input is predicted as spam.


In [34]:
user_input = input("Enter the text: ")

Enter the text: Hi Mom and family, I hope everyone is fine. Will meet u guys in Pongal festival


In [35]:
# Convert the input text to embeddings using Universal Sentence Encoder
user_embedding = embed([user_input])

# Make a prediction using the trained model
prediction = model.predict(user_embedding)

# Convert the prediction to a binary value (0 or 1)
prediction_label = 1 if prediction[0][0] > 0.5 else 0

# Display the prediction
if prediction_label == 1:
    print("The input is predicted as spam.")
else:
    print("The input is predicted as not spam.")

The input is predicted as not spam.
