In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("Phishing_Email.csv", encoding='latin-1')

df

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email
...,...,...,...
18645,18646,date a lonely housewife always wanted to date ...,Phishing Email
18646,18647,request submitted : access request for anita ....,Safe Email
18647,18648,"re : important - prc mtg hi dorn & john , as y...",Safe Email
18648,18649,press clippings - letter on californian utilit...,Safe Email


In [2]:
df = df.drop(columns=["Unnamed: 0"])
df = df.rename(columns={"Email Text": "texto"})
print(df["Email Type"].value_counts())
print("Total: ", df["Email Type"].value_counts().sum())

Email Type
Safe Email        11322
Phishing Email     7328
Name: count, dtype: int64
Total:  18650


In [3]:
df["texto"].replace('empty',np.nan,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["texto"].replace('empty',np.nan,inplace=True)


In [4]:
df = df.dropna()
print(df.value_counts().sum())

18101


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df["Email Type"] = le.fit_transform(df["Email Type"])

In [None]:
import unicodedata
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return ''.join([c for c in nfkd_form if not unicodedata.category(c) == 'Mn'])

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word not in ENGLISH_STOP_WORDS])

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = remove_accents(text)
    text = remove_stopwords(text)
    return text.strip()

df["texto"] = df["texto"].apply(preprocess_text)

print(df.head())


                                               texto  Email Type
0  6 1100 disc uniformitarianism 1086 sex lang di...           1
1  galicismos galicismo spanish term names improp...           1
2  equistar deal tickets available assist robert ...           1
3  hello hot lil horny toy dream open minded pers...           0
4  software incredibly low prices 86 lower draper...           0


In [None]:
from gensim.models import Word2Vec

def word2vec(textos_tokenizados, vector_size=200, window=6, min_count=2):
    model = Word2Vec(
        sentences=textos_tokenizados,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        workers=4
    )
    return model

def vetor_medio(texto, model):
    palavras = texto.split()
    vetores = [model.wv[p] for p in palavras if p in model.wv]
    if vetores:
        return np.mean(vetores, axis=0)
    else:
        return np.zeros(model.vector_size)

def word2vec_transform(df, model):
    vetores = df["texto"].apply(lambda x: vetor_medio(x, model))
    matriz = np.vstack(vetores.values)

    df_w2v = pd.DataFrame(matriz, columns=[f"w2v_{i}" for i in range(model.vector_size)])
    df_w2v["Email Type"] = df["Email Type"].values

    return df_w2v, model

df["tokens"] = df["texto"].apply(lambda x: x.lower().split())

modelo_w2v = word2vec(df["tokens"].tolist())

df_final, modelo_w2v = word2vec_transform(df, modelo_w2v)

print(df_final.head())


      w2v_0     w2v_1     w2v_2     w2v_3     w2v_4     w2v_5     w2v_6  \
0  0.153812 -0.061595 -0.118919  0.169105  0.358679 -0.100350 -0.054256   
1 -0.006515 -0.050960 -0.070281  0.185318  0.282403 -0.066567  0.101860   
2  0.114173 -0.171148 -0.259708  0.305689  0.367572  0.176437 -0.158071   
3  0.095020 -0.041170 -0.151057  0.230449  0.314425 -0.125564 -0.175882   
4  0.137910 -0.093879 -0.137078  0.099013  0.292562 -0.084732 -0.102193   

      w2v_7     w2v_8     w2v_9  ...   w2v_191   w2v_192   w2v_193   w2v_194  \
0  0.397640 -0.031036  0.072123  ... -0.262040  0.041747 -0.169848  0.055648   
1  0.338936 -0.093375  0.109206  ... -0.170371 -0.095704 -0.144121  0.068030   
2  0.441043  0.059579  0.590908  ... -0.283245  0.103755 -0.222808  0.395738   
3  0.313436  0.010741  0.338988  ... -0.193292  0.094837 -0.250937  0.043706   
4  0.467764 -0.032607  0.178662  ... -0.246156  0.001393 -0.153404  0.148858   

    w2v_195   w2v_196   w2v_197   w2v_198   w2v_199  Email Type  
0 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score


X = df_final.drop(columns=["Email Type"])
y = df_final["Email Type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)

params = {
    'max_depth': [10, 15],
    'min_samples_split': [15, 20],
    'min_samples_leaf': [5, 10],
    'ccp_alpha': np.linspace(0, 0.1)
}

grid = GridSearchCV(
    DecisionTreeClassifier(random_state=77),
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)

y_pred = grid.best_estimator_.predict(X_test)
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Best Params: {'ccp_alpha': 0.0, 'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 15}
Best Score: 0.92866036817525
Accuracy on Test Set: 0.9328914664457332
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1381
           1       0.95      0.94      0.95      2240

    accuracy                           0.93      3621
   macro avg       0.93      0.93      0.93      3621
weighted avg       0.93      0.93      0.93      3621



In [15]:
novo_email = {
    "texto": """
Subject: Your Account Has Been Flagged for Security Verification

Dear User,

We have noticed suspicious sign-in attempts to your Microsoft account. For your protection, please verify your identity by clicking the secure link below:

[Secure Your Account]

Failure to verify within 12 hours will result in account restriction.

Microsoft Security Team
 """
}

df_novo = pd.DataFrame([novo_email])

df_novo["texto"] = df_novo["texto"].apply(lambda x: x.lower().split())

X_novo = np.array([vetor_medio(" ".join(df_novo["texto"].iloc[0]), modelo_w2v)])

X_novo_df = pd.DataFrame(X_novo, columns=[f"w2v_{i}" for i in range(X_novo.shape[1])])


# Predição do modelo
y_pred_novo = grid.best_estimator_.predict(X_novo_df)
if y_pred_novo[0] == 0:
  print("Classe prevista: Phishing")
else:
  print("Classe prevista: Não Phishing")




Classe prevista: Phishing


In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
from keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


X = df_final.drop(columns=["Email Type"])
y = df_final["Email Type"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=777)

input_dim = X_train.shape[1]

model = Sequential()
model.add(Dense(32, input_dim=input_dim, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(16, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))  # Output binário (Phishing ou Não)

optimizer = optimizers.Adam(learning_rate=0.01)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

print(model.summary())

history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2, verbose=1)

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=[str(c) for c in label_encoder.classes_]))



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8714 - loss: 0.2852 - val_accuracy: 0.9582 - val_loss: 0.1069
Epoch 2/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9662 - loss: 0.1039 - val_accuracy: 0.9568 - val_loss: 0.1184
Epoch 3/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9717 - loss: 0.0870 - val_accuracy: 0.9675 - val_loss: 0.0824
Epoch 4/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9776 - loss: 0.0647 - val_accuracy: 0.9696 - val_loss: 0.0771
Epoch 5/5
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9778 - loss: 0.0661 - val_accuracy: 0.9658 - val_loss: 0.0862
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy on Test Set: 0.9726594863297432
              precision    recall  f1-score   support

           0       0.97      

In [24]:
novo_email = {
    "texto": """ Action Required: Delivery Held Due to Unpaid Fee

Hello,

Your package is being held due to an unpaid delivery fee. To release your shipment, please complete the payment at the link below:

[Complete Payment]

Failure to act will result in return to sender.

Thank you,  
Fast Delivery Logistics
    """
}

df_novo = pd.DataFrame([novo_email])

df_novo["texto"] = df_novo["texto"].apply(preprocess_text)

# transformação w2c
vetor_novo = vetor_medio(df_novo["texto"].iloc[0], modelo_w2v)  
X_novo = np.array([vetor_novo])  

y_pred_prob = model.predict(X_novo)
y_pred = (y_pred_prob > 0.5).astype(int) 

if y_pred[0][0] == 0:
    print("Classe prevista: Phishing")
else:
    print("Classe prevista: Não Phishing")

print("Probabilidade de ser Não Phishing:", y_pred_prob[0][0])
print("Probabilidade de ser Phishing:", 1 - y_pred_prob[0][0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
Classe prevista: Phishing
Probabilidade de ser Não Phishing: 0.02271071
Probabilidade de ser Phishing: 0.9772892892360687


In [26]:
# k-fold + gridsearchcv
from sklearn.model_selection import KFold, cross_val_score
