In [None]:
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk 
import string

from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize

#nltk.download('punkt')
#nltk.download('punkt_tab')
#nltk.download('stopwords')

In [None]:
# Stemmer and English stop-word set shared by the text-cleaning helpers below.
ps = PorterStemmer()
try:
    stopwords = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when the corpus has not been downloaded yet
    # (e.g. on a fresh environment); fetch it once and retry.
    nltk.download('stopwords')
    stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# Load the training corpus and normalise the text column only: force it to a
# string dtype and replace missing bodies with the "empty" placeholder token.
# (Filling every column with a string placeholder, as before, also touched
# non-text columns; NaN filling does not apply to a string column.)
df = pl.read_csv("./datasets/Phishing_Email.csv")
df = df.with_columns(
    pl.col("Email Text").cast(pl.Utf8).fill_null("empty")
)

In [None]:
# Text preview of the loaded frame after the missing-value handling.
print(df)

In [None]:
# Compiled once at definition time; a raw string avoids the invalid "\s"
# escape that a plain string literal triggers on recent Python versions.
_LINK_PATTERN = re.compile(r'https?://\S+')


def remove_links(text: str) -> str:
    """Return ``text`` with every http:// or https:// URL removed.

    A URL is matched from its scheme up to the next whitespace character.
    Surrounding whitespace is left untouched, so removing a URL in the middle
    of a sentence leaves the two neighbouring spaces in place.

    Args:
        text: The string to clean.

    Returns:
        The input with all matched URLs replaced by the empty string.
    """
    return _LINK_PATTERN.sub('', text)

In [None]:
def stemming_text(text: str) -> str:
    """Reduce ``text`` to a space-separated string of unique word stems.

    Steps: strip ASCII punctuation, tokenise, drop English stop words, stem
    each remaining token, discard stems of two characters or fewer, and keep
    each distinct stem once in first-seen order.

    Args:
        text: Raw text to normalise. A falsy value (``""`` or ``None``) is
            treated as missing.

    Returns:
        The joined stems, or the placeholder ``"empty"`` when the input is
        missing or nothing survives the filtering.
    """
    if not text:
        return "empty"
    text = text.translate(str.maketrans('', '', string.punctuation))

    # A dict is used as an insertion-ordered set: stems stay unique while the
    # output order is deterministic (plain set iteration order for strings
    # varies between interpreter runs).
    stems = {}
    for word in word_tokenize(text):
        # Compare the lower-cased token so capitalised stop words ("The",
        # "AND") are removed as well as lower-case ones.
        if word.lower() in stopwords:
            continue
        stem = ps.stem(word)
        # The length filter is applied uniformly, including when only a single
        # stem remains, so very short fragments never reach the output.
        if len(stem) > 2:
            stems[stem] = None

    return " ".join(stems) if stems else "empty"

In [None]:
# Strip URLs from every email body, overwriting the text column in place.
email_text_no_links = pl.col("Email Text").map_elements(
    remove_links,
    return_dtype=pl.Utf8,
)
df = df.with_columns(email_text_no_links.alias("Email Text"))

In [None]:
# Replace every email body by its stemmed, stop-word-free representation.
email_text_stemmed = pl.col("Email Text").map_elements(
    stemming_text,
    return_dtype=pl.Utf8,
)
df = df.with_columns(email_text_stemmed.alias("Email Text"))


In [None]:
# Text preview of the frame after link removal and stemming.
print(df)

In [None]:
# Sequential 70/30 split: the first 70% of the rows train the model and the
# remaining rows are held out for testing (no shuffling is performed).
divid = int(df.height * 0.7)
df_train = df.slice(0, divid)
df_test = df.slice(divid)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder and the classifier it will feed.
vectorizer = CountVectorizer()
clf = LogisticRegression()

# Learn the vocabulary from the training emails only and encode them.
x_train = vectorizer.fit_transform(df_train["Email Text"].to_list())


In [None]:
import pandas as pd

# Preview only the first rows: densifying the whole sparse document-term matrix
# allocates rows x vocabulary cells just to display a handful of them. The
# feature names are passed as a flat sequence so pandas builds a regular column
# Index instead of a one-level MultiIndex.
pd.DataFrame(x_train[:5].toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Fit the classifier on the encoded training emails, using the "Email Type"
# column of the training frame as the target labels.
clf.fit(x_train, df_train["Email Type"])

In [None]:
# Encode the held-out emails with the vocabulary learned during training.
x_test = vectorizer.transform(df_test["Email Text"].to_list())


In [None]:
# Predict a label for every held-out email.
y_pred = clf.predict(x_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out emails whose predicted label matches the true one.
test_accuracy = accuracy_score(df_test["Email Type"], y_pred)
print(f'Accuracy: {test_accuracy:.3f}')

In [None]:
from sklearn.metrics import confusion_matrix


# Pin the label order explicitly so the matrix rows/columns and the tick labels
# cannot drift apart (rows are true classes, columns are predicted classes).
class_labels = ["Phishing Email", "Safe Email"]
cm = confusion_matrix(df_test["Email Type"], y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
cax = ax.matshow(cm, cmap=plt.cm.Blues)
fig.colorbar(cax)

ax.set_xticks(np.arange(len(class_labels)))
ax.set_yticks(np.arange(len(class_labels)))
ax.set_xticklabels(class_labels)
ax.set_yticklabels(class_labels)
# Label the matrix axes directly instead of relying on pyplot's notion of the
# "current" axes, which is ambiguous once the colorbar axes exists.
ax.set_xlabel('Predicciones')
ax.set_ylabel('Valores Reales')
ax.set_title('Matriz de Confusión')

# Write each count in the centre of its cell (x = column index, y = row index).
for (i, j), val in np.ndenumerate(cm):
    ax.text(j, i, val, ha='center', va='center', color='black', fontsize=12)

plt.show()

In [None]:
# Tally the misclassified test emails, grouped by the label the model predicted:
# wrong "Safe Email" predictions versus every other wrong prediction.
cont_err_safe = 0
cont_err_phishing = 0
for predicted, actual in zip(y_pred, df_test["Email Type"]):
    if predicted == actual:
        continue
    if predicted == "Safe Email":
        cont_err_safe += 1
    else:
        cont_err_phishing += 1

In [None]:
# Number of test emails wrongly predicted as "Safe Email".
print(cont_err_safe)

In [None]:
# Error rate per kind of prediction: wrong predictions of a class divided by how
# many times that class was predicted. The error counters are grouped by the
# *predicted* label, so the denominators must be too (dividing by the number of
# emails that truly belong to the class mixes two different populations).
n_pred_safe = int((y_pred == "Safe Email").sum())
n_pred_phishing = len(y_pred) - n_pred_safe

safe_percent_err = (cont_err_safe / n_pred_safe) * 100
phishing_percent_err = (cont_err_phishing / n_pred_phishing) * 100
# Overall share of misclassified test emails.
err_total = ((cont_err_safe + cont_err_phishing) / df_test.height) * 100

print(safe_percent_err)
print(phishing_percent_err)
print(err_total)

In [None]:
# Bar chart comparing the two per-class prediction error percentages.
fig, ax = plt.subplots()
ax.bar(
    ['safe prediction error', 'pishing prediction error'],
    [safe_percent_err, phishing_percent_err],
    color=['cyan', 'cyan'],
)
ax.set_title('Porcentaje de error en predicciones')

# Probando Modelo con otro dataset

In [None]:
# Load the second dataset, used only to evaluate the already-trained model.
# NOTE(review): this path differs from the training file
# "./datasets/Phishing_Email.csv" only by letter case; on a case-insensitive
# filesystem both names would resolve to the same file — confirm.
df_big_test = pl.read_csv("./datasets/phishing_email.csv")
df_big_test

In [None]:
# Translate the numeric labels into the class names used by the first dataset:
# 0 becomes "Safe Email", every other value becomes "Phishing Email".
is_safe = pl.col("label") == 0
label_names = (
    pl.when(is_safe)
    .then(pl.lit("Safe Email"))
    .otherwise(pl.lit("Phishing Email"))
)
df_big_test = df_big_test.with_columns(label_names.alias("label"))

In [None]:
# Print the relabelled second dataset and the cleaned first dataset one after
# the other to compare their columns.
print(df_big_test)
print(df)

In [None]:
#df_big_test = df_big_test.with_columns(pl.col('text_combined').map_elements(remove_links, return_dtype=pl.Utf8).alias("text_combined"))
#df_big_test = df_big_test.with_columns(pl.col("text_combined").map_elements(stemming_text, return_dtype=pl.Utf8).alias("text_combined"))

In [None]:
#df_big_test.write_csv("./datasets/phishing_email_cleaned.csv")

In [None]:
# Reuse the cached, pre-cleaned copy when it exists. Otherwise build it from
# the relabelled frame with the same cleaning steps used for the training data
# (link removal, then stemming) and write it out so later runs can skip the
# expensive pass.
CLEANED_PATH = "./datasets/phishing_email_cleaned.csv"
try:
    df_big_test = pl.read_csv(CLEANED_PATH)
except FileNotFoundError:
    df_big_test = df_big_test.with_columns(
        pl.col("text_combined")
        .map_elements(remove_links, return_dtype=pl.Utf8)
        .alias("text_combined")
    ).with_columns(
        pl.col("text_combined")
        .map_elements(stemming_text, return_dtype=pl.Utf8)
        .alias("text_combined")
    )
    df_big_test.write_csv(CLEANED_PATH)

In [None]:
# Text preview of the cleaned second dataset.
print(df_big_test)

In [None]:
# Encode the second dataset with the vocabulary learned from the first one.
x_big_test = vectorizer.transform(df_big_test["text_combined"].to_list())

In [None]:
# Predict a label for every email of the second dataset and display the result.
y_big_pred = clf.predict(x_big_test)
y_big_pred

In [None]:
# Accuracy of the model on the full second dataset.
print(accuracy_score(df_big_test["label"], y_big_pred))

In [None]:
# Confusion matrix of true labels versus predictions on the second dataset.
# No explicit `labels=` is passed, so scikit-learn orders the classes itself
# (sorted order per its documentation); later index-based lookups depend on it.
cm2 = confusion_matrix(df_big_test["label"], y_big_pred)
cm2

In [None]:
# Inspect the count stored in row 1, column 0 of the confusion matrix.
cm2[1][0]

In [None]:
# cm2 rows are true classes and columns are predicted classes.
# Assumes the class order is ["Phishing Email", "Safe Email"] (scikit-learn's
# sorted default with only these two labels present) — TODO confirm.
wrong_safe = cm2[0][1]      # truly phishing, predicted "Safe Email"
wrong_phishing = cm2[1][0]  # truly safe, predicted "Phishing Email"

# Each error count is divided by the number of predictions of that same class
# (the column total). Dividing by the number of emails that truly belong to
# the class would mix two different populations.
err_big_safe = (wrong_safe / cm2[:, 1].sum()) * 100
err_big_phishing = (wrong_phishing / cm2[:, 0].sum()) * 100
total_err_big = ((wrong_safe + wrong_phishing) / df_big_test.height) * 100

In [None]:
# Bar chart of the two per-class prediction error percentages on the second dataset.
fig, ax = plt.subplots()
ax.bar(
    ['safe prediction error', 'pishing prediction error'],
    [err_big_safe, err_big_phishing],
    color=['cyan', 'cyan'],
)

# Opiniones
Al probar con muchos más datos de los que se usaron para entrenar (unos 13 mil ejemplos de entrenamiento frente a los casi 90 mil de test), el modelo obtiene una precisión de 88.1%, por lo que podría decirse que es un buen modelo. Sin embargo, al escalarlo se queda corto debido a los preprocesamientos de datos, ya que eliminan información; además, el primer dataset es muy limitado en cuanto a información. En cambio, el segundo dataset tiene muchísima más información, como el destinatario, el campo "recibido de", los correos, etc. Al ignorar esta información y aplicar los preprocesamientos de datos, se pierde muchísima más información y el segundo dataset queda igualado a uno peor (el primero); esto también explica por qué hubo tantos errores: un 20% de falsos positivos. El segundo dataset tiene más correos "malos"; si igualáramos la cantidad de correos de cada clase, la precisión del modelo sería muchísimo menor.

# Probando igualando

In [None]:
# Build a class-balanced evaluation set: keep the same number of emails from
# each class, using the size of the smaller class instead of a hard-coded count
# so the cell keeps working if the dataset changes.
safe_all = df_big_test.filter(pl.col("label") == "Safe Email")
phishing_all = df_big_test.filter(pl.col("label") == "Phishing Email")
n_per_class = min(safe_all.height, phishing_all.height)

safe_emails = safe_all.head(n_per_class)
phishing_emails = phishing_all.head(n_per_class)
df_big_test_equal = pl.concat([safe_emails, phishing_emails])
print(df_big_test_equal.shape)

In [None]:
# Encode the balanced subset with the training vocabulary, then predict a
# label for each of its emails and display the predictions.
x_big_test2 = vectorizer.transform(df_big_test_equal["text_combined"].to_list())

y_big_pred2 = clf.predict(x_big_test2)
y_big_pred2

In [None]:
# Accuracy of the model on the class-balanced subset.
print(accuracy_score(df_big_test_equal["label"], y_big_pred2))

In [None]:
# Confusion matrix of true labels versus predictions on the balanced subset.
# No explicit `labels=` is passed, so the class order is chosen by scikit-learn
# (sorted order per its documentation).
cm3 = confusion_matrix(df_big_test_equal["label"], y_big_pred2)
cm3

In [None]:
# cm3 rows are true classes and columns are predicted classes.
# Assumes the class order is ["Phishing Email", "Safe Email"] (scikit-learn's
# sorted default with only these two labels present) — TODO confirm.
wrong_safe2 = cm3[0][1]      # truly phishing, predicted "Safe Email"
wrong_phishing2 = cm3[1][0]  # truly safe, predicted "Phishing Email"

# Each error count is divided by the number of predictions of that same class
# (the column total), so numerator and denominator describe the same group.
err_big_safe2 = (wrong_safe2 / cm3[:, 1].sum()) * 100
err_big_phishing2 = (wrong_phishing2 / cm3[:, 0].sum()) * 100
total_err_big2 = ((wrong_safe2 + wrong_phishing2) / df_big_test_equal.height) * 100

print(err_big_safe2)
print(err_big_phishing2)
print(total_err_big2)

plt.bar(['safe prediction error','pishing prediction error'], [err_big_safe2,err_big_phishing2 ], color=['cyan', 'cyan'])