In [1]:
# Regular expressions, used for URL detection
import re

In [2]:
# Read the text of a file (here: the text of an "email")
def read_text_from_file(file_path):
    """Return the complete contents of the text file at ``file_path``.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path: Path of the file to read; handed to ``open`` unchanged.

    Returns:
        The whole file content as one string.
    """
    # The context manager closes the handle on exit, also when reading fails
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Count the number of URLs in a text with regex
def count_urls(email_text):
    """Find the http(s) URLs contained in ``email_text``.

    A URL starts at ``http://`` or ``https://`` (scheme matched
    case-insensitively) and extends up to the next whitespace, ``<``, ``>``
    or ``"``. Punctuation that directly follows a URL in running text is not
    part of it, so trailing ``. , ; : ! ? ' ] }`` are trimmed, as is a
    closing ``)`` that has no matching ``(`` inside the URL.

    Args:
        email_text: Text to scan.

    Returns:
        A ``(count, urls)`` tuple: the number of URLs found and the list of
        URL strings in order of appearance (duplicates are kept).
    """
    trailing_punctuation = '.,;:!?\')]}'
    urls = []

    for candidate in re.findall(r'https?://[^\s<>"]+', email_text, flags=re.IGNORECASE):
        while candidate and candidate[-1] in trailing_punctuation:
            # A ')' that closes a '(' inside the URL belongs to the URL,
            # e.g. https://en.wikipedia.org/wiki/Phishing_(Begriff)
            if candidate[-1] == ')' and candidate.count('(') >= candidate.count(')'):
                break
            candidate = candidate[:-1]

        # Skip matches that consist of nothing but the scheme after trimming
        if candidate.partition('://')[2]:
            urls.append(candidate)

    return len(urls), urls

# Count phishing-related keywords in a text
def count_phishing_keywords(email_text, keywords):
    """Count case-insensitive occurrences of each keyword in ``email_text``.

    Matching is plain substring matching on the lower-cased text, so a
    keyword is also counted when it appears inside a longer word. Both the
    text and the keyword are lower-cased before comparing; the returned
    dictionary keeps each keyword exactly as the caller spelled it.

    Args:
        email_text: Text to search.
        keywords: Iterable of keyword strings.

    Returns:
        Dictionary mapping each keyword to its number of non-overlapping
        occurrences. An empty keyword is reported as 0.
    """
    email_text_lower = email_text.lower()
    # Dictionary for keyword-count
    keyword_count = {}

    for word in keywords:
        # Normalise the keyword as well, otherwise 'Passwort' could never
        # match the lower-cased text
        needle = word.lower()
        # str.count('') returns len(text) + 1, which is not a meaningful count
        keyword_count[word] = email_text_lower.count(needle) if needle else 0

    return keyword_count
    
# Calculate the length of a text (in characters)
def get_email_length(email_text):
    """Return the length of ``email_text`` as reported by ``len``.

    For a string this is the number of characters, whitespace and line
    breaks included.
    """
    length = len(email_text)
    return length

# Count special characters in a text
def count_special_chars(email_text, chars):
    """Count how often each entry of ``chars`` occurs in ``email_text``.

    Args:
        email_text: Text to search (case-sensitive, no normalisation).
        chars: Iterable of strings to count.

    Returns:
        Dictionary mapping each entry of ``chars`` to the result of
        ``email_text.count(entry)``.
    """
    occurrences = {}
    for symbol in chars:
        occurrences[symbol] = email_text.count(symbol)
    return occurrences

In [3]:
# Location of the sample email text (a relative path)
file_path = '../data/intro_mailtext.txt'

# Terms to look for in the email text
phishing_keywords = [
    'passwort',
    'account',
    'konto',
    'schützen',
    'dringend',
]

# Characters whose occurrences are counted
special_chars = [
    '!',
    '$',
    '%',
    '=',
]

In [4]:
# Load the email text from file_path and print it in full
email_text = read_text_from_file(file_path)
print(f'\n### Email Text: {email_text}')

# URLs: number of matches plus the matched strings themselves
url_count, urls = count_urls(email_text)
print(f'\n### Number of URLs: {url_count}, URLs: {urls}')

# Occurrences of each entry of phishing_keywords in the text
keyword_count = count_phishing_keywords(email_text, phishing_keywords)
print(f'\n### Found keywords: {keyword_count}')

# Total length of the text in characters
email_length = get_email_length(email_text)
print(f'\n### Length of the email: {email_length}')

# Occurrences of each entry of special_chars in the text
special_chars_count = count_special_chars(email_text, special_chars)
print(f'\n### Found special characters: {special_chars_count}')


### Email Text: From: fake@phishingsite.com
To: victim@domain.com
Subject: Dringend: Ihr Konto wurde gesperrt!

Sehr geehrter Kunde,

Wir haben verdächtige Aktivitäten in Ihrem Konto festgestellt. Bitte klicken Sie auf den folgenden Link, um Ihr Passwort zu ändern und Ihr Konto zu schützen:
http://phishingsite.com/recovery
https://u46701922.ct.sendgrid.net/ls/click?upn=u001.bO3G-2FaeMIVe5lSdMnGjJWgTE6W-2FKKheAiBI-3D

Mit freundlichen Grüßen,
Ihr Sicherheitsteam

### Number of URLs: 2, URLs: ['http://phishingsite.com/recovery', 'https://u46701922.ct.sendgrid.net/ls/click?upn=u001.bO3G-2FaeMIVe5lSdMnGjJWgTE6W-2FKKheAiBI-3D']

### Found keywords: {'passwort': 1, 'account': 0, 'konto': 3, 'schützen': 1, 'dringend': 1}

### Length of the email: 449

### Found special characters: {'!': 1, '$': 0, '%': 0, '=': 1}
