In [1]:
import glob
import pandas as pd
import email
from email import policy
from src.parser.MailTextParser import parse_mime_tree
from src.parser.MailSubjectParser import parse_mime_subject
from src.parser.MailSenderParser import parse_mime_sender
from typing import Dict, List, Optional, Tuple, Union
from collections import Counter
import re
import spacy

## Übersicht

Dies ist ein Notebook, das die Beiträge einer Mailingliste (Yggdrasill) strukturiert lädt und auswertet.

Folgende Schritte sind derzeit integriert:

### Laden und Transformation

Todo.

### Analyse

1. Betreffszeilen-Analyse
2. Autoren-Analyse
3. Text-Analyse
4. Beitragshäufigkeiten-Analyse

### Visualisierungen

1. Beiträge/Jahr
2. Beiträge/Monat (in einem bestimmten Jahr)

## Hilfsfunktionen


In [2]:
# Maps the zero-padded month number (as used in the data folder names
# "data/<year>/yggdrasill_<year>-<month>/") to the three-letter English
# abbreviation that is used as tick label in the monthly plot.
MONTH_MAPPING = {
    "01": "JAN",
    "02": "FEB",
    "03": "MAR",
    "04": "APR",
    "05": "MAY",  # was "MAI" (German) while all other abbreviations are English
    "06": "JUN",
    "07": "JUL",
    "08": "AUG",
    "09": "SEP",
    "10": "OCT",
    "11": "NOV",
    "12": "DEC"
}

In [3]:
def guess_encoding(file_path, encodings=("utf-8", "windows-1252", "iso-8859-1")):
    '''Return the first candidate encoding that decodes the complete file.

        Parameters
        ----------
            file_path: str
                Path of the file to probe.
            encodings: sequence of str
                Candidate encodings, tried in the given order. "iso-8859-1"
                accepts every possible byte sequence, so it has to be the
                last candidate; anything listed after it is never reached.

        Returns
        -------
            The name of the first encoding that decoded the file without a
            UnicodeDecodeError, or None (after printing a hint) if no
            candidate worked.
    '''
    for enc in encodings:
        try:
            with open(file_path, encoding=enc) as file:
                # decoding the whole content is the actual check
                file.read()
            return enc
        except UnicodeDecodeError:
            continue
    print(f"Could not open {file_path}. Maybe wrong encoding?")
    return None

## Hauptfunktionen

In [4]:
def get_emails_from_folder(year_range: Tuple[int,int]) -> Dict[int, Dict[str, Dict[int, email.message.Message]]]:
    '''Get all emails from email folders stored under data/.

        The files are expected under "data/<year>/yggdrasill_<year>-<month>/",
        where <month> is one of the keys of MONTH_MAPPING.

        Parameters
        ----------
            year_range: Tuple(int,int)
                The year range, where starting year is _inclusive_ and end year is _exclusive_.

        Returns
        -------
            (Dict)

            {
                YEAR: {
                    MONTH: {
                        0: <email.message.Message>
                        ...
                    }
                    ...
                }
            }

            The integer key is the position of the file in the
            alphabetically sorted folder listing.
    '''
    emails = dict()
    counter = 0
    for year in range(year_range[0], year_range[1]):
        emails[year] = dict()
        for month in MONTH_MAPPING:
            emails[year][month] = dict()
            # glob.glob() returns paths in arbitrary order; sorting makes the
            # index assigned to each mail reproducible between runs
            files = sorted(glob.glob(f"data/{year}/yggdrasill_{year}-{month}/*"))
            for idx, file in enumerate(files):
                # guess encoding first, since there are quite a few non-UTF-8 mails
                enc = guess_encoding(file)
                if enc is None:
                    # guess_encoding() has already printed the file name; do not
                    # fall back to the locale default encoding silently
                    continue
                with open(file, "r", encoding=enc) as f:
                    # the policy-default is very important for subject titles
                    message = email.message_from_file(f, policy=policy.default)
                emails[year][month][idx] = message
                # a message without any header was most likely not parsed correctly
                if not message.keys():
                    print("Issue with mail!")
                    print(file)
                counter += 1
    print(f"Received {counter} emails from {year_range[0]} to {year_range[1]-1}.")
    return emails

In [5]:
#emails_all = get_emails_from_folder((1997,2024))

In [6]:
#emails_2023 = get_emails_from_folder((2023,2024))

## Speichern & Laden der E-Mails

In [7]:
### save 
import pickle

#with open("storage_all.pkl", "wb") as f:
 #   pickle.dump(emails_all, f)

#with open("storage_2023.pkl", "wb") as f:
 #   pickle.dump(emails_2023, f)

In [8]:
# Load the previously pickled e-mail collections from disk.
# NOTE(security): pickle.load() can execute arbitrary code while unpickling,
# so only load storage files that were created locally with this notebook.
# NOTE(review): the files are presumably the dumps written by the
# commented-out pickle.dump() cell (output of get_emails_from_folder()) - confirm.
import pickle

# complete archive
with open("storage_all.pkl", "rb") as f:
    emails_all = pickle.load(f)

# only the mails of 2023
with open("storage_2023.pkl", "rb") as f:
    emails_2023 = pickle.load(f)

## Datenerzeugung aus E-Mails

In [9]:
def get_content_from_emails(email_dict: Dict[int, Dict[str, Dict[int, email.message.Message]]]) -> Dict:
    '''Build a nested dict with text, subject and sender of every email.

        Parameters
        ----------
            email_dict: Dict[int, Dict[str, Dict[int, email.message.Message]]]
                Emails grouped by year and month, as returned by
                get_emails_from_folder().

        Returns
        -------
            (Dict) with the same year/month/index nesting as the input. Every
            email is replaced by a dict with the keys "text", "subject" and
            "sender". In addition, each month dict, each year dict and the
            top-level dict carry a "counter" key holding the number of emails
            below that level:

            {
                YEAR: {
                    MONTH: {
                        IDX: {"text": ..., "subject": ..., "sender": ...},
                        ...
                        "counter": <INT>
                    },
                    ...
                    "counter": <INT>
                },
                "counter": <INT>
            }
    '''
    extracted = {}
    total = 0
    for year, months in email_dict.items():
        year_bucket = extracted[year] = {}
        year_total = 0
        for month, mails in months.items():
            month_bucket = year_bucket[month] = {}
            month_total = 0
            for idx, mail in mails.items():
                # same parser call order as before: text, sender, subject
                body = parse_mime_tree(mail)
                author = parse_mime_sender(mail)
                title = parse_mime_subject(mail)
                month_bucket[idx] = {"text": body, "subject": title, "sender": author}
                month_total += 1
            # the counters are stored after the mails of the respective level
            month_bucket["counter"] = month_total
            year_total += month_total
        year_bucket["counter"] = year_total
        total += year_total
    extracted["counter"] = total
    return extracted

In [10]:
# extract text, subject and sender for every mail in emails_all
content = get_content_from_emails(emails_all)

  return BeautifulSoup(content, "html.parser").get_text() # removing html tags and noise
  return BeautifulSoup(content, "html.parser").get_text() # removing html tags and noise


In [11]:
# extract text, subject and sender for every mail in emails_2023
content_2023 = get_content_from_emails(emails_2023)

## Analyse

### Betreffszeilen-Analyse

In [12]:
def clean_subject(subject: str) -> str:
    '''Normalize an email subject line.

        Reply/forward markers ("re:", "aw:", "fwd:", "fw:") and the list tags
        "[ygg]", "[dolmen]" and "[candide]" are replaced by a blank
        (case-insensitively). Afterwards runs of two or more whitespace
        characters are collapsed into a single blank and the result is
        stripped and lower-cased.
    '''
    noise = re.compile(r"^re:\s+\[ygg\]|^fwd:\s+\[ygg\]|^aw:\s+\[ygg\]|^re:\s+fwd:\s+\[ygg\]|^\[ygg\]|\baw:|\bfwd:|\bfw:|\bre:|\[dolmen\]|\[candide\]", re.IGNORECASE)
    without_noise = noise.sub(" ", subject)
    single_spaced = re.sub(r"\s{2,}", " ", without_noise)
    return single_spaced.strip().lower()

In [13]:
## get titles related to conferences and cfps

def get_specific_subjects(emails_in, pattern) -> Dict[int, Dict[str,Union[List,int]]]:
    '''Collect and count the subjects that match a regular expression.

        The pattern is applied with re.match() to the cleaned subject (see
        clean_subject()), i.e. it only hits when the cleaned, lower-cased
        subject *starts* with a match. Missing or empty subjects are treated
        as the empty string.

        Parameters
        ----------
            emails_in: Dict[]
                The nested content dict from get_content_from_emails().
            pattern: re
                The RE to match.

        Returns
        -------
            Dictionary:
            {
                "counter": int,          # matches over all years
                <YEAR>: {
                    "titles": List[str], # the original (uncleaned) subjects
                    "counter": int       # matches in this year
                }
            }
    '''
    result = {"counter": 0}
    for year, months in emails_in.items():
        # the top-level "counter" entry is not a year
        if type(year) is not int:
            continue
        year_stats = {"titles": [], "counter": 0}
        result[year] = year_stats
        for month, messages in months.items():
            if month == "counter":
                continue
            for key, msg in messages.items():
                if key == "counter":
                    continue
                raw_subject = msg["subject"]
                cleaned = clean_subject(raw_subject) if raw_subject else ""
                if not re.match(pattern, cleaned):
                    continue
                year_stats["titles"].append(raw_subject)
                year_stats["counter"] += 1
                result["counter"] += 1
    return result
                    
                

In [14]:
## count how often each (cleaned) subject line occurs

def get_top_subjects(emails_in) -> Counter:
    '''Count how often every cleaned subject occurs.

        Parameters
        ----------
            emails_in: Dict[]
                The nested content dict from get_content_from_emails().

        Returns
        -------
            Counter: frequency of the subjects after clean_subject(). Mails
            with a missing or empty subject are not part of the result.
    '''
    frequencies = Counter()
    for year, months in emails_in.items():
        # the top-level "counter" entry is not a year
        if type(year) is not int:
            continue
        for month, messages in months.items():
            if month == "counter":
                continue
            for key, msg in messages.items():
                if key == "counter":
                    continue
                raw_subject = msg["subject"]
                cleaned = clean_subject(raw_subject) if raw_subject else ""
                frequencies[cleaned] += 1

    # drop the bucket of empty subject lines, if there is one
    frequencies.pop("", None)
    return frequencies
                    
                

In [15]:
# Subject patterns for calls for papers/conferences and for announcements.
# The alternatives are lower-case because clean_subject() lower-cases the
# subject before matching. get_specific_subjects() applies them with
# re.match(), so only subjects that *start* with an alternative are counted.
CFP_RE_PATTERN = re.compile(r"cfa|cfp|konferenz|conference|call for paper(s)?|tagung")
ANNOUNCEMENTS_RE_PATTERN = re.compile(r"ankündigung|publikation|publication|bekanntmachung|announcement")

# add content dict to analyze here
CONTENT = content_2023

# calc stats for topics
cfp_subjects_dict = get_specific_subjects(CONTENT, CFP_RE_PATTERN)
ann_subjects_dict = get_specific_subjects(CONTENT, ANNOUNCEMENTS_RE_PATTERN)
top_subjects = get_top_subjects(CONTENT)
# share of both categories relative to all messages in CONTENT
# (raises ZeroDivisionError if CONTENT["counter"] is 0)
print(f"Von {CONTENT['counter']:,} Nachrichten waren {cfp_subjects_dict['counter']} ({(cfp_subjects_dict['counter']/ CONTENT['counter'])*100:.2f}%) Call for Papers und {ann_subjects_dict['counter']} ({(ann_subjects_dict['counter']/ CONTENT['counter'])*100:.2f}%) Ankündigungen (Bücher, Veranstaltungen usw.).")

Von 647 Nachrichten waren 36 (5.56%) Call for Papers und 12 (1.85%) Ankündigungen (Bücher, Veranstaltungen usw.).


In [16]:
# the ten most frequent cleaned subject lines in CONTENT
top_subjects.most_common(10)

[('was? schon drei nicht-theologische professuren für religionswissenschaft in deutschland?',
  32),
 ('religionswissenschaftliche lehre und ki', 30),
 ('déjà-vu: zeugen jehovas und sektendebatte', 19),
 ('religiös motivierter terrorismus?', 11),
 ('neueste erkenntnisse aus der religionswissenschaft', 11),
 ('ditib (korrektur)', 10),
 ('boga njet! ausstellung in leipzig', 9),
 ('king charles iii', 9),
 ('helmuth von glasenapp', 9),
 ('maßstäbe außerhalb der wissenschaftlichkeit - ehem.: was? schon drei nicht-theologische professuren für religionswissenschaft in deutschland?',
  7)]

### Text-Analyse
Dieser Teil konzentriert sich auf die Analyse des Nachrichteninhalts einer Auswahl von Nachrichten. Mit der Methode `get_messages_from_specific_subjects()` kann erst eine Auswahl an E-Mails basierend auf einem RE.match der Überschriften erstellt werden. Die Daten für die ausgewählten Mails umfassen:

1. Eine lemmatisierte Version des Textes (in Kleinbuchstaben gesetzt, nur alphabetische Wörter).
2. Eine Liste mit NEs (*named entities*) und den NE-Kategorien, die in den Nachrichten vorkommen.

Die Analyse besteht in einem BOW der lemmatisierten Nachrichten via `bow()` und einer Auswertung der NEs via `ner_analysis()`.

Leider sind diese Analysen noch sehr schwierig auszuwerten. So kommen viele Orte, Namen und Phrasen ("Wenn Sie keine Mails mehr erhalten möchten...", "Yggdrasill" etc.) nur so häufig vor, weil sich der Inhalt von E-Mails durch Zitate in anderen E-Mails wiederholt oder weil Orte bzw. Institutionen wie "Universität Graz" von beteiligten Personen in einer Signatur unter den Beiträgen verwendet werden. Diese sagen aber noch nichts über das eigentliche Thema aus.


In [37]:
def clean_text(text: Optional[str], nlp) -> Tuple[str, List[Tuple[str, str]]]:
    '''Lemmatize an email message with spaCy and collect its named entities.

        Parameters
        ----------
            text: str
                The message from the email. None or an empty string yields an
                empty result without calling the pipeline.
            nlp: spaCy pipeline
                The spaCy pipeline for the text processing.

        Returns
        -------
            Tuple (clean_text, ner):
                clean_text: str
                    The lower-cased lemmata of all alphabetic, non-stop-word
                    tokens, joined by single blanks.
                ner: List[Tuple[str, str]]
                    One (lower-cased entity text, entity label) pair per
                    named entity found by the pipeline.
    '''
    # mails without a text part must not crash the pipeline
    if not text:
        return ("", [])
    doc = nlp(text)
    # lemmata
    lemmata = [token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop]
    # ners
    entities = [(ent.text.lower(), ent.label_) for ent in doc.ents]
    return (" ".join(lemmata), entities)

In [51]:
def get_messages_from_specific_subjects(emails_in, pattern) -> Dict[int, Dict[str,Union[List,int]]]:
    '''Collect the processed messages of all emails whose subject matches a RE.

        The pattern is applied with re.match() to the cleaned subject (see
        clean_subject()), so it has to match at the start of the cleaned
        subject. The text of every selected mail is run through clean_text()
        with the spaCy model "de_core_news_md", which is loaded once per call.

        Parameters
        ----------
            emails_in: Dict[]
                The nested content dict from get_content_from_emails().
            pattern: re
                The RE to match.

        Returns
        -------
            Dictionary:
            {
                "counter": int,                 # selected mails over all years
                <YEAR>: {
                    "messages": List[str],      # lemmatized texts
                    "counter": int,             # selected mails in this year
                    "ner": List[Tuple[str,str]] # (entity text, label) pairs
                }
            }
    '''
    selection = {"counter": 0}
    # one pipeline instance is shared by all clean_text() calls
    pipeline = spacy.load("de_core_news_md")
    for year, months in emails_in.items():
        # the top-level "counter" entry is not a year
        if type(year) is not int:
            continue
        per_year = {"messages": [], "counter": 0, "ner": []}
        selection[year] = per_year
        for month, messages in months.items():
            if month == "counter":
                continue
            for key, msg in messages.items():
                if key == "counter":
                    continue
                raw_subject = msg["subject"]
                cleaned = clean_subject(raw_subject) if raw_subject else ""
                if re.match(pattern, cleaned) is None:
                    continue
                lemmatized, entities = clean_text(msg["text"], pipeline)
                per_year["messages"].append(lemmatized)
                per_year["ner"].extend(entities)
                per_year["counter"] += 1
                selection["counter"] += 1

    return selection

In [52]:
def bow(messages_dict: dict) -> Counter:
    '''Build a bag-of-words Counter from the "messages" lists of all years.

        Only entries with an int key (the years) are evaluated; every message
        is split on whitespace and each resulting word is counted.
    '''
    word_counts = Counter()
    for year, data in messages_dict.items():
        if type(year) is int and "messages" in data:
            for message in data["messages"]:
                word_counts.update(message.split())
    return word_counts
    

In [58]:
def ner_analysis(messages_dict: dict) -> Tuple[Counter]:
    '''Count the named entities collected by get_messages_from_specific_subjects().

        Only entries with an int key (the years) are evaluated. Returns a pair
        of Counters: the first counts the entity texts, the second counts the
        entity labels found in the "ner" lists.
    '''
    entity_counts = Counter()
    label_counts = Counter()
    for year, data in messages_dict.items():
        if type(year) is not int:
            continue
        for entity in data.get("ner", ()):
            entity_counts[entity[0]] += 1
            label_counts[entity[1]] += 1
    return (entity_counts, label_counts)

In [54]:
# select all mails in CONTENT whose cleaned subject starts with the given
# thread title and collect their lemmatized texts and named entities
spec_sub_dict = get_messages_from_specific_subjects(CONTENT, re.compile(r"religionswissenschaftliche lehre und ki"))

In [55]:
#bow(spec_sub_dict)

In [60]:
#ner_analysis(spec_sub_dict)

### Autoren-Analyse

In [None]:
def clean_sender(sender: str) -> str:
    '''Reduce an email From field to a normalized sender identifier.

        Returns the first email address found in the field, lower-cased. If
        that address is the address of the mailing list itself, the part of
        the field in front of the first "<" or "(" is used instead: it is
        lower-cased, every run of non-word characters is replaced by a blank
        and the result is stripped. An empty string is returned when nothing
        usable is found.
    '''
    # TLD class is [A-Za-z]; the former [A-Z|a-z] also accepted a literal "|"
    address_pattern = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", re.IGNORECASE)
    address = address_pattern.search(sender)
    sender_clean = address.group(0).strip().lower() if address else ""
    if sender_clean == "yggdrasill@lists.uni-marburg.de":
        # if the sender was the mailing list, try and find the name of the sender instead of email
        name_part = re.search(r'^.*?[(<]', sender)
        sender_clean = name_part.group(0).strip().lower() if name_part else ""
        sender_clean = re.sub(r"\W+", " ", sender_clean)
        sender_clean = re.sub(r"\s{2,}", " ", sender_clean)
    return sender_clean.strip()

In [None]:
def get_top_sender(emails_in) -> Counter:
    '''Count how many mails every (cleaned) sender has written.

        Parameters
        ----------
            emails_in: Dict[]
                The nested content dict from get_content_from_emails().

        Returns
        -------
            Counter: frequency of the senders after clean_sender(). Mails with
            a missing or empty sender are counted under the empty string.
    '''
    frequencies = Counter()
    for year, months in emails_in.items():
        # the top-level "counter" entry is not a year
        if type(year) is not int:
            continue
        for month, messages in months.items():
            if month == "counter":
                continue
            for key, msg in messages.items():
                if key == "counter":
                    continue
                raw_sender = msg["sender"]
                cleaned = clean_sender(raw_sender) if raw_sender else ""
                frequencies[cleaned] += 1
    return frequencies
                    
                

In [None]:
# frequency of every cleaned sender in content_2023
sender_counter = get_top_sender(content_2023)

In [None]:
# the ten most active senders (NOTE: the output contains personal data)
sender_counter.most_common(10)

In [61]:
# TODO: via OpenAI API hier eine Klassifizierung durchführen, ob es sich um Männer/Frauen handelt?

### Visualisierungen

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Global seaborn styling; set once here, before the plot functions are called
sns.set_style("whitegrid")  # Set the background style
sns.set_palette("viridis")  # Set the color palette

def plot_posts_per_year(emails):
    '''Plot the number of posts per year and save the figure as "yearly_stats.png".

        Parameters
        ----------
            emails: Dict[]
                The nested content dict from get_content_from_emails(). Only
                entries with an int key (the years) are plotted; their
                "counter" value is used as number of posts.
    '''
    rows = [(str(year), data["counter"]) for year, data in emails.items() if type(year) == int]
    df = pd.DataFrame(rows, columns=["Year", "Posts"])
    # draw on a dedicated figure instead of the implicit current pyplot
    # figure, so that repeated calls never overlay each other
    fig, ax = plt.subplots()
    sns.lineplot(x=df["Year"], y=df["Posts"], marker='o', markersize=8, linestyle='-', linewidth=2, ax=ax)
    ax.set_title("Posts per Year", fontsize=16)
    ax.set_xlabel("Year", fontsize=13)
    ax.tick_params(axis="x", labelrotation=45)
    ax.set_ylabel("Posts", fontsize=13)
    fig.tight_layout()
    fig.savefig("yearly_stats.png")
    

In [None]:
# yearly post counts for the content dict built from emails_all
plot_posts_per_year(content)

In [None]:
def plot_posts_in_year(emails, year: int):
    '''Plot the number of posts per month of one year and save the figure
    as "posts_in_<year>.png".

        Parameters
        ----------
            emails: Dict[]
                The nested content dict from get_content_from_emails().
            year: int
                The year to plot; it has to be a key of emails, otherwise a
                KeyError is raised.
    '''
    # the "counter" entry of the year is the yearly total, not a month
    rows = [
        (MONTH_MAPPING[month], data["counter"])
        for month, data in emails[year].items()
        if month != "counter"
    ]
    df = pd.DataFrame(rows, columns=["Month", "Posts"])
    # draw on a dedicated figure instead of the implicit current pyplot
    # figure, so that repeated calls never overlay each other
    fig, ax = plt.subplots()
    sns.lineplot(x=df["Month"], y=df["Posts"], marker='o', markersize=8, linestyle='-', linewidth=2, ax=ax)
    ax.set_title(f"Posts in {year}", fontsize=16)
    ax.set_xlabel("Month", fontsize=13)
    ax.tick_params(axis="x", labelrotation=45)
    ax.set_ylabel("Posts", fontsize=13)
    fig.tight_layout()
    fig.savefig(f"posts_in_{year}.png")
    

In [None]:
# monthly post counts of the year 2023
plot_posts_in_year(content,2023)