# Process Proceedings
This notebook reads the per-year proceedings Markdown files (`*.md`) in the `data-proceedings` folder, extracts the metadata of every entry, and writes one row per author to `data-proceedings/authors.csv`, which is used in the subsequent steps.

Author: Sven Mayer (info@sven-mayer.com)

LICENSE: GPL-3.0 license 

In [1]:
import pandas as pd
import glob

def getEntryStart(entry):
    """Locate the "### Meta-Data" and "### Authors" sections of an entry.

    entry is a list of lines. For each of the two headings the function
    returns the index of the line directly after the heading; if a heading
    occurs several times, the last occurrence wins. A heading that is not
    present is reported as -1.

    Returns the tuple (startMetaData, startAuthors).
    """
    metaDataStart = authorsStart = -1
    # Counting from 1 yields the index of the line *after* the heading.
    for nextIndex, line in enumerate(entry, start=1):
        if line.startswith("### Meta-Data"):
            metaDataStart = nextIndex
        if line.startswith("### Authors"):
            authorsStart = nextIndex
    return metaDataStart, authorsStart

def getYear(text):
    """Return the year taken from the first line that starts with "# ".

    The year is the last space-separated token of that line, stripped of
    surrounding whitespace. An empty string is returned when no line
    starts with "# ".
    """
    heading = next((line for line in text if line.startswith("# ")), None)
    if heading is None:
        return ""
    return heading.split(" ")[-1].strip()

def getAllEntries(text):
    """Split the lines of a proceedings file into one list of lines per entry.

    Every line of text is normalised IN PLACE (newline characters removed,
    surrounding whitespace stripped), so the caller's list is modified.
    An entry starts at a line beginning with "## " and extends up to, but
    not including, the next such line (or the end of the file). Lines
    before the first "## " heading belong to no entry.

    Returns a list of entries, each entry being a list of normalised lines.
    """
    entryStarts = []
    for index, rawLine in enumerate(text):
        cleaned = rawLine.replace("\n", "").strip()
        text[index] = cleaned
        if cleaned.startswith("## "):
            entryStarts.append(index)

    # Each entry ends where the next one starts; the last one ends at EOF.
    entryEnds = entryStarts[1:] + [len(text)]
    return [text[begin:end] for begin, end in zip(entryStarts, entryEnds)]

def getTitles(entry):
    """Return the title found in the first line of entry starting with "# ".

    The title is the text after the last "# " in that line, stripped of
    surrounding whitespace. Lines starting with "## " or "### " do not
    match the "# " prefix. An empty string is returned when no line
    matches.
    """
    titleLine = next((line for line in entry if line.startswith("# ")), None)
    if titleLine is None:
        return ""
    return titleLine.rsplit("# ", 1)[-1].strip()

def parseFile(text):
    """Parse the lines of one proceedings file into a table with one row per author.

    text is the list of lines of the file; it is normalised in place by
    getAllEntries. For every entry, the lines between the "### Meta-Data"
    and "### Authors" headings are scanned for "* DOI: ", "* Type: " and
    "* Session: " values. Every line after the "### Authors" heading that
    starts with "* " is read as an author in the form
    "* name = affiliation = alpha-3 = url".

    Entries lacking one of the two headings, and author lines with fewer
    than four " = "-separated fields, are reported on stdout and skipped.

    Returns a pandas DataFrame with the columns Year, Doi, Title, Order,
    Name, Affiliation, Alpha-3, Url, Type and Session. The columns are
    present even when no author row was found.
    """
    columns = ["Year", "Doi", "Title", "Order", "Name", "Affiliation",
               "Alpha-3", "Url", "Type", "Session"]
    year = getYear(text)
    allEntries = getAllEntries(text)

    res = []
    for entry in allEntries:
        doi = ""
        entryType = ""
        entrySession = ""
        title = getTitles(entry)
        # The first line of an entry is its "## " heading; used to identify
        # the entry in error messages.
        heading = entry[0] if entry else ""
        startMetaData, startAuthors = getEntryStart(entry)
        if (startMetaData == -1) or (startAuthors == -1):
            print(f"ERROR: skipping entry {heading!r} ({year}): "
                  "missing '### Meta-Data' or '### Authors' section")
            continue

        for meta in entry[startMetaData:startAuthors]:
            if meta.startswith("* DOI: "):
                doi = meta.split("* DOI: ")[-1]
            if meta.startswith("* Type: "):
                entryType = meta.split("* Type: ")[-1]
            if meta.startswith("* Session: "):
                entrySession = meta.split("* Session: ")[-1]

        # Count author bullets only, so that blank or other non-author lines
        # in the section do not shift the 1-based author position.
        order = 0
        for author in entry[startAuthors:]:
            if not author.startswith("* "):
                continue
            order += 1
            parts = [part.strip() for part in author[2:].split(" = ")]
            if len(parts) < 4:
                print(f"ERROR: skipping malformed author line {author!r} "
                      f"in entry {heading!r} ({year}): expected "
                      "'* name = affiliation = alpha-3 = url'")
                continue
            name, affiliation, alpha3, url = parts[:4]
            res.append({"Year": year, "Doi": doi, "Title": title, "Order": order, "Name": name, "Affiliation": affiliation, "Alpha-3": alpha3, "Url": url, "Type": entryType, "Session": entrySession})

    # Passing the column list keeps the schema stable for files without rows.
    df = pd.DataFrame(res, columns=columns)
    return df


# Parse every Markdown file in ./data-proceedings into one DataFrame per file.
dfs = []
# glob returns paths in a file-system dependent order; sorting makes the
# processing order (and the order of tied rows after sorting) reproducible.
for filename in sorted(glob.glob('./data-proceedings/*.md')):
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the Markdown files are UTF-8 encoded - confirm.
    with open(filename, "r", encoding="utf-8") as f:
        text = f.readlines()
    dfs.append(parseFile(text))

# Combine all files, make Year numeric and order rows by year, DOI and
# author position before writing the author table.
df = pd.concat(dfs)
df.Year = df.Year.astype(int)
df = df.sort_values(["Year", "Doi", "Order"])

df.to_csv("./data-proceedings/authors.csv", index=False)

In [2]:
# Summary statistics of the combined author table.
def countUnique(column):
    """Return the number of distinct values in the given column of df."""
    return len(df[column].unique())

print(f"Number of entries: {len(df)}")
print(f"Number of unique DOIs: {countUnique('Doi')}")
print(f"Number of unique authors: {countUnique('Name')}")
print(f"Number of unique affiliations: {countUnique('Affiliation')}")
print(f"Number of unique countries: {countUnique('Alpha-3')}")
print(f"Number of unique types: {countUnique('Type')}")
print(f"Number of unique sessions: {countUnique('Session')}")
print(f"Years (n={countUnique('Year')}): {df['Year'].unique()}")

Number of entries: 3714
Number of unique DOIs: 958
Number of unique authors: 2559
Number of unique affiliations: 497
Number of unique countries: 52
Number of unique types: 3
Number of unique sessions: 153
Years (n=20): [2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
 2019 2020 2021 2022 2023 2024]
