In [None]:
import ast
import re

import numpy as np
import pandas as pd

In [None]:
# Load the raw product table from disk.
df = pd.read_csv(filepath_or_buffer="product_data.csv")
def extract_info(row):
    """Extract selected labelled fields from a row's stringified pair lists.

    ``row['base_info']`` and ``row['table_contents']`` must each contain the
    text of a Python literal that iterates as ``(label, value)`` pairs.

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide the keys ``'base_info'`` and ``'table_contents'``.

    Returns
    -------
    pd.Series
        Entries ``Katalog``, ``Erhaltung``, ``Provinz``, ``Wert``, ``Periode``
        and ``Ort`` (in that order). A label that is absent yields ``''``.

    Raises
    ------
    ValueError, SyntaxError
        If either field is not a valid Python literal.
    """
    def first_value(pairs, label):
        # First value whose label matches exactly, or '' when the label is absent.
        return next((value for key, value in pairs if key == label), '')

    # The text comes from a CSV file, so parse it as a literal only:
    # literal_eval never executes code, unlike eval().
    base = ast.literal_eval(row['base_info'])
    table = ast.literal_eval(row['table_contents'])

    return pd.Series({
        'Katalog': first_value(base, 'Katalog-Nr.:'),
        'Erhaltung': first_value(table, 'Erhaltung:'),
        'Provinz': first_value(table, 'Provinz:'),
        'Wert': first_value(table, 'Wert:'),
        'Periode': first_value(table, 'Periode:'),
        'Ort': first_value(table, 'Ort:'),
    })

# Expand the parsed fields into their own columns, then drop the raw stringified inputs.
df = df.join(df.apply(extract_info, axis=1)).drop(columns=['base_info', 'table_contents'])
df = df[['Ort', 'price', 'Provinz', 'Wert', 'Periode', 'Erhaltung', 'Katalog', 'url', 'title']]

# Pull the first dd.mm.yy date out of the title.
df['date'] = df['title'].str.extract(r'(\d{2}\.\d{2}\.\d{2})', expand=False)
# Rows without a date fall back to 'oD' when the title contains that marker, else 'NoDate'.
# Series.fillna rejects a bare ndarray, so the np.where result is wrapped in an
# index-aligned Series; na=False keeps missing titles from counting as a match.
date_fallback = pd.Series(
    np.where(df['title'].str.contains('oD', na=False), 'oD', 'NoDate'),
    index=df.index,
)
df['date'] = df['date'].fillna(date_fallback)
# Remove the extracted date (or the 'oD' marker) from the title text.
df['title'] = df.apply(lambda r: r['title'].replace(r['date'], ''), axis=1)

df.to_csv("NotgeldData.csv", index=False)

In [None]:
df = pd.read_csv('NotgeldData.csv')
df.columns = df.columns.str.title()
df = df.drop(columns=['Katalog', 'Url', 'Date'])

# Clean Title based on Ort: when the text before the first comma is a known Ort,
# keep only the text after that comma.
# Unpacking the split DataFrame directly would yield its column labels (0 and 1),
# not the columns, so select them explicitly. reindex guarantees both columns
# exist even when no title contains a comma.
title_parts = (
    df['Title'].str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
    .fillna('')
)
first, second = title_parts[0], title_parts[1]
df.loc[first.isin(df['Ort']), 'Title'] = second

# Determine Issuer from Title
def get_issuer(title):
    """Return the text before the first ', ' in ``title`` if it has no digit.

    Returns ``None`` when that leading segment contains any digit character.
    """
    head, _, _ = title.partition(', ')
    for ch in head:
        if ch.isdigit():
            return None
    return head

# Issuer is the leading title segment (when it has no digits); then strip it from the title.
df['Issuer'] = df['Title'].apply(get_issuer).str.lstrip().fillna('NoIssuerInfo')
df['Title'] = df.apply(lambda r: r['Title'].replace(r['Issuer'], ''), axis=1)
# Strip the denomination text from the title. A missing Wert must be skipped:
# str(NaN) is 'nan', which would otherwise be deleted from inside words in the title.
df['Title'] = df.apply(
    lambda r: r['Title'] if pd.isnull(r['Wert']) else r['Title'].replace(str(r['Wert']), ''),
    axis=1,
)

# Extract Date from Title
# NOTE(review): the cell that writes NotgeldData.csv already removes dd.mm.yy dates
# and the 'oD' marker from the title, and the reloaded Date column is dropped above,
# so only dates missed there (e.g. single-digit day/month) can match here.
# Confirm that discarding the earlier dates is intended.
pattern = r'\b(\d{1,2}\.\d{1,2}\.\d{2})\b'
df['Date'] = df['Title'].str.extract(pattern, expand=False)
# Undated rows become 'oD' when the title carries that marker, otherwise 'NoDate'.
df['Date'] = df.apply(lambda r: "oD" if pd.isnull(r['Date']) and "oD" in r['Title'] else r['Date'], axis=1).fillna("NoDate")
# Remove the date (or marker) from the title; replace is a no-op when it is absent.
df['Title'] = df.apply(lambda r: r['Title'].replace(r['Date'], ''), axis=1)

df = df[['Ort', 'Provinz', 'Wert', 'Date', 'Periode', 'Price', 'Erhaltung', 'Issuer', 'Title']]
df.to_excel('Notgeld_V2.xlsx', index=False)

In [None]:
# Delete the spaces in the Wert column.
df['Wert'] = df['Wert'].str.replace(' ', '', regex=False)

# Expand unit abbreviations into runs of zeros, applied in this order:
# Mio -> 6 zeros, Mrd -> 9, Bio -> 12, T -> 15, then Mk -> 3 zeros and Pf -> 1 zero.
wert_replacements = [
    ('Mio', '000000'),
    ('Mrd', '000000000'),
    ('Bio', '000000000000'),
    ('T', '000000000000000'),
    ('Mk', '000'),
    ('Pf', '0'),
]
for abbreviation, zeros in wert_replacements:
    df['Wert'] = df['Wert'].str.replace(abbreviation, zeros, regex=False)

# How many rows still contain a letter? A bare expression in the middle of a cell is
# discarded, so print the count. na=False treats a missing Wert as "no letter", which
# keeps the mask boolean so it can be negated below (such rows are kept).
has_letter = df['Wert'].str.contains('[a-zA-Z]', na=False)
print(f"Rows with letters remaining in Wert: {has_letter.sum()}")

# Delete the rows that still contain a letter.
df = df[~has_letter]
# Save the data.
df.to_csv('Analysis.csv', index=False)