# Lösung Lab 07: Datenaufbereitung & Aggregation

### Setup: Daten generieren
Wir erstellen `transactions_dirty.csv` (mit fehlenden Werten) und `customers.csv`.

In [None]:
import pandas as pd
import numpy as np

# 1. Transactions (Dirty)
# Ten daily bookings starting 2023-01-01. Two rows carry no account_id and
# two rows carry no amount, so the cleaning steps have something to repair.
data_tx = dict(
    date=pd.date_range(start="2023-01-01", periods=10, freq="D"),
    account_id=[101, 102, 101, np.nan, 103, 102, 104, 101, 103, np.nan],
    type=[
        "deposit", "withdrawal", "payment", "deposit", "withdrawal",
        "deposit", "payment", "withdrawal", "deposit", "payment",
    ],
    amount=[1000.0, 50.0, np.nan, 200.0, 3000.0, 500.0, 25.50, np.nan, 4000.0, 100.0],
)
df_tx = pd.DataFrame(data_tx)
# index=False keeps the positional index out of the file.
df_tx.to_csv("transactions_dirty.csv", index=False)

# 2. Customers (master data)
# Account 104 is deliberately absent: it appears in the transactions above,
# so a left join against this table yields NaN for its name and region.
data_cust = dict(
    account_id=[101, 102, 103],
    name=["Alice", "Bob", "Charlie"],
    region=["EU", "US", "EU"],
)
df_cust = pd.DataFrame(data_cust)
df_cust.to_csv("customers.csv", index=False)

print("Setup complete. Files created.")

### Basis Aufgabe

In [None]:
# 1. Load & Clean
df = pd.read_csv("transactions_dirty.csv")

print(f"Original Length: {len(df)}")
print("Missing Values:\n", df.isna().sum())

# Strategy 1: Remove rows where account_id is missing
# (.copy() gives an independent frame so the assignments below do not
# trigger SettingWithCopyWarning)
df_clean = df.dropna(subset=['account_id']).copy()

# Strategy 2: Impute missing amounts with 0.0
df_clean['amount'] = df_clean['amount'].fillna(0.0)

# Fix Data Types (ID should be int, not float due to previous NaN)
df_clean['account_id'] = df_clean['account_id'].astype(int)

print(f"Cleaned Length: {len(df_clean)}")

# 2. Merge (Left Join)
customers = pd.read_csv("customers.csv")

# how='left' keeps every cleaned transaction, even when the account has no
# matching row in customers.csv.
merged_df = pd.merge(
    df_clean,
    customers,
    on='account_id',
    how='left'
)

print("\n--- Merged Data (Head) ---")
# Accounts without a customer record (104 in the setup data) get NaN for
# name/region; with the setup data that row is not among the first five
# rows printed here.
print(merged_df.head())

# 3. Aggregation
print("\n--- Regional Stats ---")
# groupby() drops NaN keys by default, which would silently remove the
# unmatched transactions that the left join just preserved. dropna=False
# reports them as their own NaN group instead.
regional_stats = merged_df.groupby('region', dropna=False)['amount'].agg(['sum', 'count'])
print(regional_stats)

### Bonus Herausforderung

In [None]:
# 1. Custom Logic (Apply)
def classify_risk(row):
    """Label a transaction row by the size of its amount.

    Args:
        row: Any mapping-like object supporting ``row['amount']``
            (e.g. a DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        'High Risk' when the amount is strictly greater than 2000,
        otherwise 'Standard'. An amount of exactly 2000 or NaN is
        'Standard', because the ``>`` comparison is false in both cases.
    """
    return 'High Risk' if row['amount'] > 2000 else 'Standard'

# axis='columns' (same as axis=1) passes each row to classify_risk.
merged_df['risk_class'] = merged_df.apply(classify_risk, axis='columns')

print("--- Risk Classification ---")
print(merged_df.loc[:, ['amount', 'risk_class']].head())

# 2. Time Series (Resample)
# resample() needs a datetime-like index, so convert 'date' to datetime
# first and then promote it to the index of a separate frame.
merged_df['date'] = pd.to_datetime(merged_df['date'])
time_df = merged_df.set_index('date')

# Weekly buckets ('W'); size() counts the rows that fall into each bucket.
weekly_counts = time_df.resample(rule='W').size()
print("\n--- Weekly Transaction Volume ---")
print(weekly_counts)

# 3. Pivot Table
# Mean amount per region (rows) and risk class (columns); margins=True
# appends the 'All' row/column with the overall means.
print("\n--- Pivot Summary (Mean Amount) ---")
pivot = pd.pivot_table(
    merged_df,
    values='amount',
    index='region',
    columns='risk_class',
    aggfunc='mean',
    margins=True,
)
print(pivot)