# Lösung Lab 06: Pandas & Performance

### Setup: Daten generieren
Wir erstellen einen größeren Datensatz (100.000 Zeilen), damit der Performance-Vergleich spürbar ist.

In [None]:
import pandas as pd
import numpy as np

# Synthetic transaction data: 100k rows, so that the loop-vs-vectorized
# timing comparison later in the notebook shows a clearly visible difference.
num_rows = 100_000

# Fixed seed: without it every "Restart & Run All" writes a different CSV,
# so the counts and averages printed further down change from run to run.
SEED = 42
rng = np.random.default_rng(SEED)

data = {
    "id": range(1, num_rows + 1),
    # Uniformly distributed amounts, rounded to two decimals
    "amount": rng.uniform(10.0, 5000.0, num_rows).round(2),
    "currency": rng.choice(["EUR", "USD", "GBP"], num_rows),
    "type": rng.choice(["deposit", "withdrawal", "payment"], num_rows),
}

df_setup = pd.DataFrame(data)
# index=False keeps the DataFrame's row index out of the CSV file
df_setup.to_csv("big_transactions.csv", index=False)
print(f"Created 'big_transactions.csv' with {num_rows} rows.")

### Basisaufgabe

In [None]:
import pandas as pd

# 1. Load & inspect
# Read the transactions CSV from the working directory.
df = pd.read_csv("big_transactions.csv")

# df.info() prints its own report, so it is not wrapped in print().
print("--- Info ---")
df.info()

# Header line and the first rows as plain text, emitted by a single print call.
print("\n--- Head ---", df.head(), sep="\n")

In [None]:
# 2. Selection (boolean indexing)
# Rows whose amount is strictly greater than 2000.
is_high_value = df["amount"].gt(2000.00)
high_value_tx = df.loc[is_high_value]

# Rows that are in USD and, at the same time, of type "deposit".
is_usd = df["currency"].eq("USD")
is_deposit = df["type"].eq("deposit")
usd_deposits = df.loc[is_usd & is_deposit]

# 3. Reporting
print(f"\nHigh Value Transactions found: {len(high_value_tx)}")
print(f"USD Deposits found: {len(usd_deposits)}")

# Mean amount over the high-value subset only
mean_val = high_value_tx["amount"].mean()
print(f"Average High Value Amount: {mean_val:.2f}")

### Bonus-Herausforderung

In [None]:
import time

# Fee rate applied to every amount; shared by both approaches so that they
# are guaranteed to compute the same thing.
FEE_RATE = 0.02

# 2. The "naive" approach (row-by-row loop)
print("\nStarting Loop Calculation...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can be too coarse to
# resolve the vectorized step at all.
start_loop = time.perf_counter()

fees_list = []
for index, row in df.iterrows():
    # Access to one row at a time (slow)
    fee = row['amount'] * FEE_RATE
    fees_list.append(fee)

end_loop = time.perf_counter()
loop_duration = end_loop - start_loop
print(f"Loop Duration: {loop_duration:.4f} seconds")

# 3. The "pandas" approach (vectorization)
print("Starting Vectorized Calculation...")
start_vec = time.perf_counter()

# Computation on the whole column at once (fast)
df['fee'] = df['amount'] * FEE_RATE

end_vec = time.perf_counter()
vec_duration = end_vec - start_vec
print(f"Vector Duration: {vec_duration:.4f} seconds")

# 4. Comparison
# Guard the division: if the clock could not resolve the vectorized step,
# vec_duration is 0.0 and the ratio would raise ZeroDivisionError.
if vec_duration > 0:
    factor = loop_duration / vec_duration
    print(f"\nVectorization was {factor:.1f}x faster!")
else:
    factor = float("inf")
    print("\nVectorized run was too fast to measure; speed-up factor is undefined.")