# Lösung Lab 07: Datenaufbereitung & Aggregation

### Basis Aufgabe

In [23]:
import pandas as pd
# 1. Load & Clean
df = pd.read_csv("transactions_dirty.csv")

print(f"Original Length: {len(df)}")
print("Missing Values:\n", df.isna().sum())

# Build the cleaned frame in one chain instead of assigning columns on a frame
# derived from `df`: column-by-column assignment on such a derived frame can
# raise SettingWithCopyWarning, and the chain leaves `df` untouched so the
# cell gives the same result every time it is re-run.
df_clean = (
    df
    # Strategy 1: Remove rows where account_id is missing
    .dropna(subset=['account_id'])
    # Strategy 2: Impute missing amounts with 0.0
    .fillna({'amount': 0.0})
    # Fix Data Types (ID should be int, not float due to previous NaN)
    .astype({'account_id': int})
)

print(f"Cleaned Length: {len(df_clean)}")

Original Length: 20
Missing Values:
 date          0
account_id    0
amount        2
type          0
dtype: int64
Cleaned Length: 20


In [24]:
# 2. Merge (Left Join)
customers = pd.read_csv("customers.csv")

# Left join keyed on account_id: every transaction row is kept, and accounts
# with no match in `customers` end up with missing customer columns.
merged_df = df_clean.merge(customers, how='left', on='account_id')

merged_df.head()

Unnamed: 0,date,account_id,amount,type,name,region
0,2024-01-01,101,0.0,withdrawal,Alice Corp,EU
1,2024-01-02,102,377.0,deposit,Bob Ltd,US
2,2024-01-03,101,2176.0,deposit,Alice Corp,EU
3,2024-01-04,999,15740.0,withdrawal,,
4,2024-01-05,101,203.0,deposit,Alice Corp,EU


In [25]:
# 3. Aggregation
print("\n--- Regional Stats ---")
# Total and number of transactions per region. Rows whose region is missing
# (accounts without a customer match) are not part of any group.
amount_by_region = merged_df.groupby('region')['amount']
regional_stats = amount_by_region.agg(['sum', 'count'])
print(regional_stats)


--- Regional Stats ---
           sum  count
region               
EU      7216.0     11
US      2910.0      5


### Bonus Herausforderung

In [26]:
# 1. Custom Logic (Apply)
def classify_risk(row, threshold=2000):
    """Label a transaction by the size of its amount.

    Args:
        row: Record supporting ``row['amount']`` lookup, e.g. a DataFrame row
            handed over by ``DataFrame.apply(..., axis=1)``.
        threshold: Amounts strictly greater than this value are flagged.
            Defaults to 2000.

    Returns:
        'High Risk' if ``row['amount'] > threshold``, otherwise 'Standard'
        (an amount equal to the threshold is still 'Standard').
    """
    return 'High Risk' if row['amount'] > threshold else 'Standard'

# Evaluate classify_risk once per row (axis=1) and store the labels as a column
risk_labels = merged_df.apply(classify_risk, axis=1)
merged_df['risk_class'] = risk_labels

print("--- Risk Classification ---")
preview_columns = ['amount', 'risk_class']
print(merged_df[preview_columns].head())

--- Risk Classification ---
    amount risk_class
0      0.0   Standard
1    377.0   Standard
2   2176.0  High Risk
3  15740.0  High Risk
4    203.0   Standard


In [27]:
# 2. Time Series (Resample)
# Parse the date column to datetimes, then use it as the index for resampling
parsed_dates = pd.to_datetime(merged_df['date'])
merged_df['date'] = parsed_dates
time_df = merged_df.set_index('date')

# Number of transactions per weekly bucket
weekly_resampler = time_df.resample('W')
weekly_counts = weekly_resampler.size()
print("\n--- Weekly Transaction Volume ---")
print(weekly_counts)


--- Weekly Transaction Volume ---
date
2024-01-07    7
2024-01-14    7
2024-01-21    6
Freq: W-SUN, dtype: int64


In [28]:
# 3. Pivot Table
print("\n--- Pivot Summary (Mean Amount) ---")
# Mean amount with regions as rows and risk classes as columns;
# margins=True appends the 'All' row and column.
pivot = pd.pivot_table(
    merged_df,
    index='region',
    columns='risk_class',
    values='amount',
    aggfunc='mean',
    margins=True,
)
print(pivot)


--- Pivot Summary (Mean Amount) ---
risk_class  High Risk  Standard      All
region                                  
EU             2176.0     504.0  656.000
US                NaN     582.0  582.000
All            2176.0     530.0  632.875
