# Task B — Modeling & Task C — Anomalies

This notebook implements the late-delivery model (Task B) and the price anomaly detection (Task C).

In [None]:

import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from pathlib import Path
# Notebook display and plotting defaults.
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_columns', 120)

# Every input CSV is read from this directory (relative to the notebook).
DATA_DIR = Path('../dataset')

# Supplier and product master tables.
sup = pd.read_csv(DATA_DIR / 'suppliers.csv')
prod = pd.read_csv(DATA_DIR / 'products.csv')

# Price lists; both validity-window bounds are parsed as datetimes.
prices = pd.read_csv(
    DATA_DIR / 'price_lists.csv',
    parse_dates=['valid_from', 'valid_to'],
)

# Purchase orders and deliveries, with their date columns parsed.
po = pd.read_csv(
    DATA_DIR / 'purchase_orders.csv',
    parse_dates=['order_date', 'promised_date'],
)
deliv = pd.read_csv(
    DATA_DIR / 'deliveries.csv',
    parse_dates=['actual_delivery_date'],
)

# Left join on order_id: orders without a matching delivery row are kept and
# get missing values in the delivery columns.
po = pd.merge(po, deliv, on='order_id', how='left')

# Quick sanity check of the loaded table sizes.
print(sup.shape, prod.shape, prices.shape, po.shape)


In [None]:

# Modeling population: purchase orders whose `cancelled` flag equals 0.
df = po[po['cancelled'] == 0].copy()

# Binary target as int.
# NOTE(review): rows with a missing late_delivery value are labelled 0 here —
# presumably orders with no delivery record after the left join; confirm that
# treating them as "not late" is intended.
df['late_delivery'] = df['late_delivery'].fillna(0).astype(int)

# Time-based split: orders dated on or before the cutoff are used for
# training, strictly later orders for validation.
cutoff = pd.Timestamp('2025-03-31')
train = df.loc[df['order_date'].le(cutoff)].copy()
valid = df.loc[df['order_date'].gt(cutoff)].copy()

def engineer(d, suppliers=None, products=None):
    """Build the model feature frame for a set of purchase-order rows.

    Parameters
    ----------
    d : DataFrame
        Order rows. The code reads ``order_date`` and ``promised_date``
        (datetime), ``supplier_id``, ``sku``, ``currency``, ``ship_mode``,
        ``incoterm`` and ``payment_terms``.
    suppliers : DataFrame, optional
        Supplier table with ``supplier_id``, ``preferred`` and ``rating``.
        Defaults to the module-level ``sup``.
    products : DataFrame, optional
        Product table with ``sku`` and ``hazard_class``.
        Defaults to the module-level ``prod``.

    Returns
    -------
    DataFrame
        A copy of ``d`` (the input is not modified) with ``promised_lead_days``,
        ``month``, ``preferred``, ``rating``, ``hazard_class``, ``is_hazard``
        and ``is_eur`` added, and ``ship_mode`` / ``incoterm`` /
        ``payment_terms`` replaced by one-hot columns (first level dropped).

    Notes
    -----
    The one-hot columns only cover the levels present in ``d``; frames that
    must share a feature layout should be passed through in a single call.
    """
    suppliers = sup if suppliers is None else suppliers
    products = prod if products is None else products

    out = d.copy()
    out['promised_lead_days'] = (out['promised_date'] - out['order_date']).dt.days
    out['month'] = out['order_date'].dt.month

    # Left joins keep every order row even when the supplier / SKU is unknown.
    out = out.merge(suppliers[['supplier_id', 'preferred', 'rating']], on='supplier_id', how='left')
    out = out.merge(products[['sku', 'hazard_class']], on='sku', how='left')

    # A missing hazard_class (unmatched SKU or blank value) compares unequal
    # to 'none', which would silently flag the row as hazardous. Treat a
    # missing class as 'none' so only explicitly classified products count.
    out['is_hazard'] = out['hazard_class'].fillna('none').ne('none').astype(int)
    out['is_eur'] = out['currency'].eq('EUR').astype(int)

    out = pd.get_dummies(out, columns=['ship_mode', 'incoterm', 'payment_terms'], drop_first=True)
    return out

# Engineer the training and validation rows in ONE pass. pd.get_dummies only
# creates columns for the levels it sees, and drop_first removes whichever
# level sorts first, so encoding each split separately can give X_valid a
# different set of columns (or a different baseline level) than the model was
# fitted on. A marker column is carried through to split the rows again.
features = engineer(pd.concat(
    [train.assign(_is_train=True), valid.assign(_is_train=False)],
    ignore_index=True,
))
is_train_row = features.pop('_is_train').astype(bool)
X_train = features[is_train_row].reset_index(drop=True)
X_valid = features[~is_train_row].reset_index(drop=True)

# Targets are taken before the label column is dropped from the features.
y_train = X_train['late_delivery']; y_valid = X_valid['late_delivery']

# Identifiers, raw dates, free text, already-encoded source columns and the label.
# NOTE(review): any other column that came in through the deliveries merge,
# and supplier_id, stay in the feature matrix — confirm none of them is
# non-numeric or leaks the delivery outcome.
cols_drop = ['order_id','order_date','promised_date','actual_delivery_date','order_notes','sku','currency','hazard_class','late_delivery']
X_train = X_train.drop(columns=cols_drop, errors='ignore')
X_valid = X_valid.drop(columns=cols_drop, errors='ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, f1_score
# Random forest with balanced class weights; the seed is fixed so reruns
# give the same model.
clf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=0,
)
clf.fit(X_train, y_train)

# Second predict_proba column: the score for late_delivery == 1, assuming
# both classes occur in y_train.
p_valid = clf.predict_proba(X_valid)[:, 1]

# Ranking metrics on the validation split.
for label, metric in (('PR-AUC:', average_precision_score), ('ROC-AUC:', roc_auc_score)):
    print(label, metric(y_valid, p_valid))


In [None]:

# Task C: flag unusual prices with a robust (median/MAD) z-score computed
# separately for every (supplier_id, sku) price history.
prices_ = prices.copy()

# Express every price in EUR.
# NOTE(review): all non-EUR rows are divided by one fixed rate of 1.09 —
# presumably a USD-per-EUR rate; confirm no other currencies occur.
prices_['price_eur'] = np.where(prices_['currency']=='EUR', prices_['price_per_uom'], prices_['price_per_uom']/1.09)

result_cols = ['supplier_id', 'sku', 'valid_from', 'price_eur', 'robust_z']
results = []
for (sid, sku), g in prices_.groupby(['supplier_id','sku']):
    g = g.sort_values('valid_from').copy()
    # Scores are computed on log1p(price).
    x = np.log1p(g['price_eur'])
    dev = x - np.median(x)
    mad = np.median(np.abs(dev))
    if mad > 0:
        z = 0.6745 * dev / mad
    else:
        # MAD is 0 when more than half of the prices are identical. Dividing by
        # a tiny epsilon would turn any small deviation into a z-score in the
        # tens of thousands and swamp the ranking, so fall back to the mean
        # absolute deviation (standard modified z-score fallback). If that is
        # 0 too, every price is identical and nothing is anomalous.
        mean_ad = np.mean(np.abs(dev))
        z = dev / (1.253314 * mean_ad) if mean_ad > 0 else dev * 0.0
    g['robust_z'] = z

    # Keep the three most extreme rows of this group.
    top = g.loc[g['robust_z'].abs().sort_values(ascending=False).head(3).index]
    results.extend(
        {'supplier_id': sid, 'sku': sku, **rec}
        for rec in top[['valid_from', 'price_eur', 'robust_z']].to_dict('records')
    )
import pandas as pd
# Explicit columns keep the sort below valid even when no rows were collected.
pd.DataFrame(results, columns=result_cols).sort_values('robust_z', key=lambda s: s.abs(), ascending=False).head(10)
