
# Hands-On Lab: Claims Analytics & Fraud Detection

**Objectives**
- Explore, clean, and visualize claims data.
- Detect anomalies (potential fraud) using an unsupervised model.
- Summarize injury text fields.
- (Optional) Load and compare with the Kaggle *Easy Peasy* dataset.


In [None]:
#@title Install and import libraries
#!pip -q install pandas scikit-learn matplotlib nltk kaggle --upgrade

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import nltk
import os
# sent_tokenize needs the Punkt sentence models. Older NLTK releases read the
# pickled 'punkt' package, while recent releases (3.9+) look for 'punkt_tab'
# instead, so fetch both to keep the NLP section working on either version.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
print("Setup complete.")


## 1) Load the Sample Dataset


In [None]:
# Locate the sample CSV by trying a list of candidate paths in order and
# failing with an explicit message if none of them exists.
csv_filename = "synthetic_sample_claims.csv"

# 1) Local dataset path (relative to the notebook's working directory).
local_dataset_dir = os.path.join(os.getcwd(), "datasets")
local_csv_path = os.path.join(local_dataset_dir, csv_filename)

# 2) Fallback: absolute path to the repo dataset folder (if running from a
#    subdir). NOTE: this path is specific to the original author's machine;
#    on any other machine it will simply not exist and is skipped.
repo_root = "/Users/aronkondoro/Library/Mobile Documents/com~apple~CloudDocs/Projects/WCF"
fallback_csv_path = os.path.join(repo_root, "dataset", csv_filename)

candidate_csv_paths = [
    local_csv_path,
    # The repo folder above is spelled "dataset" (singular), so also accept
    # that spelling relative to the working directory.
    os.path.join(os.getcwd(), "dataset", csv_filename),
    fallback_csv_path,
]

# Pick the first candidate that is an actual file; raise a clear error up
# front instead of letting pandas fail on a path that was never checked.
csv_path = next((path for path in candidate_csv_paths if os.path.isfile(path)), None)
if csv_path is None:
    raise FileNotFoundError(
        f"{csv_filename} not found. Looked in:\n  " + "\n  ".join(candidate_csv_paths)
    )
print(f"Loading CSV from: {csv_path}")

# Parse the filing date as a datetime column so it can be used for time-based analysis.
df = pd.read_csv(csv_path, parse_dates=["Date_Filed"])
df.head()


## 2) Quick Exploration


In [None]:
# Structural overview of the frame (printed directly by pandas).
df.info()

# Summary statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
display(summary_stats)

In [None]:
# Relative frequency of each Suspected_Fraud value, shown as a one-column table.
fraud_share = df['Suspected_Fraud'].value_counts(normalize=True)
fraud_share.to_frame(name='share')


## 3) Visualize Claims


In [None]:
# Scatter plot of claim amount against processing time, one point per claim.
claim_amounts = df['Claim_Amount_TZS']
processing_days = df['Processing_Time_Days']

fig, ax = plt.subplots()
ax.scatter(claim_amounts, processing_days)
ax.set_xlabel("Claim Amount (TZS)")
ax.set_ylabel("Processing Time (days)")
ax.set_title("Claims: Amount vs Processing Time")
plt.show()

In [None]:
# Mean claim amount per region, ordered from lowest to highest.
mean_amount_by_region = df.groupby('Region')['Claim_Amount_TZS'].mean()
mean_amount_by_region = mean_amount_by_region.sort_values()

# Draw the bars on a fresh figure.
plt.figure()
mean_amount_by_region.plot.bar()
plt.title("Average Claim Amount by Region")
plt.ylabel("TZS")
plt.tight_layout()
plt.show()


## 4) Anomaly Detection (Unsupervised)
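We'll use **IsolationForest** on numerical features to flag potentially unusual claims. The model is configured with `contamination=0.05`, so roughly 5% of claims are flagged by construction; treat the flags as candidates for review, not as confirmed fraud.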


In [None]:
# Numeric columns the detector is fitted on.
feature_columns = ['Claim_Amount_TZS', 'Processing_Time_Days', 'Age']
features = df[feature_columns].copy()

# Fixed random_state keeps the flagged set reproducible between runs.
model = IsolationForest(random_state=42, contamination=0.05)
anomaly_labels = model.fit_predict(features)
df['anomaly'] = anomaly_labels

# Rows labelled -1 are the ones the model considers unusual.
outliers = df.loc[df['anomaly'].eq(-1)]
print(f"Flagged {len(outliers)} / {len(df)} claims as unusual (~{len(outliers)/len(df):.1%}).")

# Preview the first flagged claims alongside the existing fraud label.
outlier_preview_columns = [
    'Claim_ID',
    'Claim_Amount_TZS',
    'Processing_Time_Days',
    'Sector',
    'Channel',
    'Suspected_Fraud',
]
outliers[outlier_preview_columns].head(10)

In [None]:
# Same amount-vs-time scatter as before, coloured by whether the claim was
# flagged as an anomaly (1 = flagged, 0 = not flagged).
outlier_flag = df['anomaly'].eq(-1).astype(int)

fig, ax = plt.subplots()
ax.scatter(df['Claim_Amount_TZS'], df['Processing_Time_Days'], c=outlier_flag)
ax.set_xlabel("Claim Amount (TZS)")
ax.set_ylabel("Processing Time (days)")
ax.set_title("Outliers Highlighted")
plt.show()


## 5) Quick Text Summaries (NLP)


In [None]:
from nltk.tokenize import sent_tokenize

def summarize_text(text):
    """Return the first sentence of ``text`` as a one-line summary.

    Args:
        text: The free-text description to summarize. Anything that is not a
            non-empty string (``None``, pandas ``NaN``, numbers, ``""``) is
            treated as "no text".

    Returns:
        The first sentence found by ``sent_tokenize``, or ``""`` when there is
        no text or no sentence could be extracted.
    """
    # Missing CSV cells reach this function as float NaN, which is truthy, so
    # a plain `text or ""` would hand NaN to the tokenizer and fail there.
    # Guard on the type instead of on truthiness alone.
    if not isinstance(text, str) or not text:
        return ""
    sents = sent_tokenize(text)
    return sents[0] if sents else ""

# Derive a summary for every claim from its injury description.
injury_descriptions = df['Injury_Description']
df['Summary'] = injury_descriptions.apply(summarize_text)

# Show the original text next to its summary for the first few claims.
summary_preview_columns = ['Claim_ID', 'Injury_Description', 'Summary']
df[summary_preview_columns].head(5)