In [2]:
# Install python-docx, which is missing from the notebook environment.
# `%pip` is an IPython magic, so this line is only valid when run inside a notebook.
%pip install --quiet python-docx

from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
import datetime

# Start an empty document and make 12 pt Times New Roman the 'Normal' style font.
doc = Document()
style = doc.styles['Normal']
body_font = style.font
body_font.name = 'Times New Roman'
body_font.size = Pt(12)
# Also set the w:eastAsia attribute on the style's rFonts element to the same typeface.
style.element.rPr.rFonts.set(qn('w:eastAsia'), 'Times New Roman')

# Title page: centered title, then a centered block with student, course, and date,
# followed by a page break so the report body starts on a new page.
title_heading = doc.add_heading('Milestone One — Auto Insurance Fraud Detection', 0)
title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

# Build the date from `.day` instead of "%d": "%d" zero-pads ("October 05, 2025"),
# and the unpadded "%-d" directive is not portable across platforms.
report_date = datetime.date.today()
p = doc.add_paragraph(
    'Student: Violette Similien Volodkevich\n'
    'Course: DX799 O1 Data Science Capstone (Fall 2025)\n'
    f'Date: {report_date:%B} {report_date.day}, {report_date:%Y}'
)
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.add_page_break()

# Section 1: what the project is trying to solve and which column is the target.
problem_statement = (
    "Auto insurance fraud imposes substantial costs on policyholders and insurers. "
    "This project develops predictive models that flag likely fraudulent claims "
    "using structured claim, policy, and incident data from the carclaims 12.csv dataset. "
    "The target variable is FraudFound (1 = fraud, 0 = non-fraud). "
    "The goal is to create interpretable and scalable models "
    "to reduce fraudulent payouts and improve claim accuracy."
)
doc.add_heading('1. Problem Statement', level=1)
doc.add_paragraph(problem_statement)

# Section 2: dataset contents, imputation, encoding/scaling, and the train-test split.
data_preparation = (
    "The dataset contains both numerical and categorical features "
    "such as claim details, vehicle characteristics, and policy information. "
    "Missing numerical values were imputed with the median, "
    "while missing categorical values were imputed with the mode. "
    "Categorical features were one-hot encoded "
    "and numerical features were standardized for models sensitive to scale. "
    "An 80/20 stratified train-test split was used "
    "to preserve class balance for FraudFound."
)
doc.add_heading('2. Data and Preparation', level=1)
doc.add_paragraph(data_preparation)

# Section 3: one sub-section per course week.
# Each entry is (sub-heading, summary paragraph, placeholder line for plots to paste in later).
doc.add_heading('3. Modeling Summaries (Weeks 1–6)', level=1)
generic_plot_note = "[Insert plots here]"
weeks = [
    (
        "Week 1 — Polynomial & Interaction Terms",
        "Polynomial (degree=2) and interaction features modeled non-linear relationships. "
        "A logistic regression model was trained on these engineered features. "
        "Multicollinearity checked via VIF.",
        "[Insert ROC, PR, Confusion Matrix plots here]",
    ),
    (
        "Week 2 — Regularization (Ridge, Lasso, Elastic Net)",
        "L1, L2, and Elastic Net regularization compared to reduce overfitting. "
        "Hyperparameters tuned via 5-fold CV, best model selected by ROC-AUC.",
        generic_plot_note,
    ),
    (
        "Week 3 — Feature Selection & Dimensionality Reduction",
        "Forward/backward selection identified key predictors. "
        "PCA retained ~90% variance, followed by logistic regression on components.",
        generic_plot_note,
    ),
    (
        "Week 4 — Logistic Regression & Feature Scaling",
        "Baseline classification using scaled numerics and encoded categoricals. "
        "Penalty and regularization tuned for optimal bias-variance tradeoff.",
        generic_plot_note,
    ),
    (
        "Week 5 — Support Vector Machines (Kernels & Regularization)",
        "Linear and RBF kernels compared. "
        "Regularization (C) tuned by grid search; RBF slightly outperformed linear.",
        generic_plot_note,
    ),
    (
        "Week 6 — Decision Trees & Random Forests",
        "Decision Trees produced interpretable rules; "
        "Random Forests reduced overfitting through ensembling. "
        "RF achieved the best overall AUC.",
        "[Insert ROC, PR, Confusion Matrix, Feature Importance plots here]",
    ),
]
for week_entry in weeks:
    heading_text, summary_text, plot_note = week_entry
    doc.add_heading(heading_text, level=2)
    doc.add_paragraph(summary_text)
    doc.add_paragraph(plot_note)

# Section 4: comparison of the two models chosen for closer study.
deep_dive = (
    "The deep dive focused on Logistic Regression (Week 4) and Random Forest (Week 6). "
    "Logistic Regression offered transparency and interpretability, "
    "while Random Forest captured non-linear interactions. "
    "Both tuned using 5-fold CV. "
    "Random Forest achieved highest ROC-AUC; "
    "Logistic Regression remained an interpretable baseline."
)
doc.add_heading('4. Deep Dive — Logistic Regression and Random Forest', level=1)
doc.add_paragraph(deep_dive)

# Section 5: how overfitting was kept in check and how parameters were tuned.
overfitting_notes = (
    "Overfitting controlled via cross-validation, regularization, and tree pruning. "
    "Parameters (C, α, depth) tuned using grid search. "
    "Final models validated on hold-out test set for generalization."
)
doc.add_heading('5. Overfitting and Hyperparameter Tuning', level=1)
doc.add_paragraph(overfitting_notes)

# Section 6: metrics table with a header row plus one row per week (1-6).
# Metric cells hold the "[ ]" placeholder so the numbers can be filled in afterwards.
doc.add_heading('6. Evaluation Metrics', level=1)
table = doc.add_table(rows=1, cols=5)
# With no explicit style the table gets the template's default table style, which
# draws no cell borders in python-docx's default template; 'Table Grid' adds them.
table.style = 'Table Grid'

hdr = table.rows[0].cells
column_titles = ("Model", "Accuracy", "F1 Score", "ROC-AUC", "PR-AUC")
for header_cell, column_title in zip(hdr, column_titles):
    header_cell.text = column_title

for wk in range(1, 7):
    row = table.add_row().cells
    row[0].text = f"Week {wk}"
    for metric_cell in row[1:]:
        metric_cell.text = "[ ]"

# Section 7: what exploratory analysis confirmed and what it surfaced unexpectedly.
eda_findings = (
    "EDA revealed strong correlations between incident severity, policy details, and fraud likelihood. "
    "Expected: higher fraud risk in single-vehicle accidents without police reports. "
    "Unexpected: some demographic groups showed elevated false positives."
)
doc.add_heading('7. Expected vs. Unexpected Findings & Role of EDA', level=1)
doc.add_paragraph(eda_findings)

# Section 8: headline result and planned follow-up work.
conclusion = (
    "Random Forest delivered the best discrimination power, "
    "while Logistic Regression offered interpretability. "
    "Next steps include addressing class imbalance, applying SHAP for explainability, "
    "and integrating vehicle image data for hybrid detection."
)
doc.add_heading('8. Conclusion and Next Steps', level=1)
doc.add_paragraph(conclusion)

# References: each citation becomes its own bulleted paragraph ('List Bullet' style).
doc.add_heading('References', level=1)
refs = [
    (
        "Aqqad, A. (2023). Insurance_claims. Mendeley Data. "
        "https://data.mendeley.com/datasets/992mh7dk9y/2"
    ),
    (
        "Kapoor, K. (2023). Vehicle Insurance Fraud Detection. Kaggle. "
        "https://www.kaggle.com/datasets/khusheekapoor/vehicle-insurance-fraud-detection"
    ),
    (
        "Humans In The Loop. (2023). Car Parts and Car Damages. Kaggle. "
        "https://www.kaggle.com/datasets/humansintheloop/car-parts-and-car-damages"
    ),
    (
        "FBI. (2010). Insurance Fraud. FBI. "
        "https://www.fbi.gov/stats-services/publications/insurance-fraud"
    ),
    (
        "Dell, E. (2024). How virtual inspections can benefit insurance customers. Digital Insurance. "
        "https://www.dig-in.com/news/how-virtual-inspections-can-benefit-insurance-customers"
    ),
]
for citation in refs:
    doc.add_paragraph(citation, style='List Bullet')

# Appendix: starts on its own page and holds a reminder of which saved plots to paste in.
doc.add_page_break()
doc.add_heading('Appendix — Plots and Visuals', level=1)
appendix_note = (
    "Insert saved plots from each week's notebook "
    "(ROC, PR, Confusion Matrix, Feature Importance)."
)
doc.add_paragraph(appendix_note)

# Write the report to disk (path is relative to the current working directory)
# and print the file name as confirmation.
output_file = "Volodkevich_Violette_MilestoneOne_Final.docx"
doc.save(output_file)
print("Saved: " + output_file)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Saved: Volodkevich_Violette_MilestoneOne_Final.docx
