In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
import torch
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Ensure the models directory exists
os.makedirs("models", exist_ok=True)

# Load the synthetic dataset.
# The location can be overridden through the INSURANCE_DATA_PATH environment
# variable so the script is not tied to a single machine; the original path is
# kept as the default.
data_path = os.environ.get(
    "INSURANCE_DATA_PATH",
    r"C:\Users\Gebruiker\synthetic_insurance_dataset.csv",
)
df = pd.read_csv(data_path)

# Ensure column names are properly formatted
df.columns = df.columns.str.strip()

# Apply encoding only if not already encoded
if "Gender_Male" not in df.columns:  # Avoid duplicate encoding
    df = pd.get_dummies(df, columns=["Gender", "Policy_Type", "Risk_Score"], drop_first=True)

# Dimensionality Reduction
# The identifier and the two target columns are kept out of the PCA input and
# re-attached unchanged afterwards.
passthrough_cols = ["Policy_ID", "Fraudulent_Claim", "Claim_Amount"]
pca_input = df.drop(columns=passthrough_cols)

# PCA follows the directions of largest variance, so the features are
# standardised first; otherwise large-magnitude columns (e.g. Annual_Income)
# dominate the components and the small-scale columns are effectively ignored.
scaler = StandardScaler()
pca = PCA(n_components=10)
X_pca = pca.fit_transform(scaler.fit_transform(pca_input))

# Persist the fitted preprocessing next to the models: the pickled models are
# trained on PC1..PC10 and cannot be applied to raw rows without these.
joblib.dump(scaler, "models/feature_scaler.pkl")
joblib.dump(pca, "models/pca.pkl")

# Reuse df's index so the column-wise concat below aligns row for row.
df_pca = pd.DataFrame(
    X_pca,
    columns=[f"PC{i}" for i in range(1, X_pca.shape[1] + 1)],
    index=df.index,
)
df = pd.concat([df_pca, df[passthrough_cols]], axis=1)

# Features and target for fraud detection
# Everything except the label and the identifier is used as a feature.
X_fraud = df.drop(columns=["Fraudulent_Claim", "Policy_ID"])
y_fraud = df["Fraudulent_Claim"]

# Split data for fraud detection (80% train / 20% test, fixed seed)
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

# Train Fraud Detection Model (Random Forest)
fraud_model = RandomForestClassifier(random_state=42)
fraud_model.fit(X_train_fraud, y_train_fraud)

# Save Fraud Detection Model
joblib.dump(fraud_model, "models/fraud_detection_model.pkl")

# Evaluate Fraud Model
y_pred_fraud = fraud_model.predict(X_test_fraud)
# ROC AUC measures how well the model *ranks* positives above negatives, so it
# must be computed from continuous scores. Passing hard 0/1 predictions
# collapses the curve to a single operating point and misreports the metric.
# Column 1 of predict_proba is the second entry of fraud_model.classes_, i.e.
# the positive class for 0/1 labels.
y_score_fraud = fraud_model.predict_proba(X_test_fraud)[:, 1]
st.write("Fraud Detection Model Metrics:")
st.write(f"Accuracy: {accuracy_score(y_test_fraud, y_pred_fraud):.2f}")
st.write(f"Precision: {precision_score(y_test_fraud, y_pred_fraud):.2f}")
st.write(f"Recall: {recall_score(y_test_fraud, y_pred_fraud):.2f}")
st.write(f"AUC-ROC: {roc_auc_score(y_test_fraud, y_score_fraud):.2f}")

# Risk assessment: regress Claim_Amount on every other column except the identifier.
# NOTE(review): Fraudulent_Claim is not dropped here, so the fraud label is one
# of the regressor's inputs - confirm that this is intended.
y_risk = df["Claim_Amount"]
X_risk = df.drop(columns=["Claim_Amount", "Policy_ID"])

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
(
    X_train_risk,
    X_test_risk,
    y_train_risk,
    y_test_risk,
) = train_test_split(X_risk, y_risk, test_size=0.2, random_state=42)

# Fit the XGBoost regressor; fit() returns the estimator itself
risk_model = xgb.XGBRegressor(random_state=42).fit(X_train_risk, y_train_risk)

# Persist the trained regressor
joblib.dump(risk_model, "models/risk_assessment_model.pkl")

# Score the held-out rows and report the mean absolute error
y_pred_risk = risk_model.predict(X_test_risk)
risk_mae = mean_absolute_error(y_test_risk, y_pred_risk)
st.write("Risk Assessment Model Metrics:")
st.write(f"Mean Absolute Error (MAE): {risk_mae:.2f}")

# NLP Models
# One transformers pipeline per task, built in this order. No model name is
# passed, so each task uses whatever checkpoint the library selects by default.
summarizer, sentiment_analyzer, translator = (
    pipeline(task_name)
    for task_name in ("summarization", "sentiment-analysis", "translation_en_to_fr")
)

def summarize_text(text):
    """Summarise ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarise.

    Returns:
        The ``summary_text`` string of the first result returned by the
        pipeline (generated deterministically, between 10 and 50 tokens).
    """
    # The text comes from a free-form input box; truncation=True makes the
    # tokenizer clip over-long input to the model's limit instead of the
    # pipeline raising on it.
    results = summarizer(
        text,
        max_length=50,
        min_length=10,
        do_sample=False,
        truncation=True,
    )
    return results[0]['summary_text']

def analyze_sentiment(text):
    """Classify the sentiment of ``text`` with the module-level ``sentiment_analyzer``.

    Args:
        text: The text to classify.

    Returns:
        The first result entry produced by the pipeline for ``text``.
    """
    # truncation=True clips input that exceeds the model's maximum sequence
    # length; without it, over-long free-form text makes the pipeline raise.
    results = sentiment_analyzer(text, truncation=True)
    return results[0]

def translate_text(text):
    """Translate ``text`` with the module-level ``translator`` pipeline.

    Args:
        text: The text to translate.

    Returns:
        The ``translation_text`` string of the first result.
    """
    first_result = translator(text)[0]
    return first_result['translation_text']

# Streamlit App
st.title("AI-Powered Insurance System")

# Sidebar for navigation
st.sidebar.title("Navigation")
options = ["Fraud Detection", "Risk Assessment", "NLP Analysis", "Visualizations"]
choice = st.sidebar.selectbox("Choose a module", options)


def _policy_features(policy_id, feature_columns):
    """Return the rows of ``df`` for ``policy_id`` laid out like a training matrix.

    Args:
        policy_id: Value to match against the ``Policy_ID`` column.
        feature_columns: Column labels, in order, that the model was fitted on.

    Returns:
        A DataFrame holding exactly ``feature_columns`` in that order. Columns
        that do not exist in ``df`` are created and filled with 0. The frame
        has no rows when the policy id is unknown.
    """
    matches = df[df["Policy_ID"] == policy_id]
    return matches.reindex(columns=feature_columns, fill_value=0)


def _show_figure(fig):
    """Render ``fig`` in the app and then close it.

    The script body runs again on every interaction; closing the figure stops
    open matplotlib figures from piling up across those runs.
    """
    st.pyplot(fig)
    plt.close(fig)


if choice == "Fraud Detection":
    st.header("Fraud Detection")
    # Strip stray whitespace so a pasted " POLICY_1 " still matches.
    policy_id = st.text_input("Enter Policy ID").strip()

    if policy_id:
        policy_data = _policy_features(policy_id, X_train_fraud.columns)

        if not policy_data.empty:
            prediction = fraud_model.predict(policy_data)
            st.write(f"Fraud Prediction: {'Fraudulent' if prediction[0] == 1 else 'Genuine'}")
        else:
            st.write("Policy ID not found.")

elif choice == "Risk Assessment":
    st.header("Risk Assessment")
    # Strip stray whitespace so a pasted " POLICY_1 " still matches.
    policy_id = st.text_input("Enter Policy ID").strip()

    if policy_id:
        policy_data = _policy_features(policy_id, X_train_risk.columns)

        if not policy_data.empty:
            prediction = risk_model.predict(policy_data)
            st.write(f"Predicted Claim Amount: ${prediction[0]:.2f}")
        else:
            st.write("Policy ID not found.")

elif choice == "Visualizations":
    st.header("Visualizations")
    if st.checkbox("Show Fraud Distribution"):
        fig, ax = plt.subplots()
        sns.countplot(x="Fraudulent_Claim", data=df, ax=ax)
        _show_figure(fig)
    if st.checkbox("Show Claim Amount Distribution"):
        fig, ax = plt.subplots()
        sns.histplot(df["Claim_Amount"], kde=True, ax=ax)
        _show_figure(fig)
    if st.checkbox("Show PCA Visualization"):
        fig, ax = plt.subplots()
        # Draw on this figure's own axes and label them directly rather than
        # going through pyplot's implicit "current axes" state.
        sns.scatterplot(x=df_pca["PC1"], y=df_pca["PC2"], hue=df["Fraudulent_Claim"], ax=ax)
        ax.set_xlabel("Principal Component 1")
        ax.set_ylabel("Principal Component 2")
        _show_figure(fig)


elif choice == "NLP Analysis":
    st.header("NLP Analysis")
    text_input = st.text_area("Enter Text")

    if text_input:
        st.subheader("Summarization")
        st.write(summarize_text(text_input))

        st.subheader("Sentiment Analysis")
        st.write(analyze_sentiment(text_input))

        st.subheader("Translation (English to French)")
        st.write(translate_text(text_input))

        

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'

In [5]:
# Re-read the raw CSV from disk and show it as the cell output.
# Note: this rebinds `df`, replacing whatever frame the name held before
# (cell In [1] rebinds it to the PCA-transformed frame).
data_path = r"C:\Users\Gebruiker\synthetic_insurance_dataset.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
df

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Policy_ID,Customer_Age,Gender,Policy_Type,Annual_Income,Claim_Amount,Risk_Score,Fraudulent_Claim
0,-0.224515,-0.648598,2.805591,0.569525,2.676771,0.685312,0.306675,1.147291,-2.145905,2.477879,POLICY_0,56,Other,Health,148614,49280,High,1
1,1.038548,-1.316803,2.303387,1.233593,4.972675,1.086905,-3.548788,0.342810,-0.469005,2.787109,POLICY_1,69,Other,Property,55431,14146,High,1
2,1.148988,1.794382,2.835693,-1.114858,-1.247120,-1.899268,-0.381470,0.837209,0.163223,-0.997959,POLICY_2,46,Other,Auto,86492,14654,Low,1
3,-1.177318,2.420294,-0.363514,-1.086646,-2.824750,-2.722123,-0.225266,0.515079,-3.317272,-2.801517,POLICY_3,32,Male,Auto,126722,36820,High,1
4,1.346717,0.089373,2.056613,-0.428365,0.285387,1.140716,0.721945,1.110152,2.426608,0.880496,POLICY_4,60,Other,Life,77684,1726,Medium,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-2.520259,0.866810,-1.258367,-0.051690,-3.178799,-2.773020,1.006616,-1.380548,-1.018613,-2.322042,POLICY_995,60,Male,Property,85398,5173,Low,1
996,0.200395,3.244450,-3.193443,0.222183,-5.098802,-1.758987,1.097709,1.312274,1.198744,-4.314869,POLICY_996,64,Male,Health,129291,44833,Low,1
997,-2.912178,0.281646,-1.480678,0.950130,-0.419412,-2.504918,-1.824500,0.122025,-4.838295,-1.104499,POLICY_997,62,Male,Life,33321,27400,Low,1
998,0.276948,1.802273,-0.541781,-2.593740,-4.243454,-0.708498,-1.371959,4.125930,-0.308262,-2.752850,POLICY_998,35,Female,Health,111348,5769,High,1
