In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Load and preprocess data
def load_preprocess_data(filepath):
    """Read the dataset CSV and return scaled train/test splits.

    Five numeric columns are used as model inputs and 'premium_amount' is the
    regression target. The data is split 80/20 with a fixed random_state of
    42, and a StandardScaler is fitted on the training inputs only before
    being applied to both splits.

    NOTE(review): the generator cell in this notebook also writes
    'vehicle_type', 'insurance_type' and 'risk_area' columns that enter its
    premium formula; they are not part of the feature list here - confirm
    that this omission is intended.

    Args:
        filepath: Path (or anything pandas.read_csv accepts) of the CSV file.

    Returns:
        Tuple (X_train_scaled, X_test_scaled, y_train, y_test, scaler): the
        two scaled input arrays, the two unscaled target Series, and the
        fitted scaler.
    """
    feature_columns = [
        'customer_age',
        'vehicle_age',
        'annual_mileage',
        'no_claim_bonus',
        'claim_history',
    ]

    frame = pd.read_csv(filepath)
    inputs, target = frame[feature_columns], frame['premium_amount']

    # Hold out 20% of the rows for evaluation.
    train_inputs, test_inputs, train_target, test_target = train_test_split(
        inputs, target, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training rows only.
    scaler = StandardScaler().fit(train_inputs)
    train_scaled = scaler.transform(train_inputs)
    test_scaled = scaler.transform(test_inputs)

    return train_scaled, test_scaled, train_target, test_target, scaler

# Random Forest Model
def random_forest_model(X_train, y_train):
    """Fit a random-forest regressor on the given training data.

    The forest is built with 100 trees and a fixed random_state of 42.

    Args:
        X_train: Training inputs handed to RandomForestRegressor.fit.
        y_train: Training targets handed to RandomForestRegressor.fit.

    Returns:
        The fitted RandomForestRegressor.
    """
    forest_settings = {'n_estimators': 100, 'random_state': 42}
    # fit() hands back the estimator itself, so it can be returned directly.
    return RandomForestRegressor(**forest_settings).fit(X_train, y_train)

# Fully Connected Neural Network
def create_fnn_model(input_shape):
    """Build and compile a small fully connected regression network.

    Layer stack: Input -> Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu)
    -> Dropout(0.2) -> Dense(1). The model is compiled with the Adam
    optimizer (learning rate 0.001) and mean-squared-error loss.

    Args:
        input_shape: Number of input features; it is wrapped into a
            one-element shape tuple for the input layer.

    Returns:
        The compiled Sequential model (not yet trained).
    """
    model = Sequential([
        # Declare the input with an explicit Input layer instead of passing
        # `input_shape=` to the first Dense layer; recent Keras releases emit
        # a UserWarning for the latter when used inside a Sequential model.
        Input(shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1)  # single linear output for the regression target
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mean_squared_error'
    )
    return model

# Evaluate Model
def evaluate_model(y_test, y_pred):
    """Print MSE, MAE and R² for a set of predictions.

    All three metrics are computed before anything is printed.

    Args:
        y_test: Reference target values (passed first to each metric).
        y_pred: Predicted values (passed second to each metric).

    Returns:
        None; the report is written to standard output.
    """
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    report_lines = (
        "Model Performance:",
        f"Mean Squared Error: {mse}",
        f"Mean Absolute Error: {mae}",
        f"R² Score: {r2}",
    )
    for line in report_lines:
        print(line)

# Main Execution
def premium_prediction(filepath):
    """Train and evaluate both models on the dataset stored at `filepath`.

    Steps, in order: load and scale the data, fit the random forest and
    print its test metrics, then build and train the neural network
    (50 epochs, batch size 32, 20% of the training rows held out for
    validation, silent training) and print its test metrics.

    Args:
        filepath: CSV path forwarded to load_preprocess_data.

    Returns:
        None; results are printed.
    """
    # The fitted scaler is returned by the loader but not needed here.
    train_X, test_X, train_y, test_y, _scaler = load_preprocess_data(filepath)

    # --- Random forest ---
    forest = random_forest_model(train_X, train_y)
    forest_preds = forest.predict(test_X)
    print("\nRandom Forest Results:")
    evaluate_model(test_y, forest_preds)

    # --- Fully connected network ---
    feature_count = train_X.shape[1]
    network = create_fnn_model(feature_count)
    fit_options = dict(
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
    )
    network.fit(train_X, train_y, **fit_options)

    # The prediction array is flattened to one dimension before scoring.
    network_preds = network.predict(test_X).flatten()
    print("\nFNN Results:")
    evaluate_model(test_y, network_preds)

# Run the full train-and-evaluate pipeline on the dataset CSV.
# NOTE(review): a file with this exact name is written by the data-generation
# cell that appears later in this notebook (its `df.to_csv(...)` call), so on
# a fresh checkout that cell has to be executed before this one.
premium_prediction('motor_insurance_dataset_756373.csv')


Random Forest Results:
Model Performance:
Mean Squared Error: 10357.321290345759
Mean Absolute Error: 77.06009098920971
R² Score: 0.662797347127162


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [1]:
import pandas as pd
import numpy as np
import random

def generate_motor_insurance_data(num_records=756373, seed=42):
    """Generate a synthetic motor-insurance dataset.

    Every attribute is drawn independently from NumPy's global random
    generator (categorical values are sampled uniformly). The premium is then
    derived deterministically from the attributes as

        300 * age_factor * vehicle_factor * mileage_factor
            * (1 - no_claim_bonus) * claim_factor * risk_factor
            * insurance_factor

    and rounded to two decimals.

    The premium is computed column-wise instead of with a row-wise
    ``DataFrame.apply``; the arithmetic, its order and the rounding function
    are unchanged, so the values are the same, but generation is much faster
    and ``num_records=0`` now yields an empty frame instead of failing.

    Args:
        num_records: Number of rows to generate.
        seed: Value passed to ``np.random.seed``. Note that this reseeds the
            process-wide NumPy generator as a side effect.

    Returns:
        pandas.DataFrame with columns customer_id, customer_age,
        vehicle_type, vehicle_age, annual_mileage, insurance_type,
        no_claim_bonus, claim_history, risk_area, premium_amount and
        postal_code.
    """
    np.random.seed(seed)

    # Vehicle types (sampled uniformly; no explicit probabilities are given)
    vehicle_types = ['Sedan', 'SUV', 'Hatchback', 'Luxury Car', 'Station Wagon',
                     'Commercial Van', 'Electric Car', 'Motorcycle']

    # Insurance types
    insurance_types = ['Comprehensive', 'Third-Party']

    # Risk areas
    risk_areas = ['Urban', 'Rural']

    # Generate data. The order of the random draws determines the output for
    # a given seed, so it must not be rearranged.
    data = {
        'customer_id': range(1, num_records + 1),
        'customer_age': np.random.randint(22, 65, num_records),
        'vehicle_type': np.random.choice(vehicle_types, num_records),
        'vehicle_age': np.random.randint(1, 7, num_records),
        'annual_mileage': np.random.randint(5000, 25000, num_records),
        'insurance_type': np.random.choice(insurance_types, num_records),
        'no_claim_bonus': np.round(np.random.uniform(0, 0.5, num_records), 2),
        'claim_history': np.random.randint(0, 4, num_records),
        'risk_area': np.random.choice(risk_areas, num_records)
    }
    df = pd.DataFrame(data)

    base_premium = 300  # Base premium

    # Multiplier per vehicle type; unknown types fall back to 1.0
    vehicle_factors = {
        'Sedan': 1.0,
        'SUV': 1.2,
        'Luxury Car': 1.5,
        'Hatchback': 0.9,
        'Station Wagon': 1.1,
        'Commercial Van': 1.3,
        'Electric Car': 1.2,
        'Motorcycle': 0.8
    }

    # Age factor: +1% per year above 35, -1% per year below
    age_factor = 1 + (df['customer_age'].to_numpy() - 35) * 0.01

    # Vehicle type factor
    vehicle_factor = np.array(
        [vehicle_factors.get(kind, 1.0) for kind in df['vehicle_type']],
        dtype=float
    )

    # Mileage factor, centred on 15000
    mileage_factor = 1 + (df['annual_mileage'].to_numpy() - 15000) * 0.00005

    # No claim bonus reduction
    no_claim_reduction = 1 - df['no_claim_bonus'].to_numpy()

    # Claim history impact: +10% per recorded claim
    claim_factor = 1 + (df['claim_history'].to_numpy() * 0.1)

    # Risk area factor
    risk_factor = np.where((df['risk_area'] == 'Urban').to_numpy(), 1.1, 1.0)

    # Insurance type factor
    insurance_factor = np.where(
        (df['insurance_type'] == 'Comprehensive').to_numpy(), 1.3, 1.0
    )

    # Same left-to-right multiplication order as the scalar formula, so the
    # float64 results are identical to a per-row computation.
    premium = base_premium * age_factor * vehicle_factor * mileage_factor * \
              no_claim_reduction * claim_factor * risk_factor * insurance_factor

    # Round with Python's round() on plain floats: numpy's rounding can
    # differ in the last cent for values that sit next to a .005 boundary.
    df['premium_amount'] = np.array(
        [round(value, 2) for value in premium.tolist()], dtype=float
    )

    # Add postal code (German-style); randint's upper bound is exclusive
    df['postal_code'] = np.random.randint(10000, 99999, num_records)

    return df

# Generate and save dataset
# Builds the frame with the generator's default record count, writes it to a
# CSV in the current working directory without the index column, then prints
# how many rows were produced.
df = generate_motor_insurance_data()
df.to_csv('motor_insurance_dataset_756373.csv', index=False)
print(f"Dataset generated with {len(df)} records")

Dataset generated with 756373 records
