### Dataset Description

Customer ID - A unique identifier for the customer
Shopping Point - Unique identifier for the shopping point of the customer
Record Type - The record type (0: shopping point, 1: purchase point)
Day - Day of the week when the shopping point was created
Time - Time of the day when the shopping point was created
State - The state where the shopping point was created
Location Coordinate - The location where the shopping point was created
Group Size - The size of the group the customer is shopping for
Homeowner - Whether the customer is a homeowner or not
Car Age - The age of the customer's car
Car Value - Value of the customer's car at purchase time
Risk Factor - The risk factor assigned to the customer
Age Oldest - Age of the oldest person in the customer's group
Age Youngest - Age of the youngest person in the customer's group
Married Couple - Indicates whether the group includes a married couple
C Previous - Previous Car Type
Duration Previous - The duration of the customer's previous insurance policy
A - Coverage level
B - Smoking type
C - Car type
D - Purpose of the vehicle
E - Safety features
F - Driver's historic record
G - Area where the user will drive the car (rural, urban, suburban or hazardous)     
Cost - The cost of the insurance policy

## Code



In [None]:
# Install required packages
!pip install scikit-learn numpy pandas psycopg2-binary SQLAlchemy bokeh matplotlib seaborn plotly joblib cloudpickle snowflake-connector-python snowflake-sqlalchemy

from snowflake.connector.pandas_tools import write_pandas
import pandas as pd
import numpy as np
import snowflake.connector
import configparser
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from datetime import datetime, timedelta
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from scipy.stats import mode
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
import joblib
import cloudpickle
import warnings
import sklearn
import sqlalchemy
import matplotlib

warnings.filterwarnings('ignore')

# Function to check package versions
def check_versions():
    """Collect the installed version of every package this notebook relies on.

    Versions are read from the installed distribution metadata, so the
    function reports a real version string for every package (including
    scipy and bokeh) and does not depend on any module alias being imported.

    Returns:
        dict: Maps a package label to its version string. A package that is
        not installed is reported as ``'not installed'``. The ``'datetime'``
        entry is kept for backward compatibility and holds the ISO-8601
        timestamp of the moment the check ran (the stdlib module has no
        version of its own).
    """
    from datetime import datetime
    from importlib import metadata

    # (label used in the report, distribution name on PyPI); None marks the
    # timestamp slot so the original key order is preserved.
    packages = [
        ('pandas', 'pandas'),
        ('numpy', 'numpy'),
        ('sqlalchemy', 'SQLAlchemy'),
        ('sklearn', 'scikit-learn'),
        ('datetime', None),
        ('scipy', 'scipy'),
        ('matplotlib', 'matplotlib'),
        ('seaborn', 'seaborn'),
        ('plotly', 'plotly'),
        ('bokeh', 'bokeh'),
        ('joblib', 'joblib'),
        ('cloudpickle', 'cloudpickle'),
    ]

    versions = {}
    for label, distribution in packages:
        if distribution is None:
            versions[label] = datetime.now().isoformat()
            continue
        try:
            versions[label] = metadata.version(distribution)
        except metadata.PackageNotFoundError:
            versions[label] = 'not installed'

    return versions

# Print versions
versions = check_versions()
for package, version in versions.items():
    print(f"{package}: {version}")

# Additional detailed info for sklearn submodules.
# NOTE: the values below are the dotted module paths in which each imported
# class/function is defined (its `__module__` attribute), not version numbers.
sklearn_submodules = {
    'model_selection': train_test_split.__module__,
    'preprocessing.StandardScaler': StandardScaler.__module__,
    'preprocessing.MinMaxScaler': MinMaxScaler.__module__,
    'preprocessing.LabelEncoder': LabelEncoder.__module__,
    'feature_selection.SelectFromModel': SelectFromModel.__module__,
    'cluster.KMeans': KMeans.__module__,
    'cluster.MiniBatchKMeans': MiniBatchKMeans.__module__,
    'ensemble.RandomForestRegressor': RandomForestRegressor.__module__,
    'tree.DecisionTreeRegressor': DecisionTreeRegressor.__module__,
    'linear_model.LinearRegression': LinearRegression.__module__,
    'svm.SVR': SVR.__module__,
    'neighbors.KNeighborsRegressor': KNeighborsRegressor.__module__,
    'neural_network.MLPRegressor': MLPRegressor.__module__,
    'ensemble.GradientBoostingRegressor': GradientBoostingRegressor.__module__,
    'metrics.mean_squared_error': mean_squared_error.__module__,
    'metrics.mean_absolute_error': mean_absolute_error.__module__,
    'metrics.r2_score': r2_score.__module__,
}

print("\nsklearn submodules:")
# `version` here actually holds a module path (see note above); the name is
# reused from the version-printing loop earlier in this cell.
for submodule, version in sklearn_submodules.items():
    print(f"{submodule}: {version}")


In [None]:
# import pandas as pd
# import numpy as np
# from sqlalchemy import create_engine
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from datetime import datetime, timedelta
# from sklearn.feature_selection import SelectFromModel
# from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# from sklearn.cluster import KMeans
# from sklearn.cluster import MiniBatchKMeans
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.svm import SVR
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.neural_network import MLPRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from scipy.stats import mode
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
# from bokeh.plotting import figure, output_file, show, output_notebook
# from bokeh.models import HoverTool, ColumnDataSource
# import joblib
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
import psycopg2
import configparser
from sqlalchemy import create_engine
import pandas as pd

# Load database configuration from config file.
# ConfigParser.read() silently skips files it cannot open, so fail fast with a
# clear message instead of a confusing KeyError on the section lookup below.
config = configparser.ConfigParser()
if not config.read('db_config.ini'):
    raise FileNotFoundError("Could not read database configuration file 'db_config.ini'")

host = config['database']['DB_HOST']
port = config['database'].getint('DB_PORT')
user = config['database']['DB_USER']
password = config['database']['DB_PASSWORD']
database = config['database']['DB_NAME']

# Create connection string
connection_string = f'postgresql://{user}:{password}@{host}:{port}/{database}'

# Pre-bind both handles so the cleanup in `finally` never hits an unbound name
# when connect() or cursor() itself is the call that fails.
connection = None
cursor = None

# Connect to PostgreSQL
try:
    connection = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        database=database
    )

    cursor = connection.cursor()

    # Execute SQL query
    query = 'SELECT * FROM insurance_policy_data'
    cursor.execute(query)

    # Fetch all rows into a list of tuples
    rows = cursor.fetchall()

    # Convert the fetched data into a Pandas DataFrame
    columns = [desc[0] for desc in cursor.description]  # Get column names
    df1 = pd.DataFrame(rows, columns=columns)

    print("Data loaded successfully into DataFrame:")

except psycopg2.Error as e:
    print(f"Error connecting to PostgreSQL: {e}")

finally:
    # Release whatever was actually acquired, in reverse order of acquisition.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
        print("PostgreSQL connection is closed")

In [None]:
df1.head(3)

In [None]:
df1.shape

In [None]:
df1.info()

In [None]:
df1.describe().transpose()

In [None]:
duplicates_count = df1.duplicated().sum()

if duplicates_count > 0:
  print('Number of duplicate rows:', duplicates_count)
  print('Dropping duplicate rows...')
  df1 = df1.drop_duplicates()
  duplicates_count = df1.duplicated().sum()
print('Number of duplicate rows:', duplicates_count)

In [None]:
def calculate_missing_values(df1):
    """Summarise missing data per column.

    Prints the frame's column and row counts, then returns a table indexed by
    column name with the number of nulls and the percentage of rows that are
    null for each column.
    """
    row_total, column_total = df1.shape
    print ("The dataframe has", str(column_total), "columns and", str(row_total), "Rows.\n")

    null_counts = df1.isnull().sum()
    null_percent = 100 * df1.isnull().sum() / len(df1)

    summary = pd.concat([null_counts, null_percent], axis=1)
    # Column labels are kept exactly as before (including the trailing space).
    summary.columns = ['Missing Values ', '% of Total Values']
    return summary

calculate_missing_values(df1)

In [None]:
# Set the figure size
plt.figure(figsize=(10, 8))

# Use Seaborn to create a heatmap of missing values
sns.heatmap(df1.isnull(), cbar=False, cmap='viridis')

# Set the title and labels
plt.title('Heatmap of Missing Values')
plt.xlabel('Columns')
plt.ylabel('Rows')

# Show the plot
plt.show()

In [None]:
def unique_values_with_counts(df1):
    """Print the value counts of every low-cardinality column.

    A column is reported only when `value_counts()` yields at most 40 distinct
    entries; columns with more distinct values are skipped.
    """
    max_distinct = 40
    for name in df1.columns:
        counts = df1[name].value_counts()
        if len(counts) > max_distinct:
            continue
        print(f"Column: {name}")
        print(counts)
        print()

unique_values_with_counts(df1)

## State codes and names

state             state_name

0     FL                Florida

1     NY              New York

2     PA           Pennsylvania

3     OH                 Ohio

4     MD              Maryland

5     IN              Indiana

6     WA           Washington

7     CO              Colorado

8     AL              Alabama

9     CT         Connecticut

10    TN           Tennessee

11    KY            Kentucky

12    NV              Nevada

13    MO             Missouri

14    OR              Oregon

15    UT                Utah

16    OK            Oklahoma

17    MS          Mississippi

18    AR             Arkansas

19    WI           Wisconsin

20    GA              Georgia

21    NH      New Hampshire

22    NM          New Mexico

23    ME               Maine

24    ID                Idaho

25    RI       Rhode Island

26    KS             Kansas

27    WV       West Virginia

28    IA                 Iowa

29    DE           Delaware

30    DC  District of Columbia

31    MT            Montana

32    NE           Nebraska

33    ND      North Dakota

34    WY            Wyoming

35    SD      South Dakota

In [None]:
num_unique_customers = df1['customer_id'].nunique()
print(f"Number of unique customer IDs: {num_unique_customers}")

## Handling Missing Values


In [None]:
def fill_missing_and_replace_zero(df, columns, group_col='customer_id', replace_zero_columns=None,
                                  zero_default_columns=('c_previous', 'duration_previous')):
    """
    Fills missing values in specified columns based on the mode of the grouped values.
    Replaces 0 values in specified columns with the mode of the group's non-zero values.

    The DataFrame is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    columns (list): List of columns to fill missing values.
    group_col (str): Column name to group by.
    replace_zero_columns (list): List of columns to replace 0 values.
    zero_default_columns (iterable): Columns whose remaining missing values
        (groups with no observed value at all) are set to 0.

    Returns:
    pd.DataFrame: DataFrame with missing values and zero values handled.
    """
    def _fill_with_group_mode(values):
        # A group with no observed value has no mode; leave it untouched.
        modes = values.mode()
        return values if modes.empty else values.fillna(modes.iloc[0])

    def _replace_zero_with_group_mode(values):
        # 0 is the placeholder being repaired, so it must not take part in the
        # vote; otherwise a group dominated by zeros would never be fixed.
        modes = values[values != 0].mode()
        return values if modes.empty else values.replace(0, modes.iloc[0])

    for column in columns:
        df[column] = df.groupby(group_col)[column].transform(_fill_with_group_mode)
        # Special case for columns that should have 0 as default value if no mode is found.
        # Assign the result back: an in-place fillna on `df[column]` may act on
        # a temporary copy and leave `df` unchanged.
        if column in zero_default_columns:
            df[column] = df[column].fillna(0)

    for column in replace_zero_columns or ():
        df[column] = df.groupby(group_col)[column].transform(_replace_zero_with_group_mode)

    return df

# Define the columns to fill missing values
columns_to_fill = [
    'state_code', 'location_coord', 'group_size', 'homeowner', 'car_age', 'car_value',
    'risk_factor', 'age_oldest', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G'
]

# Define columns to replace 0 values
columns_to_replace_zero = ['state_code']

# Fill missing values and replace zeros
df1 = fill_missing_and_replace_zero(df1, columns_to_fill, replace_zero_columns=columns_to_replace_zero)

In [None]:
df1.isnull().sum()

In [None]:
# Drop rows where 'shopping_pt' is null
df1.dropna(subset=['shopping_pt'], inplace=True)

In [None]:
# Sort the DataFrame by 'customer_id' and 'shopping_pt'
df1.sort_values(['customer_id', 'shopping_pt'], inplace=True)

# Group by 'customer_id' and apply cumcount(), which counts the number of occurrences within each group,
# starting from 0 and adding 1 to start from 1 instead of 0
df1['shopping_pt'] = df1.groupby('customer_id').cumcount() + 1

In [None]:
# Function to handle missing record_type values according to specified rules
def fill_missing_record_type(group):
    """Fill missing `record_type` values for one customer's rows.

    The rows are ordered by `shopping_pt` and re-indexed from 0. A missing
    value on the last row becomes 1 and a missing value on any earlier row
    becomes 0. Values that are already present are left untouched.

    Parameters:
    group (pd.DataFrame): Rows of a single customer; needs the columns
        'shopping_pt' and 'record_type'.

    Returns:
    pd.DataFrame: Sorted, re-indexed copy of the group with `record_type` filled.
    """
    # Ensure group is sorted by 'shopping_pt'
    group = group.sort_values('shopping_pt').reset_index(drop=True)
    if group.empty:
        return group

    # Per-row defaults: 0 everywhere except 1 on the final row.
    defaults = pd.Series(0, index=group.index)
    defaults.iloc[-1] = 1

    # Column-level assignment instead of chained `.iloc[...] =` writes, which
    # can silently operate on a temporary copy.
    group['record_type'] = group['record_type'].fillna(defaults)

    return group

# Apply the function to each group of 'customer_id'
df1 = df1.groupby('customer_id', group_keys=False).apply(fill_missing_record_type)

# Reset index
df1.reset_index(drop=True, inplace=True)

In [None]:
def fill_missing_days(df1):
    """Fill missing `day` values from the same customer's neighbouring rows.

    Each customer's gaps are forward-filled first and then backward-filled, so
    every row gets a day as long as the customer has at least one known day.
    Customers with no known day at all keep their missing values.

    The previous per-customer loop was removed: after the two fills it could
    only ever reach customers whose days are all missing, where looking up the
    most frequent day raised IndexError, and it rescanned the whole frame once
    per customer.

    The DataFrame is modified in place and also returned.
    """
    # Forward fill missing values within each customer group
    df1['day'] = df1.groupby('customer_id')['day'].ffill()

    # Backward fill missing values within each customer group (covers leading gaps);
    # regroup so the fill sees the column produced by the forward fill.
    df1['day'] = df1.groupby('customer_id')['day'].bfill()

    return df1

df1 = fill_missing_days(df1)

In [None]:
# Convert 'time' to datetime for easier manipulation
df1['time'] = pd.to_datetime(df1['time'], format='%H:%M:%S', errors='coerce')

# Sort dataframe by 'customer_id' and 'shopping_pt' for sequential processing
df1 = df1.sort_values(by=['customer_id', 'shopping_pt'])

# Function to handle missing time values according to specified rules
def fill_missing_times(group):
    """Fill missing `time` values for one customer's rows.

    Rows are processed in `shopping_pt` order with these rules:
      * first row: two minutes before the second row's time, or, when it is
        the only row, a default of 15:00:00;
      * middle rows: two minutes after the previous row when both share the
        same day, otherwise two minutes before the next row when those share
        the same day, otherwise left missing;
      * last row: two minutes after the previous row when both share the same
        day, otherwise the 15:00:00 default.

    A fill derived from a neighbour whose own time is still missing stays
    missing.

    Parameters:
    group (pd.DataFrame): Rows of a single customer with the columns
        'shopping_pt', 'day' and a datetime-typed 'time'.

    Returns:
    pd.DataFrame: The group sorted by 'shopping_pt' with `time` filled.
    """
    # Ensure group is sorted by 'shopping_pt'
    group = group.sort_values('shopping_pt')

    n = len(group)
    if n == 0:
        return group

    step = pd.Timedelta(minutes=2)
    default_offset = pd.Timedelta(hours=15, minutes=0, seconds=0)

    # Work on plain lists and write the column back once; element-wise
    # `group['time'].iloc[i] = ...` is a chained assignment that may update a
    # temporary copy instead of `group`.
    times = group['time'].tolist()
    days = group['day'].tolist()

    def default_time(day):
        # Same expression as before; NOTE(review): only the time of day of this
        # value appears to be used downstream - confirm before relying on its date.
        return pd.Timestamp(day) + default_offset

    # Handle first row
    if pd.isnull(times[0]):
        times[0] = times[1] - step if n > 1 else default_time(days[0])

    # Handle middle rows
    for i in range(1, n - 1):
        if pd.isnull(times[i]):
            if days[i] == days[i - 1]:
                times[i] = times[i - 1] + step
            elif days[i] == days[i + 1]:
                times[i] = times[i + 1] - step

    # Handle last row if more than one row exists
    if n > 1 and pd.isnull(times[-1]):
        if days[-1] == days[-2]:
            times[-1] = times[-2] + step
        else:
            times[-1] = default_time(days[-1])

    group['time'] = pd.to_datetime(pd.Series(times, index=group.index))

    return group

# Apply the function to each group of 'customer_id'
df1 = df1.groupby('customer_id', group_keys=False).apply(fill_missing_times)

# Convert 'time' back to string format
df1['time'] = df1['time'].dt.strftime('%H:%M:%S')

# Reset index
df1.reset_index(drop=True, inplace=True)

In [None]:
df1.isnull().sum()

In [None]:
# Drop rows with NaN values in all columns except 'risk_factor'
df1 = df1.dropna(subset=[col for col in df1.columns if col != 'risk_factor'])

In [None]:
df1.isnull().sum()

## EDA

### Univariate Analysis

#### Bar Plots and Histograms

In [None]:
shopping_counts = df1['shopping_pt'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
shopping_counts.columns = ['Shopping Points', 'Count']

# Create a figure using Plotly Express
fig = px.bar(shopping_counts, x='Shopping Points', y='Count',
             labels={'Shopping Points': 'Shopping Points', 'Count': 'Count'},
             title='Distribution of shopping_pt')

fig.show()

In [None]:
day_counts = df1['day'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
day_counts.columns = ['Day', 'Count']

# Create a figure using Plotly Express
fig = px.bar(day_counts, x='Day', y='Count',
             labels={'Day': 'Day', 'Count': 'Count'},
             title='Distribution of day')

fig.show()

In [None]:
state_counts = df1['state_code'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
state_counts.columns = ['State', 'Count']

# Create a figure using Plotly Express
fig = px.bar(state_counts, x='State', y='Count',
             labels={'State': 'State', 'Count': 'Count'},
             title='Distribution of state')

fig.show()

In [None]:
group_size_counts = df1['group_size'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
group_size_counts.columns = ['Group Size', 'Count']

# Create a figure using Plotly Express
fig = px.bar(group_size_counts, x='Group Size', y='Count',
             labels={'Group Size': 'Group Size', 'Count': 'Count'},
             title='Distribution of group_size')

fig.show()

In [None]:
fig = px.histogram(df1, x=df1['car_age'], nbins=10, title=f'Distribution of car_age')
fig.show()

In [None]:
# Count how often each car_value category occurs. A dedicated variable is used
# so the `state_counts` table built for the state plot is not overwritten.
car_value_counts = df1['car_value'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
car_value_counts.columns = ['Car Value', 'Count']

# Create a figure using Plotly Express; the `labels` keys must match the
# DataFrame column names to take effect.
fig = px.bar(car_value_counts, x='Car Value', y='Count',
             labels={'Car Value': 'Car Value', 'Count': 'Count'},
             title='Distribution of Car Value')

fig.show()

In [None]:
fig = px.histogram(df1, x=df1['age_oldest'], nbins=10, title=f'Distribution of age_oldest')
fig.show()

In [None]:
fig = px.histogram(df1, x=df1['age_youngest'], nbins=10, title=f'Distribution of age_youngest')
fig.show()

In [None]:
C_previous_counts = df1['c_previous'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
C_previous_counts.columns = ['C previous', 'Count']

# Create a figure using Plotly Express
fig = px.bar(C_previous_counts, x='C previous', y='Count',
             labels={'C previous': 'C previous', 'Count': 'Count'},
             title='Distribution of c_previous')

fig.show()

In [None]:
duration_previous_counts = df1['duration_previous'].value_counts().reset_index()

# Rename columns to match expected by Plotly Express
duration_previous_counts.columns = ['Duration Previous', 'Count']

# Create a figure using Plotly Express
fig = px.bar(duration_previous_counts, x='Duration Previous', y='Count',
             labels={'Duration Previous': 'Duration Previous', 'Count': 'Count'},
             title='Distribution of duration_previous')

fig.show()

#### Pie Chart

In [None]:
fig = px.pie(df1, names='group_size', title='Group Size Distribution')
fig.show()

In [None]:
fig = px.pie(df1, names='homeowner', title='Homeowner Status Distribution')
fig.show()

In [None]:
# Share of rows per married_couple value (title fixed: it previously repeated
# the homeowner chart's title).
fig = px.pie(df1, names='married_couple', title='Married Couple Distribution')
fig.show()

### Bivariate Analysis

#### Scatter Plot

In [None]:
# Prepare data
source = ColumnDataSource(data=dict(
    car_age=df1['car_age'],
    cost=df1['cost'],
    state_code=df1['state_code'],
    homeowner=df1['homeowner']
))

# Configure Bokeh output
output_file("scatter_plot.html")  # For saving the output to an HTML file
output_notebook()  # For displaying the output in a Jupyter Notebook

# Create scatter plot with hover tool
hover = HoverTool(tooltips=[
    ("Car Age", "@car_age"),
    ("Insurance Cost", "@cost"),
    ("State Code", "@state_code"),
    ("Homeowner", "@homeowner")
])

p = figure(width=800, height=600, tools=[hover], title="Interactive Scatter Plot")
p.circle('car_age', 'cost', size=10, source=source)

show(p)

#### Box Plot

In [None]:
# Box plot of insurance cost (y) for each state_code (x)
fig = px.box(df1, x='state_code', y='cost', title='State Code vs. Insurance Cost',
             labels={'state_code': 'State Code', 'cost': 'Insurance Cost'})

# Order the x-axis categories using Plotly's 'total ascending' category ordering
fig.update_layout(xaxis={'categoryorder': 'total ascending'})

fig.show()

In [None]:
# Box plot of insurance cost (y) for each car_age value (x)
fig = px.box(df1, x='car_age', y='cost', title='Car Age vs. Insurance Cost',
             labels={'car_age': 'Car Age', 'cost': 'Insurance Cost'})

# NOTE(review): car_age looks numeric; 'categoryorder' may have no effect on a
# numeric axis - confirm whether this line is needed.
fig.update_layout(xaxis={'categoryorder': 'total ascending'})

fig.show()

#### Pairplot with Tooltips

In [None]:
# Create pairplot using seaborn
sns.set(style="ticks")
sns_plot = sns.pairplot(df1[['car_age', 'cost', 'homeowner']], diag_kind='kde')

# Convert seaborn pairplot to Plotly figure
fig = go.Figure()
fig.add_trace(go.Scatter(x=sns_plot.data.car_age, y=sns_plot.data.cost, mode='markers',
                         marker=dict(size=8, opacity=0.6),
                         text=df1['state_code'] + ', Homeowner: ' + df1['homeowner'].astype(str),
                         hoverinfo='text'))

fig.update_layout(title='Interactive Pairplot: Car Age vs. Insurance Cost',
                  xaxis_title='Car Age',
                  yaxis_title='Insurance Cost')

fig.show()

## Fill in Missing Values in Risk Factor

In [None]:
data = df1.copy()

In [None]:
df = data.copy()

In [None]:
df = df[df['car_age']<26]

In [None]:
df.shape

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
import cloudpickle

# Cluster the rows of `df` so that missing risk_factor values can later be
# imputed from rows in the same cluster.

# Columns used as-is for clustering
features = ['car_age', 'age_oldest', 'age_youngest', 'homeowner', 'group_size', 'married_couple', 'F', 'cost']

# Integer-encode the two categorical columns (this overwrites them in `df`)
le_state_code = LabelEncoder()
le_car_value = LabelEncoder()
df['state_code'] = le_state_code.fit_transform(df['state_code'])
df['car_value'] = le_car_value.fit_transform(df['car_value'])

# Build lookup tables mapping each original category to its integer code
state_code_mapping = pd.DataFrame({'original_value': le_state_code.classes_, 'encoded_value': range(len(le_state_code.classes_))})
car_value_mapping = pd.DataFrame({'original_value': le_car_value.classes_, 'encoded_value': range(len(le_car_value.classes_))})

# Save the lookup DataFrames as CSV files
state_code_mapping.to_csv('state_code_mapping.csv', index=False)
car_value_mapping.to_csv('car_value_mapping.csv', index=False)

# Assemble the clustering matrix: the raw feature columns plus the two encoded
# categorical columns. No scaling is applied anywhere in this cell.
# NOTE(review): KMeans is distance based, so the column with the largest
# numeric range will presumably dominate - consider scaling; confirm intent.
# `encoded_features` is not used again in this cell.
encoded_features = df[['state_code', 'car_value']]
X_processed = df[features + ['state_code', 'car_value']]

# Save the label encoders using cloudpickle
with open('le_state_code.pkl', 'wb') as f:
    cloudpickle.dump(le_state_code, f)

with open('le_car_value.pkl', 'wb') as f:
    cloudpickle.dump(le_car_value, f)

# Fit MiniBatchKMeans with 4 clusters (fixed seed) and get each row's cluster id
kmeans = MiniBatchKMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_processed)

# Attach the cluster id to `df` for the imputation step
df['cluster'] = clusters

# Function to impute missing RISK_FACTOR using vectorized operations
def impute_risk_factor(df):
    """Return `risk_factor` with gaps filled by the mode of each row's cluster.

    For every value of the 'cluster' column the most frequent non-missing
    risk_factor is determined; rows whose risk_factor is missing receive their
    cluster's value. A cluster without any known risk_factor stays missing.
    """
    def _most_common(values):
        # Series.mode() ignores NaN and lists the modes in ascending order.
        candidates = values.mode()
        if len(candidates) == 0:
            return np.nan
        return candidates.iloc[0]

    per_row_cluster_mode = df.groupby('cluster')['risk_factor'].transform(_most_common)
    filled = df['risk_factor'].fillna(per_row_cluster_mode)
    return filled

# Apply imputation using vectorized function
df['risk_factor'] = impute_risk_factor(df)

# Drop the cluster column as it's no longer needed
df.drop(columns=['cluster'], inplace=True)

# Print a message to indicate the mappings have been saved
print("Lookup files for state code and car value have been saved as 'state_code_mapping.csv' and 'car_value_mapping.csv'.")

# import numpy as np
# import pandas as pd
# from sklearn.cluster import MiniBatchKMeans
# from scipy.stats import mode
# from sklearn.preprocessing import LabelEncoder
# import cloudpickle

# # Assuming 'df' is your DataFrame containing the relevant data

# # Identify features for clustering
# features = ['car_age', 'age_oldest', 'age_youngest', 'homeowner', 'group_size', 'married_couple', 'F', 'cost']

# # Encode categorical features
# le_state_code = LabelEncoder()
# le_car_value = LabelEncoder()
# df['state_code'] = le_state_code.fit_transform(df['state_code'])
# df['car_value'] = le_car_value.fit_transform(df['car_value'])

# # Prepare full set of features for clustering (both scaled numerical and categorical)
# encoded_features = df[['state_code', 'car_value']]
# X_processed = df[features + ['state_code', 'car_value']]

# # Save the label encoders using cloudpickle
# with open('le_state_code.pkl', 'wb') as f:
#     cloudpickle.dump(le_state_code, f)

# with open('le_car_value.pkl', 'wb') as f:
#     cloudpickle.dump(le_car_value, f)

# # Apply MiniBatchKMeans clustering for potentially faster convergence
# kmeans = MiniBatchKMeans(n_clusters=4, random_state=42)
# clusters = kmeans.fit_predict(X_processed)

# # Add clusters to the original dataframe
# df['cluster'] = clusters

# # Function to impute missing RISK_FACTOR using vectorized operations
# def impute_risk_factor(df):
#     # Calculate mode within each cluster and fillna in 'risk_factor'
#     def calculate_mode(x):
#         if not x.dropna().empty:
#             mode_result = mode(x.dropna())
#             if isinstance(mode_result.mode, np.ndarray):
#                 return mode_result.mode[0]
#             else:
#                 return mode_result.mode
#         else:
#             return np.nan

#     cluster_risk_factors = df.groupby('cluster')['risk_factor'].transform(calculate_mode)
#     return df['risk_factor'].fillna(cluster_risk_factors)

# # Apply imputation using vectorized function
# df['risk_factor'] = impute_risk_factor(df)

# # Drop the cluster column as it's no longer needed
# df.drop(columns=['cluster'], inplace=True)

In [None]:
df['risk_factor'].value_counts()

In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
# df.to_csv('preprocessed_data.csv', index=False)

In [None]:
# Work on a copy for the Snowflake upload so `df` keeps its original column
# names for the feature-selection and modelling cells further down.
snowflake_df = df.copy()

In [None]:
snowflake_df.columns = map(lambda x: str(x).upper(), df.columns)

In [None]:
snowflake_df

In [None]:
import pandas as pd
import configparser
from sqlalchemy import create_engine
import snowflake.connector

# Read the configuration file
config = configparser.ConfigParser()
config.read('snowflake_cred.ini')

# Get the Snowflake credentials
snowflake_config = config['snowflake']
user = snowflake_config['user']
password = snowflake_config['password']
account = snowflake_config['account']
warehouse = snowflake_config['warehouse']
database = snowflake_config['database']
schema = snowflake_config['schema']
table = snowflake_config['table']
role = snowflake_config['role']

# Open a Snowflake connection with the native connector
conn = snowflake.connector.connect(
    user=user,
    password=password,
    account=account,
    warehouse=warehouse,
    database=database,
    schema=schema,
    role=role,
)

# Write the DataFrame to the Snowflake table; always close the connection,
# even when the upload raises.
try:
    success, nchunks, nrows, _ = write_pandas(conn, snowflake_df, table)
finally:
    conn.close()

# Report the table that was actually written to (taken from the config)
if success:
    print(f"Successfully wrote {nrows} rows in {nchunks} chunks to the Snowflake table '{table}'.")
else:
    print("Failed to write data to the Snowflake table.")

## Feature Selection

In [None]:
df.head(2)

In [None]:
data1 = df.copy()
data2 = data1.copy()
data3 = data1.copy()
data4 = data1.copy()

### 1. Filter Methods - Correlation Matrix with Heatmap:

In [None]:
# Select only numeric columns
numeric_cols = data1.select_dtypes(include=[np.number])

# Calculate correlation matrix
corr_matrix = numeric_cols.corr()

# Plot heatmap
plt.figure(figsize=(20, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

### 2. Wrapper Methods - Recursive Feature Elimination (RFE):

#### Linear Regression

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Uses 'data2', a copy of the preprocessed DataFrame

# Select only numeric columns
numeric_cols = data2.select_dtypes(include=[np.number])

# Separate features and target variable
X = numeric_cols.drop('cost', axis=1)
y = numeric_cols['cost']

# Example using Linear Regression
model = LinearRegression()
rfe = RFE(model, n_features_to_select=16)
fit = rfe.fit(X, y)

print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

# Get the selected features
selected_features = X.columns[fit.support_]
print("Selected Features Names: %s" % selected_features)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Select only numeric columns
numeric_cols = data2.select_dtypes(include=[np.number])

# Separate features and target variable
X = numeric_cols.drop('cost', axis=1)
y = numeric_cols['cost']

# Example using Linear Regression
model = LinearRegression()
rfe = RFE(model, n_features_to_select=12)
fit = rfe.fit(X, y)

print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

# Get the selected features
selected_features = X.columns[fit.support_]
print("Selected Features Names: %s" % selected_features)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Select only numeric columns
numeric_cols = data2.select_dtypes(include=[np.number])

# Separate features and target variable
X = numeric_cols.drop('cost', axis=1)
y = numeric_cols['cost']

# Example using Linear Regression
model = LinearRegression()
rfe = RFE(model, n_features_to_select=16)
fit = rfe.fit(X, y)

print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

# Get the selected features
selected_features = X.columns[fit.support_]
print("Selected Features Names: %s" % selected_features)

In [None]:
data2.head(2)

### 3. Embedded Methods - Lasso Regression:

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import numpy as np

# Select only numeric columns
numeric_cols = data3.select_dtypes(include=[np.number])

# Separate features and target variable
X = numeric_cols.drop('cost', axis=1)
y = numeric_cols['cost']

# Lasso model
model = Lasso(alpha=0.01)
model.fit(X, y)

# Select from model
selector = SelectFromModel(model, prefit=True)
selected_features = selector.get_support()

# Get the names of the selected features
selected_feature_names = X.columns[selected_features]

print("Selected Features: %s" % selected_feature_names)

## DROP DUPLICATE ROWS

In [None]:
final_df = df[['state_code', 'group_size', 'homeowner', 'car_age', 'car_value', 'age_youngest', 'married_couple',
       'c_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'cost']]

duplicates_count = final_df.duplicated().sum()

if duplicates_count > 0:
  print('Number of duplicate rows:', duplicates_count)
  print('Dropping duplicate rows...')
  final_df = final_df.drop_duplicates()
  duplicates_count = final_df.duplicated().sum()
print('Number of duplicate rows:', duplicates_count)

In [None]:
final_df.shape

## Model Training and Evaluation

In [None]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# End-to-end run: 16 predictors, min-max scaling of 'car_age' and
# 'age_youngest', 120-tree random forest. Scalers and model are saved to disk.
experiment_columns = [
    'state_code', 'group_size', 'homeowner', 'car_age', 'car_value',
    'age_youngest', 'married_couple', 'c_previous', 'duration_previous',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'cost',
]
# NOTE(review): selecting from `df` again replaces the de-duplicated
# `final_df` produced by the "DROP DUPLICATE ROWS" cell — confirm intended.
final_df = df[experiment_columns]

# Target / predictors, then a reproducible 70/30 hold-out split.
y = final_df['cost']
X = final_df.drop(columns=['cost'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One MinMaxScaler per column so each can be saved and reloaded on its own.
features_to_scale_1 = ['car_age']
features_to_scale_2 = ['age_youngest']
car_age_scaler = MinMaxScaler()
age_youngest_scaler = MinMaxScaler()

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
scalers = {}
for scaler_key, column_scaler, scaled_columns, dump_path in (
    ('car_age_scaler', car_age_scaler, features_to_scale_1, 'car_age_scaler.joblib'),
    ('age_youngest_scaler', age_youngest_scaler, features_to_scale_2, 'age_youngest_scaler.joblib'),
):
    # Fit on the training rows only; apply the same mapping to the test rows.
    X_train_scaled[scaled_columns] = column_scaler.fit_transform(X_train[scaled_columns])
    X_test_scaled[scaled_columns] = column_scaler.transform(X_test[scaled_columns])
    scalers[scaler_key] = column_scaler
    joblib.dump(column_scaler, dump_path)

model = RandomForestRegressor(n_estimators=120, random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions for both splits, then MAE / R^2 / MSE for each.
y_train_pred, y_test_pred = (
    model.predict(split_part) for split_part in (X_train_scaled, X_test_scaled)
)
train_mae, train_r2, train_mse = (
    metric(y_train, y_train_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
test_mae, test_r2, test_mse = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

print(f"\nTrain R-squared Score: {train_r2}")
print(f"Test R-squared Score: {test_r2}")

print(f"\nTrain MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

# Persist the fitted forest.
joblib.dump(model, 'model.joblib')

In [None]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Repeat of the preceding training cell with the same columns and settings:
# 16 predictors, min-max scaling of 'car_age' and 'age_youngest',
# 120-tree random forest. It overwrites the same output files.
experiment_columns = [
    'state_code', 'group_size', 'homeowner', 'car_age', 'car_value',
    'age_youngest', 'married_couple', 'c_previous', 'duration_previous',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]

# Target / predictors, then a reproducible 70/30 hold-out split.
y = final_df['cost']
X = final_df.drop(columns=['cost'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One MinMaxScaler per column so each can be saved and reloaded on its own.
features_to_scale_1 = ['car_age']
features_to_scale_2 = ['age_youngest']
car_age_scaler = MinMaxScaler()
age_youngest_scaler = MinMaxScaler()

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
scalers = {}
for scaler_key, column_scaler, scaled_columns, dump_path in (
    ('car_age_scaler', car_age_scaler, features_to_scale_1, 'car_age_scaler.joblib'),
    ('age_youngest_scaler', age_youngest_scaler, features_to_scale_2, 'age_youngest_scaler.joblib'),
):
    # Fit on the training rows only; apply the same mapping to the test rows.
    X_train_scaled[scaled_columns] = column_scaler.fit_transform(X_train[scaled_columns])
    X_test_scaled[scaled_columns] = column_scaler.transform(X_test[scaled_columns])
    scalers[scaler_key] = column_scaler
    joblib.dump(column_scaler, dump_path)

model = RandomForestRegressor(n_estimators=120, random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions for both splits, then MAE / R^2 / MSE for each.
y_train_pred, y_test_pred = (
    model.predict(split_part) for split_part in (X_train_scaled, X_test_scaled)
)
train_mae, train_r2, train_mse = (
    metric(y_train, y_train_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
test_mae, test_r2, test_mse = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

print(f"\nTrain R-squared Score: {train_r2}")
print(f"Test R-squared Score: {test_r2}")

print(f"\nTrain MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

# Persist the fitted forest.
joblib.dump(model, 'model.joblib')

In [None]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Variant of the baseline run that adds 'risk_factor' (17 predictors) and
# leaves the forest at its default number of trees.
experiment_columns = [
    'state_code', 'group_size', 'homeowner', 'car_age', 'car_value',
    'risk_factor', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]

# NOTE(review): not the cell's last statement, so this preview is presumably
# never displayed — confirm it can go.
final_df.head()

# Target / predictors, then a reproducible 70/30 hold-out split.
y = final_df['cost']
X = final_df.drop(columns=['cost'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One MinMaxScaler per column so each can be saved and reloaded on its own.
features_to_scale_1 = ['car_age']
features_to_scale_2 = ['age_youngest']
car_age_scaler = MinMaxScaler()
age_youngest_scaler = MinMaxScaler()

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
scalers = {}
for scaler_key, column_scaler, scaled_columns, dump_path in (
    ('car_age_scaler', car_age_scaler, features_to_scale_1, 'car_age_scaler.joblib'),
    ('age_youngest_scaler', age_youngest_scaler, features_to_scale_2, 'age_youngest_scaler.joblib'),
):
    # Fit on the training rows only; apply the same mapping to the test rows.
    X_train_scaled[scaled_columns] = column_scaler.fit_transform(X_train[scaled_columns])
    X_test_scaled[scaled_columns] = column_scaler.transform(X_test[scaled_columns])
    scalers[scaler_key] = column_scaler
    joblib.dump(column_scaler, dump_path)

model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Predictions for both splits, then MAE / R^2 / MSE for each.
y_train_pred, y_test_pred = (
    model.predict(split_part) for split_part in (X_train_scaled, X_test_scaled)
)
train_mae, train_r2, train_mse = (
    metric(y_train, y_train_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
test_mae, test_r2, test_mse = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

print(f"\nTrain R-squared Score: {train_r2}")
print(f"Test R-squared Score: {test_r2}")

print(f"\nTrain MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

# Persist the fitted forest.
joblib.dump(model, 'model.joblib')

### *** Model Training 00 - Random Forest Regressor (16 features) - (0.87536)

In [None]:
# Columns for this run: 16 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'group_size', 'homeowner', 'car_age', 'car_value',
    'risk_factor', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'B', 'C', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]

# Disabled experiment: cast the float-typed predictors to int.
# columns_to_convert = ['state_code', 'group_size', 'homeowner', 'car_age', 'risk_factor', 'age_youngest', 'married_couple', 'c_previous', 'duration_previous', 'A', 'B', 'C', 'E', 'F', 'G']
# final_df[columns_to_convert] = final_df[columns_to_convert].astype(int)

final_df.head()

In [None]:
# Column dtypes of the selected frame.
final_df.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# One MinMaxScaler per column so each can be saved and reloaded on its own.
features_to_scale_1 = ['car_age']
features_to_scale_2 = ['age_youngest']
car_age_scaler = MinMaxScaler()
age_youngest_scaler = MinMaxScaler()

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Name -> fitted scaler, filled in as each scaler is fitted.
scalers = {}
for scaler_key, column_scaler, scaled_columns, dump_path in (
    ('car_age_scaler', car_age_scaler, features_to_scale_1, 'car_age_scaler.joblib'),
    ('age_youngest_scaler', age_youngest_scaler, features_to_scale_2, 'age_youngest_scaler.joblib'),
):
    # Fit on the training rows only; apply the same mapping to the test rows.
    X_train_scaled[scaled_columns] = column_scaler.fit_transform(X_train[scaled_columns])
    X_test_scaled[scaled_columns] = column_scaler.transform(X_test[scaled_columns])
    scalers[scaler_key] = column_scaler
    # Each scaler also gets its own file.
    joblib.dump(column_scaler, dump_path)

# Bundle both scalers into a single artefact as well.
joblib.dump(scalers, 'scalers.joblib')

In [None]:
# Column dtypes of the scaled training frame.
X_train_scaled.dtypes

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Random forest with the library-default tree count, seeded for repeatability.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predictions for both splits.
y_train_pred, y_test_pred = (
    model.predict(split_part) for split_part in (X_train_scaled, X_test_scaled)
)

# MAE, R^2 and MSE — training split first, hold-out split second.
train_mae, train_r2, train_mse = (
    metric(y_train, y_train_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
test_mae, test_r2, test_mse = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

print(f"\nTrain R-squared Score: {train_r2}")
print(f"Test R-squared Score: {test_r2}")

print(f"\nTrain MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

In [None]:
# Persist the fitted forest next to the notebook.
joblib.dump(value=model, filename='model.joblib')

### *** Model Training 0 - Random Forest Regressor (16 features) - (0.87489)

In [None]:
# Columns for this run: 16 predictors plus the 'cost' target.
experiment_columns = [
    'record_type', 'state_code', 'group_size', 'homeowner', 'car_age',
    'car_value', 'risk_factor', 'age_youngest', 'married_couple',
    'c_previous', 'duration_previous', 'A', 'C', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Requires X_train / X_test from the split cell.

# One MinMaxScaler per column so each can be reloaded on its own.
features_to_scale_1 = ['car_age']
features_to_scale_2 = ['age_youngest']
car_age_scaler = MinMaxScaler()
age_youngest_scaler = MinMaxScaler()

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Name -> fitted scaler, filled in as each scaler is fitted.
scalers = {}
for scaler_key, column_scaler, scaled_columns in (
    ('car_age_scaler', car_age_scaler, features_to_scale_1),
    ('age_youngest_scaler', age_youngest_scaler, features_to_scale_2),
):
    # Fit on the training rows only; apply the same mapping to the test rows.
    X_train_scaled[scaled_columns] = column_scaler.fit_transform(X_train[scaled_columns])
    X_test_scaled[scaled_columns] = column_scaler.transform(X_test[scaled_columns])
    scalers[scaler_key] = column_scaler

# Both scalers go into one pickle file.
with open('scalers.pkl', 'wb') as f:
    pickle.dump(scalers, f)

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# 120-tree random forest, seeded for repeatability.
model = RandomForestRegressor(n_estimators=120, random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Pickle the fitted model itself (a single estimator, not a dictionary).
with open('model.pkl', mode='wb') as f:
    pickle.dump(model, f)

In [None]:
# Predictions for both splits.
y_train_pred, y_test_pred = (
    model.predict(split_part) for split_part in (X_train_scaled, X_test_scaled)
)

# MAE, R^2 and MSE — training split first, hold-out split second.
train_mae, train_r2, train_mse = (
    metric(y_train, y_train_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
test_mae, test_r2, test_mse = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

print(f"\nTrain R-squared Score: {train_r2}")
print(f"Test R-squared Score: {test_r2}")

print(f"\nTrain MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

### Model Training 1 - Random Forest Regressor (13 features) SC-x, B-x, G-x - (0.8270)

In [None]:
# Columns for this run: 13 predictors plus the 'cost' target.
experiment_columns = [
    'homeowner', 'group_size', 'car_age', 'car_value', 'risk_factor',
    'age_oldest', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'E', 'F', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_oldest', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### Model Training 2 - Random Forest Regressor (15 features) 0.8737858

In [None]:
# Columns for this run: 15 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'record_type', 'group_size', 'homeowner', 'car_age',
    'car_value', 'risk_factor', 'age_youngest', 'married_couple',
    'c_previous', 'duration_previous', 'A', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to min-max scale; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = MinMaxScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: 120-tree random forest, seeded for repeatability.
model = RandomForestRegressor(n_estimators=120, random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predictions for both splits.
y_train_pred, y_test_pred = (
    model.predict(split_part) for split_part in (X_train_scaled, X_test_scaled)
)

# MAE, R^2 and MSE — training split first, hold-out split second.
train_mae, train_r2, train_mse = (
    metric(y_train, y_train_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
test_mae, test_r2, test_mse = (
    metric(y_test, y_test_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Train MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

print(f"\nTrain R-squared Score: {train_r2}")
print(f"Test R-squared Score: {test_r2}")

print(f"\nTrain MSE: {train_mse}")
print(f"Test MSE: {test_mse}")

### Model Training 3 - DTR, LR, KNR, GBR (14 features) X


In [None]:
# The column selection for this experiment is commented out, so the cells
# below reuse whichever `final_df` an earlier cell left in the notebook.
# NOTE(review): the section heading says 14 features — confirm the reused
# frame matches it.
# final_df = df[['homeowner', 'group_size', 'car_age', 'car_value', 'risk_factor', 'age_oldest', 'age_youngest', 'married_couple', 'c_previous', 'duration_previous',
#               'A', 'E', 'F', 'G', 'cost']]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to min-max scale; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = MinMaxScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Four alternative regressors, all at their library defaults, keyed by the
# label used in the printed report.
models = {
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Linear Regression': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Fit each candidate on the scaled training split and report hold-out MAE / R^2.
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R-squared Score (R2): {r2}")
    print()

### *** Model Training 4 - Random Forest Regressor (14 features) B-x, AO-x - (0.875296)

In [None]:
# Columns for this run: 14 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'homeowner', 'group_size', 'car_age', 'car_value',
    'risk_factor', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### Model Training 5 - Random Forest Regressor (16 features) AO-x, CA-x, G-x (0.86228)

In [None]:
# Columns for this run: 16 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'homeowner', 'group_size', 'car_age', 'car_value',
    'risk_factor', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'B', 'C', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### *** Model Training 6 - Random Forest Regressor (16 features) - (0.8767)

In [None]:
# Columns for this run: 16 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'homeowner', 'group_size', 'car_age', 'car_value',
    'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
    'c_previous', 'duration_previous', 'A', 'B', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Imported here so the cell does not rely on an earlier cell having run.
import pickle

from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'car_age' and 'age_youngest', one scaler per column, and
# pickle both fitted scalers together.
features_to_scale_1 = ['car_age']
features_to_scale_2 = ['age_youngest']

# Name -> fitted scaler, written to 'scalers.pkl' at the end of the cell.
scalers = {}

# Copy each split exactly once. Re-copying X_train / X_test before the second
# scaler (as this cell used to do) discards the car_age scaling while still
# pickling the fitted car_age scaler.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# First scaler: fit on the training rows only, reuse it for the test rows.
car_age_scaler = MinMaxScaler()
X_train_scaled[features_to_scale_1] = car_age_scaler.fit_transform(X_train[features_to_scale_1])
X_test_scaled[features_to_scale_1] = car_age_scaler.transform(X_test[features_to_scale_1])
scalers['car_age_scaler'] = car_age_scaler

# Second scaler: applied to the same copies so both columns end up scaled.
age_youngest_scaler = MinMaxScaler()
X_train_scaled[features_to_scale_2] = age_youngest_scaler.fit_transform(X_train[features_to_scale_2])
X_test_scaled[features_to_scale_2] = age_youngest_scaler.transform(X_test[features_to_scale_2])
scalers['age_youngest_scaler'] = age_youngest_scaler

# Save the dictionary of scalers as a pickle file
with open('scalers.pkl', 'wb') as f:
    pickle.dump(scalers, f)

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### *** Model Training 7 - Random Forest Regressor (14 features) B-x, G-x - (0.86778)

In [None]:
# Columns for this run: 14 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'homeowner', 'group_size', 'car_age', 'car_value',
    'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
    'c_previous', 'duration_previous', 'A', 'E', 'F', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_oldest', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### *** Model Training 8 - Random Forest Regressor (14 features) AO-x, B-x - (0.875296)

In [None]:
# Columns for this run: 14 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'homeowner', 'group_size', 'car_age', 'car_value',
    'risk_factor', 'age_youngest', 'married_couple', 'c_previous',
    'duration_previous', 'A', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### *** Model Training 9 - Random Forest Regressor (16 features) D-y, AO-x, B-x - (0.8737)

In [None]:
# Columns for this run: 16 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'record_type', 'group_size', 'homeowner', 'car_age',
    'car_value', 'risk_factor', 'age_youngest', 'married_couple',
    'c_previous', 'duration_previous', 'A', 'D', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the 'cost' target from the predictors in `final_df`.
y = final_df['cost']
X = final_df.drop(columns=['cost'])

# Reproducible 70/30 hold-out split (seed 42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Scale into copies; X_train / X_test keep their raw values.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

# Fit on the training rows only, then apply that one fitted transform to
# both splits.
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: dimensions of each split, features first and targets second.
for split_part in (X_train_scaled, X_test_scaled, y_train, y_test):
    print(split_part.shape)

In [None]:
# Step 2: random forest with the library-default tree count, seeded.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Step 3: predict the held-out rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Hold-out MAE, R^2 (coefficient of determination) and MSE for `y_pred`.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Mean Squared Error: {mse}")

### *** Model Training 10 - Random Forest Regressor (15 features) - (0.8737937)

In [None]:
# Columns for this run: 15 predictors plus the 'cost' target.
experiment_columns = [
    'state_code', 'record_type', 'group_size', 'homeowner', 'car_age',
    'car_value', 'risk_factor', 'age_youngest', 'married_couple',
    'c_previous', 'duration_previous', 'A', 'E', 'F', 'G', 'cost',
]
final_df = df[experiment_columns]
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
# Fit a 120-tree random forest (seeded) directly on the unscaled training split.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
best_model = RandomForestRegressor(n_estimators=120, random_state=42).fit(X_train, y_train)
# Leave the estimator as the final expression so the notebook still displays it.
best_model

In [None]:
# Training-split predictions and scores.
y_train_pred = best_model.predict(X_train)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

# Test-split predictions and scores.
y_test_pred = best_model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Report train and test values side by side for each metric.
print(
    f"Train MAE: {train_mae}",
    f"Test MAE: {test_mae}",
    f"\nTrain R-squared Score: {train_r2}",
    f"Test R-squared Score: {test_r2}",
    f"\nTrain MSE: {train_mse}",
    f"Test MSE: {test_mse}",
    sep="\n",
)

### *** Model Training 11 - Random Forest Regressor (19 features) - (0.875317)

In [None]:
# 19 predictor columns (including all of the option columns A-G) plus the `cost` target.
# Column order is preserved because it defines the feature order seen by the model.
final_df = df[
    [
        'state_code', 'record_type', 'group_size', 'homeowner',
        'car_age', 'car_value', 'risk_factor', 'age_oldest', 'age_youngest',
        'married_couple', 'c_previous', 'duration_previous',
        'A', 'B', 'C', 'D', 'E', 'F', 'G',
        'cost',
    ]
]
# Preview the first rows of the selection.
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Only these columns are min-max rescaled; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Learn each column's min/max from the training rows only, so nothing about
# the test split influences the transformation.
scaler = MinMaxScaler()
scaler.fit(X_train[features_to_scale])

# Work on copies so X_train / X_test keep their original values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
# Fit a 120-tree random forest (seeded) on the scaled training split.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
best_model = RandomForestRegressor(n_estimators=120, random_state=42).fit(X_train_scaled, y_train)
# Leave the estimator as the final expression so the notebook still displays it.
best_model

In [None]:
# Training-split predictions and scores.
y_train_pred = best_model.predict(X_train_scaled)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

# Test-split predictions and scores.
y_test_pred = best_model.predict(X_test_scaled)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Report train and test values side by side for each metric.
print(
    f"Train MAE: {train_mae}",
    f"Test MAE: {test_mae}",
    f"\nTrain R-squared Score: {train_r2}",
    f"Test R-squared Score: {test_r2}",
    f"\nTrain MSE: {train_mse}",
    f"Test MSE: {test_mse}",
    sep="\n",
)

### Model Training 12 - XGBoost (16 features) X

In [None]:
# Explicitly select the 16 predictor columns for this experiment plus the
# `cost` target. With this assignment commented out, the cell silently reused
# whatever `final_df` the previously executed cell had left behind, so the
# model input depended on notebook execution order instead of on this section.
final_df = df[['state_code', 'homeowner', 'group_size', 'car_age', 'car_value', 'risk_factor', 'age_oldest', 'age_youngest', 'married_couple', 'c_previous', 'duration_previous',
              'A', 'B', 'E', 'F', 'G', 'cost']]
# Preview the first rows of the selection.
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Only these columns are min-max rescaled; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_oldest', 'age_youngest']

# Learn each column's min/max from the training rows only, so nothing about
# the test split influences the transformation.
scaler = MinMaxScaler()
scaler.fit(X_train[features_to_scale])

# Work on copies so X_train / X_test keep their original values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
from xgboost import XGBRegressor

# Train an XGBoost regressor with library-default hyperparameters on the scaled split.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
xgb = XGBRegressor().fit(X_train_scaled, y_train)
# Leave the estimator as the final expression so the notebook still displays it.
xgb

In [None]:
# Predict the held-out rows with the fitted XGBoost model.
y_pred_xgb = xgb.predict(X_test_scaled)

# Same three test metrics as the other experiments: MAE, R2 and MSE.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print(
    "XGBoost:",
    f"  Mean Absolute Error (MAE): {mae_xgb}",
    f"  R-squared Score (R2): {r2_xgb}",
    f"  Mean Squared Error: {mse_xgb}",
    sep="\n",
)

### Model Training 13 - LightGBM (16 features) X

In [None]:
# 16 predictor columns plus the `cost` target for this experiment.
# Column order is preserved because it defines the feature order seen by the model.
final_df = df[
    [
        'state_code', 'homeowner', 'group_size',
        'car_age', 'car_value', 'risk_factor',
        'age_oldest', 'age_youngest', 'married_couple',
        'c_previous', 'duration_previous',
        'A', 'B', 'E', 'F', 'G',
        'cost',
    ]
]
# Preview the first rows of the selection.
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Only these columns are standardized; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_oldest', 'age_youngest']

# Learn the scaling statistics from the training rows only, so nothing about
# the test split influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Work on copies so X_train / X_test keep their original values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
import lightgbm as lgb

# Train a LightGBM regressor with library-default hyperparameters on the scaled split.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
lgbm = lgb.LGBMRegressor().fit(X_train_scaled, y_train)
# Leave the estimator as the final expression so the notebook still displays it.
lgbm

In [None]:
# Predict the held-out rows with the fitted LightGBM model.
y_pred_lgbm = lgbm.predict(X_test_scaled)

# Same three test metrics as the other experiments: MAE, R2 and MSE.
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)

print(
    "LightGBM:",
    f"  Mean Absolute Error (MAE): {mae_lgbm}",
    f"  R-squared Score (R2): {r2_lgbm}",
    f"  Mean Squared Error: {mse_lgbm}",
    sep="\n",
)

### Model Training 14 - CatBoost (16 features) X

In [None]:
# 16 predictor columns plus the `cost` target for this experiment.
# Column order is preserved because it defines the feature order seen by the model.
final_df = df[
    [
        'state_code', 'homeowner', 'group_size',
        'car_age', 'car_value', 'risk_factor',
        'age_oldest', 'age_youngest', 'married_couple',
        'c_previous', 'duration_previous',
        'A', 'B', 'E', 'F', 'G',
        'cost',
    ]
]
# Preview the first rows of the selection.
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Only these columns are standardized; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_oldest', 'age_youngest']

# Learn the scaling statistics from the training rows only, so nothing about
# the test split influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Work on copies so X_train / X_test keep their original values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
from catboost import CatBoostRegressor

# Train a CatBoost regressor on the scaled split; verbose=0 silences the
# per-iteration training log.
# `fit` returns the estimator itself, so the chained call binds the fitted model.
catboost = CatBoostRegressor(verbose=0).fit(X_train_scaled, y_train)
# Leave the estimator as the final expression so the notebook still displays it.
catboost

In [None]:
# Predict the held-out rows with the fitted CatBoost model.
y_pred_catboost = catboost.predict(X_test_scaled)

# Same three test metrics as the other experiments: MAE, R2 and MSE.
mse_catboost = mean_squared_error(y_test, y_pred_catboost)
r2_catboost = r2_score(y_test, y_pred_catboost)
mae_catboost = mean_absolute_error(y_test, y_pred_catboost)

print(
    "CatBoost:",
    f"  Mean Absolute Error (MAE): {mae_catboost}",
    f"  R-squared Score (R2): {r2_catboost}",
    f"  Mean Squared Error: {mse_catboost}",
    sep="\n",
)

### Model Training 15 - DTR, LR, KNR, GBR (16 features) X


In [None]:
# 16 predictor columns plus the `cost` target for this experiment.
# Column order is preserved because it defines the feature order seen by the models.
final_df = df[
    [
        'state_code', 'homeowner', 'group_size',
        'car_age', 'car_value', 'risk_factor',
        'age_oldest', 'age_youngest', 'married_couple',
        'c_previous', 'duration_previous',
        'A', 'B', 'E', 'F', 'G',
        'cost',
    ]
]
# Preview the first rows of the selection.
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Only these columns are standardized; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_oldest', 'age_youngest']

# Learn the scaling statistics from the training rows only, so nothing about
# the test split influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Work on copies so X_train / X_test keep their original values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
# Baseline comparison of several regressors on the same scaled split.
# random_state=42 is pinned on the estimators that use randomness internally
# (decision tree, gradient boosting) so repeated runs report the same scores,
# in line with the seeded random forests used elsewhere in this notebook.
models = {
    'Decision Trees': DecisionTreeRegressor(random_state=42),
    'Linear Regression': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Fit each candidate and report MAE, R2 and MSE on the test split - the same
# three metrics printed by the other evaluation cells.
# NOTE: the loop rebinds the notebook-level names `model`, `y_pred`, `mae`,
# `r2` and `mse`; after the cell runs they refer to the last model in `models`.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Model: {name}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  R-squared Score (R2): {r2}")
    print(f"  Mean Squared Error: {mse}")
    print()

### Model Training 16 - Hyperparameter Tuning on Random Forest Regressor (15 features)

In [None]:
# 15 predictor columns plus the `cost` target for the tuning experiment.
# Column order is preserved because it defines the feature order seen by the model.
final_df = df[
    [
        'state_code', 'record_type', 'group_size', 'homeowner',
        'car_age', 'car_value', 'risk_factor', 'age_youngest',
        'married_couple', 'c_previous', 'duration_previous',
        'A', 'E', 'F', 'G',
        'cost',
    ]
]
# Preview the first rows of the selection.
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `cost` target from the predictor columns of `final_df`.
y = final_df['cost']
X = final_df.drop(columns='cost')

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Only these columns are min-max rescaled; every other column passes through unchanged.
features_to_scale = ['car_age', 'age_youngest']

# Learn each column's min/max from the training rows only, so nothing about
# the test split influences the transformation.
scaler = MinMaxScaler()
scaler.fit(X_train[features_to_scale])

# Work on copies so X_train / X_test keep their original values.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-fitted scaler to both splits.
X_train_scaled[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

In [None]:
# Sanity check: print the shape of each split, one per line.
print(
    X_train_scaled.shape,
    X_test_scaled.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

In [None]:
import time

# Exhaustive search space: 3 * 2 * 2 * 2 * 2 = 48 parameter combinations,
# each evaluated with 3-fold cross-validation (144 fits in total).
param_grid = {
    'n_estimators': [50, 100, 120],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [8, None],
    'min_samples_split': [5, 8],
    'min_samples_leaf': [1, 4]
}

rf = RandomForestRegressor(random_state=42)
# No `scoring` is passed, so candidates are ranked with the estimator's own
# `score` method. n_jobs=-1 runs the fits on all available cores; verbose=3
# logs progress per fit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=3)

# Time the whole search, including the final refit of the best candidate.
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
end_time = time.time()

elapsed_time = end_time - start_time
# Report whole minutes and seconds rounded to two decimals; formatting the raw
# floats printed values such as "3.0 minutes and 12.345678901 seconds".
print(f"Grid search completed in {int(elapsed_time // 60)} minutes and {elapsed_time % 60:.2f} seconds")

# Best hyperparameter combination and the estimator refitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Model:", best_model)

In [None]:
# Training-split predictions and scores for the tuned model.
y_train_pred = best_model.predict(X_train_scaled)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

# Test-split predictions and scores for the tuned model.
y_test_pred = best_model.predict(X_test_scaled)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Report train and test values side by side for each metric.
print(
    f"Train MAE: {train_mae}",
    f"Test MAE: {test_mae}",
    f"\nTrain R-squared Score: {train_r2}",
    f"Test R-squared Score: {test_r2}",
    f"\nTrain MSE: {train_mse}",
    f"Test MSE: {test_mse}",
    sep="\n",
)

In [None]:
# Load a previously saved object from disk. joblib.load unpickles the file,
# which can execute arbitrary code - only load artifacts from a trusted source.
model = joblib.load('model.joblib')

# Print the object's `__version__` attribute when it has one, otherwise a fallback message.
# NOTE(review): this checks for `__version__` on the loaded object itself; whether the
# saved object defines such an attribute cannot be determined from this cell - confirm,
# otherwise the fallback message is what will always be printed.
print(
    f"Version of the loaded object: {model.__version__}"
    if hasattr(model, '__version__')
    else "No version information found."
)