In [1]:
import polars as pl
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import gc

In [35]:
# Read the training set into a polars DataFrame, then show per-column
# summary statistics as the cell output.
train_df = pl.read_csv(source="data/train.csv")
train_df.describe()

statistic,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
str,f64,str,f64,f64,f64,f64,str,str,f64,f64,f64,f64
"""count""",11504798.0,"""11504798""",11504798.0,11504798.0,11504798.0,11504798.0,"""11504798""","""11504798""",11504798.0,11504798.0,11504798.0,11504798.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,0.0,"""0""","""0""",0.0,0.0,0.0,0.0
"""mean""",5752398.5,,38.383563,0.998022,26.41869,0.462997,,,30461.370411,112.425442,163.897744,0.122997
"""std""",3321100.0,,14.993459,0.044431,12.99159,0.498629,,,16454.745205,54.035708,79.979531,0.328434
"""min""",0.0,"""Female""",20.0,0.0,0.0,0.0,"""1-2 Year""","""No""",2630.0,1.0,10.0,0.0
"""25%""",2876199.0,,24.0,1.0,15.0,0.0,,,25277.0,29.0,99.0,0.0
"""50%""",5752399.0,,36.0,1.0,28.0,0.0,,,31824.0,151.0,166.0,0.0
"""75%""",8628598.0,,49.0,1.0,35.0,1.0,,,39451.0,152.0,232.0,0.0
"""max""",11504797.0,"""Male""",85.0,1.0,52.0,1.0,"""> 2 Years""","""Yes""",540165.0,163.0,299.0,1.0


In [3]:
# Columns that plot() draws as exact value counts instead of a binned histogram.
categorical_cols = [
    "Gender",                # "Male" / "Female"
    "Driving_License",       # 0 / 1 flag
    "Region_Code",           # numeric code
    "Previously_Insured",    # 0 / 1 flag
    "Vehicle_Age",           # "< 1 Year" / "1-2 Year" / "> 2 Years"
    "Vehicle_Damage",        # "Yes" / "No"
    "Policy_Sales_Channel",  # numeric code
    "Response",              # 0 / 1 flag
]

In [4]:
def plot(df):
    """Show a grid of bar charts, one panel per column of ``df`` except ``id``.

    Columns named in the notebook-level ``categorical_cols`` list, plus ``Age``
    and ``Vintage``, are drawn as exact value counts. Every other column is
    binned with ``Series.hist(bin_count=100)`` and drawn as a histogram.

    The grid is three panels wide and at least four rows tall. Rows (and the
    figure height) grow when ``df`` has more than twelve plottable columns, so
    extra columns do not overflow the grid.

    Args:
        df: polars DataFrame whose columns are summarised.

    Returns:
        None. The figure is rendered with ``fig.show()``; the index and name of
        each panel are printed while the figure is built.
    """
    sp_cols = 3
    min_rows = 4
    row_height = 250  # 4 rows * 250 px keeps the original 1000 px figure height

    columns = [name for name in df.columns if name != "id"]
    # Ceiling division: enough rows for every panel, never fewer than four.
    sp_rows = max(min_rows, -(-len(columns) // sp_cols))

    colors = px.colors.qualitative.Light24

    fig = make_subplots(rows=sp_rows, cols=sp_cols, subplot_titles=columns)

    for sp_ind, column in enumerate(columns):
        print(sp_ind, column)
        # Cycle through the palette so any number of panels gets a colour.
        marker = dict(color=colors[sp_ind % len(colors)])

        if column in categorical_cols or column in ("Age", "Vintage"):
            # Discrete column: one bar per distinct value.
            agg = df[column].value_counts()
            trace = go.Bar(x=agg[column], y=agg["count"], name=column, marker=marker)
        else:
            # Continuous column: 100-bin histogram.
            agg = df[column].hist(bin_count=100)
            bins = agg["breakpoint"].to_list()

            # Average half-width of the bins. The last breakpoint is excluded
            # from the average and its bar position is rebuilt below.
            # NOTE(review): presumably because polars can report the final
            # breakpoint as inf -- confirm against the installed version.
            widths = [(j - i) / 2 for i, j in zip(bins, bins[1:-1])]
            width = sum(widths) / len(widths) if widths else 0.0

            # Shift each breakpoint back by half a bin to centre the bar.
            positions = [b - width for b in bins]
            if len(positions) >= 2:
                positions[-1] = positions[-2] + (2 * width)
            counts = agg["count"].to_list()
            trace = go.Bar(
                x=positions,
                y=counts,
                name=column,
                hovertext=agg["category"],
                marker=marker,
            )

        # Panels fill the grid row by row; plotly rows/cols are 1-based.
        row, col = divmod(sp_ind, sp_cols)
        # add_trace replaces the deprecated append_trace.
        fig.add_trace(trace, row=row + 1, col=col + 1)

    fig.update_xaxes(categoryorder="category ascending")
    fig.update_traces(marker_line_width=0)
    fig.update_layout(
        height=row_height * sp_rows,
        width=1000,
        title_text="Summary of columns",
        template="plotly_dark",
    )
    fig.show()
    gc.collect()

# Draw the summary grid for the training frame exactly as loaded (no filtering).
plot(train_df)

0 Gender
1 Age
2 Driving_License
3 Region_Code
4 Previously_Insured
5 Vehicle_Age
6 Vehicle_Damage
7 Annual_Premium
8 Policy_Sales_Channel
9 Vintage
10 Response


In [5]:
# Drop Annual_Premium outliers with Tukey's fences: keep rows strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR), then redraw the summary grid.
q1 = train_df["Annual_Premium"].quantile(0.25)
q3 = train_df["Annual_Premium"].quantile(0.75)
iqr = q3 - q1

lower_fence = q1 - 1.5 * iqr
# The upper fence is measured from Q3; measuring it from Q1 would discard
# far more of the upper tail than intended.
upper_fence = q3 + 1.5 * iqr

filtered_df = train_df.filter(
    pl.col("Annual_Premium") > lower_fence,
    pl.col("Annual_Premium") < upper_fence,
)

plot(filtered_df)

0 Gender
1 Age
2 Driving_License
3 Region_Code
4 Previously_Insured
5 Vehicle_Age
6 Vehicle_Damage
7 Annual_Premium
8 Policy_Sales_Channel
9 Vintage
10 Response


In [36]:
# Display the training frame to inspect the column dtypes and sample rows.
train_df

id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
i64,str,i64,i64,f64,i64,str,str,f64,f64,i64,i64
0,"""Male""",21,1,35.0,0,"""1-2 Year""","""Yes""",65101.0,124.0,187,0
1,"""Male""",43,1,28.0,0,"""> 2 Years""","""Yes""",58911.0,26.0,288,1
2,"""Female""",25,1,14.0,1,"""< 1 Year""","""No""",38043.0,152.0,254,0
3,"""Female""",35,1,1.0,0,"""1-2 Year""","""Yes""",2630.0,156.0,76,0
4,"""Female""",36,1,15.0,1,"""1-2 Year""","""No""",31951.0,152.0,294,0
…,…,…,…,…,…,…,…,…,…,…,…
11504793,"""Male""",48,1,6.0,0,"""1-2 Year""","""Yes""",27412.0,26.0,218,0
11504794,"""Female""",26,1,36.0,0,"""< 1 Year""","""Yes""",29509.0,152.0,115,1
11504795,"""Female""",29,1,32.0,1,"""< 1 Year""","""No""",2630.0,152.0,189,0
11504796,"""Female""",51,1,28.0,0,"""1-2 Year""","""Yes""",48443.0,26.0,274,1


In [57]:
# Encode the training frame as purely numeric columns.
# All column rewrites go through a single with_columns call instead of
# re-creating the frame once per column in Python loops.
normed_df = (
    train_df
    .with_columns(
        # Binarize Gender: "Male" -> 0, "Female" -> 1, anything else -> null
        pl.when(pl.col("Gender") == "Male")
        .then(0)
        .when(pl.col("Gender") == "Female")
        .then(1)
        .otherwise(None)
        .cast(pl.UInt8)
        .alias("Gender"),
        # Binarize Vehicle_Damage: "Yes" -> 1, "No" -> 0, anything else -> null
        pl.when(pl.col("Vehicle_Damage") == "Yes")
        .then(1)
        .when(pl.col("Vehicle_Damage") == "No")
        .then(0)
        .otherwise(None)
        .cast(pl.UInt8)
        .alias("Vehicle_Damage"),
        # Binary integer columns only need uint8
        pl.col(["Driving_License", "Previously_Insured", "Response"]).cast(pl.UInt8),
        # Region_Code and Policy_Sales_Channel are read as floats; convert them
        # to i64 (technically they are categorical codes)
        pl.col(["Region_Code", "Policy_Sales_Channel"]).cast(pl.Int64),
    )
    # Dummy Vehicle_Age: one indicator column per distinct value
    .to_dummies(columns="Vehicle_Age")
)

normed_df

id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
i64,u8,i64,u8,i64,u8,u8,u8,u8,u8,f64,i64,i64,u8
0,0,21,1,35,0,1,0,0,1,65101.0,124,187,0
1,0,43,1,28,0,0,0,1,1,58911.0,26,288,1
2,1,25,1,14,1,0,1,0,0,38043.0,152,254,0
3,1,35,1,1,0,1,0,0,1,2630.0,156,76,0
4,1,36,1,15,1,1,0,0,0,31951.0,152,294,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
11504793,0,48,1,6,0,1,0,0,1,27412.0,26,218,0
11504794,1,26,1,36,0,0,1,0,1,29509.0,152,115,1
11504795,1,29,1,32,1,0,1,0,0,2630.0,152,189,0
11504796,1,51,1,28,0,1,0,0,1,48443.0,26,274,1
