In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_openml

In [2]:
# Fetch OpenML dataset 41214 and keep the DataFrame exposed by the result's `.frame` attribute.
df = fetch_openml(
    data_id=41214,
    as_frame=True,
).frame

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1.0,1.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
1,3.0,1.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82
2,5.0,1.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22
3,10.0,1.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72
4,11.0,1.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72


In [4]:
# Claim frequency: number of claims per unit of exposure, stored as a new column on df.
df["Frequency"] = df["ClaimNb"].div(df["Exposure"])

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    OneHotEncoder,
    StandardScaler,
)

# Two-step pipeline: take the natural log of the input, then standardize the result.
log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log, validate=False),
    StandardScaler(),
)

# Column-wise preprocessing; each entry is (name, transformer, columns).
# Columns not listed in any entry are discarded (remainder="drop").
linear_model_preprocessor = ColumnTransformer(
    transformers=[
        # Forwarded unchanged.
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        # Binned by KBinsDiscretizer with n_bins=10.
        ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
        # Log-transformed, then standardized.
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
        # One-hot encoded; note VehPower holds numbers in the raw frame but is
        # treated as a categorical feature here.
        (
            "onehot_categorical",
            OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
        ),
    ],
    remainder="drop",
)

In [6]:
# Preview the first five rows again; the frame now carries the Frequency column.
df.head(n=5)

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,Frequency
0,1.0,1.0,0.1,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82,10.0
1,3.0,1.0,0.77,D,5.0,0.0,55.0,50.0,B12,Regular,1217.0,R82,1.298701
2,5.0,1.0,0.75,B,6.0,2.0,52.0,50.0,B12,Diesel,54.0,R22,1.333333
3,10.0,1.0,0.09,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72,11.111111
4,11.0,1.0,0.84,B,7.0,0.0,46.0,50.0,B12,Diesel,76.0,R72,1.190476


In [7]:
import statsmodels.formula.api as smf
from statsmodels.discrete.discrete_model import Poisson as psn


In [8]:
# Read the mtcars CSV from a pinned gist revision.
# NOTE: this rebinds `df`, so the insurance frame loaded earlier is no longer reachable by that name.
df = pd.read_csv(
    filepath_or_buffer="https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
)


In [10]:
# Preview the first five rows of the mtcars frame.
df.head(n=5)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [11]:
#[DIY] Fit a linear regression (named model1) with "gear" as the response and "mpg" as the predictor (i.e. regress "gear" on "mpg")
model1 = smf.ols(formula="gear ~ mpg", data=df).fit()
#[DIY] Display the summary table of model1
model1.summary()

0,1,2,3
Dep. Variable:,gear,R-squared:,0.231
Model:,OLS,Adj. R-squared:,0.205
Method:,Least Squares,F-statistic:,8.995
Date:,"Sat, 14 Aug 2021",Prob (F-statistic):,0.0054
Time:,14:23:05,Log-Likelihood:,-30.972
No. Observations:,32,AIC:,65.94
Df Residuals:,30,BIC:,68.87
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.5063,0.411,6.103,0.000,1.668,3.345
mpg,0.0588,0.020,2.999,0.005,0.019,0.099

0,1,2,3
Omnibus:,8.516,Durbin-Watson:,0.524
Prob(Omnibus):,0.014,Jarque-Bera (JB):,7.386
Skew:,1.147,Prob(JB):,0.0249
Kurtosis:,3.523,Cond. No.,74.1


In [12]:
#Now fit a Poisson regression (named model2) with "gear" as the response and "mpg" as the predictor (i.e. regress "gear" on "mpg" using a Poisson model)
model2 = psn.from_formula(formula="gear ~ mpg", data=df).fit()

#[DIY] Display the summary table of model2
model2.summary()

Optimization terminated successfully.
         Current function value: 1.639026
         Iterations 4


0,1,2,3
Dep. Variable:,gear,No. Observations:,32.0
Model:,Poisson,Df Residuals:,30.0
Method:,MLE,Df Model:,1.0
Date:,"Sat, 14 Aug 2021",Pseudo R-squ.:,0.009774
Time:,14:23:25,Log-Likelihood:,-52.449
converged:,True,LL-Null:,-52.967
Covariance Type:,nonrobust,LLR p-value:,0.3089

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.9894,0.325,3.045,0.002,0.352,1.626
mpg,0.0155,0.015,1.027,0.305,-0.014,0.045
