In [2]:


## Import Python libraries


import pandas as pd



import numpy as np



import matplotlib.pyplot as plt



import seaborn as sns



from sklearn.compose import ColumnTransformer



from sklearn.pipeline import make_pipeline



from sklearn.preprocessing import(
    
    
    
    FunctionTransformer,
    
    

    
    StandardScaler
    
    
)




from sklearn.model_selection import train_test_split




from sklearn.pipeline import Pipeline







This notebook trains a Poisson generalized linear model (GLM) with regularization.

We use this model as a baseline and for feature selection; the selected features are then used in the next modelling stage to improve lung cancer prevalence prediction.

Before training the model, we describe the model setup.





## Poisson GLM with regularization




We model count data $Y_{i}$ under the assumption:

\begin{align}
\mathbb{E}[Y_{i}] = A_{i} \lambda_{i},
\end{align}
where

\begin{align}
\lambda_i = \exp(\mathbf{x}_{i}^\top \boldsymbol{\beta})
\end{align}


Here, $A_{i}$ denotes the size of the area, while $\lambda_{i}$ is the intensity of events occurring within that area.

In our case, $A_{i} = N_{i}$ represents the total population size, and $\lambda_i$ denotes the incidence rate of lung cancer per person in that population.



The Poisson probability mass function is:


\begin{align}
P(Y_{i} = y_{i}) = \frac{(N_{i} \lambda_{i})^{y_{i}} e^{-N_{i} \lambda_{i}}}{y_{i}!}
= \frac{\left(N_{i} e^{\mathbf{x}_{i}^\top \boldsymbol{\beta}} \right)^{y_{i}} e^{-N_{i} e^{\mathbf{x}_{i}^\top \boldsymbol{\beta}}}}{y_{i}!}
\end{align}



Taking the logarithm and summing over $n$ independent observations gives the log-likelihood:


\begin{align}
\ell(\boldsymbol{\beta}) = \sum_{i=1}^n \left[
y_{i} \log N_{i} + y_{i} \mathbf{x}_{i}^\top \boldsymbol{\beta}
- N_{i} e^{\mathbf{x}_{i}^\top \boldsymbol{\beta}} - \log(y_{i}!)
\right]
\end{align}



## Regularization


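Let $\alpha \ge 0$ be the overall regularization strength and $\gamma \in [0, 1]$ the mixing weight between the $L_{1}$ and $L_{2}$ penalties.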



$L_{2}$ penalty (Ridge regression)


\begin{align}
\text{Penalty} = \frac{\alpha}{2} \|\boldsymbol{\beta}\|_2^2
\end{align}
  
$L_{1}$ penalty (Lasso regression)

\begin{align}
\text{Penalty} = \alpha \|\boldsymbol{\beta}\|_1
\end{align}
  
$L_{1}$ plus $L_{2}$ penalty (Elastic Net Penalty)


\begin{align}
\text{Penalty} = \alpha \left[ \gamma \|\boldsymbol{\beta}\|_1 + \frac{1 - \gamma}{2} \|\boldsymbol{\beta}\|_2^2 \right]
\end{align}


## Objective Function

The regularized loss function to minimize is:

\begin{align}
\mathcal{L}(\boldsymbol\beta) = - \ell(\boldsymbol\beta) + \text{Penalty}
\end{align}

With the Elastic Net penalty, the objective becomes:

\begin{align}
\mathcal{L}(\boldsymbol{\beta}) = - \sum_{i=1}^n \left[y_{i} \log N_{i} + y_{i} \mathbf{x}_{i}^\top \boldsymbol{\beta} - N_{i} e^{\mathbf{x}_{i}^\top \boldsymbol{\beta}} - \log(y_{i}!) \right] + \alpha \left[ \gamma \|\boldsymbol{\beta}\|_1 + \frac{1 - \gamma}{2} \|\boldsymbol{\beta}\|_2^2 \right]
\end{align}




    

In [None]:


# Split x, y and N in a single call so the three outputs stay row-aligned.
# test_size=0.3 holds out 30% of the rows; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): x, y and N are presumably the features, event counts and
# population sizes defined earlier in the notebook -- confirm.
(
    x_train,
    x_test,
    y_train,
    y_test,
    N_train,
    N_test,
) = train_test_split(
    x,
    y,
    N,
    test_size=0.3,
    random_state=420,
)





In [None]:



# Print a header for each split followed by the shapes of its y, x and N
# arrays (in that order) as a quick sanity check on the split.
for split_label, split_arrays in (
    ("Train data", (y_train, x_train, N_train)),
    ("Test data", (y_test, x_test, N_test)),
):
    print(split_label)
    for split_array in split_arrays:
        print(split_array.shape)

