In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow_probability.substrates import numpy as tfp

In [None]:
!pip install cmdstanpy
!pip install git+https://github.com/OriolAbril/arviz.git@ci

In [2]:
# Make sure a CmdStan installation is available before any model is compiled.
# In the recorded run CmdStan 2.33.1 was already installed and the call
# returned True.
from cmdstanpy import install_cmdstan
install_cmdstan()

CmdStan install directory: C:\Users\39339\.cmdstan
CmdStan version 2.33.1 already installed
Test model compilation


True

In [2]:
from cmdstanpy import CmdStanModel, set_cmdstan_path
import arviz as az

In [29]:
# Covariate table; later cells read its 'Type', 'Station', 'Year' and 'Quota' columns.
X = pd.read_csv(filepath_or_buffer="../Datasets/covariates.csv")

In [30]:
# Integer codes for the categorical station type.
# NOTE(review): a single 0/1/2 column entering a linear predictor treats the
# categories as equally spaced (factory = 2 x urban); consider dummy coding if
# that ordering is not intended.
type_mapping = {'rural': 0, 'urban': 1, 'factory': 2}

# Series.map turns every label missing from the mapping into NaN without any
# warning. That also happens to *all* rows if this cell is run a second time
# (the column then already holds 0/1/2), so validate first and fail loudly.
unknown_types = set(X['Type'].dropna().unique()) - set(type_mapping)
if unknown_types:
    raise ValueError(
        f"Unexpected values in X['Type']: {sorted(unknown_types, key=str)}; "
        f"expected one of {list(type_mapping)}. "
        "If this cell was already executed, reload X before running it again."
    )

X['Type'] = X['Type'].map(type_mapping)

In [31]:
# Response table; the 'Count_120' column is the modelled count.
Y = pd.read_csv(filepath_or_buffer="../Datasets/Dataset_120.csv")

In [32]:
# Rows of X are removed using index labels taken from Y, which is only correct
# when both tables share the same index (assumed: one row per observation, in
# the same order — TODO confirm against how the CSVs are produced). Check it
# here so a mismatch fails immediately instead of silently dropping the wrong
# covariate rows.
if not X.index.equals(Y.index):
    raise ValueError(
        f"X and Y are not row-aligned: {len(X)} covariate rows vs {len(Y)} count rows"
    )

# Index labels of the observations without a recorded count
rows_to_drop_Y = Y.index[Y['Count_120'].isna()]

# Remove those observations from both tables and renumber the rows 0..N-1
X = X.drop(index=rows_to_drop_Y).reset_index(drop=True)
Y = Y.drop(index=rows_to_drop_Y).reset_index(drop=True)

# Extracted after the reset so their index matches X and Y
stations = X['Station']
years = X['Year']


In [7]:
# Poisson regression with a log link:
#   log(lambda_n) = xi[year_n] + eta[station_n] + X_n * beta
# with independent normal(0, 10) priors on beta, xi and eta.
# NOTE(review): xi and eta enter additively without a sum-to-zero constraint or
# a reference level, so a constant can be moved between them; they are separated
# only through their priors.
prior_elic = """

data {
  int<lower=1> N; // Number of observations
  int<lower=1> P; // Number of covariates (columns of X)
  int<lower=1> nyears; // Number of year levels
  int<lower=1> nstations; // Number of station levels

  // 1-based group indices. The upper bounds make Stan reject out-of-range
  // values while reading the data instead of failing later on xi[year] or
  // eta[station].
  array[N] int<lower=1, upper=nstations> station;
  array[N] int<lower=1, upper=nyears> year;
  // array[N] int<lower=4, upper=10> month;

  array[N] int<lower=0> y; // Count data
  matrix[N, P] X; // Predictor matrix
}

parameters {
  vector[P] beta; // Coefficients for predictors
  vector[nstations] eta; // Station effects
  vector[nyears] xi; // Year effects

  //real<lower=0> sigma0; // Standard deviation for beta
  //real<lower=0> sigma1; // Standard deviation for xi
  //real<lower=0> sigma2; // Standard deviation for eta
}

transformed parameters {
    // Kept as transformed parameters so they are still saved with every draw
    vector[N] lambda;
    vector[N] intercept;
    vector[N] fix_eff;

    intercept = xi[year] + eta[station];
    fix_eff = X * beta;

    lambda = exp(intercept + fix_eff);
}

model {

  beta ~ normal(0, 10);
  xi ~ normal(0, 10);
  eta ~ normal(0, 10);

  // Same likelihood as y[n] ~ poisson(lambda[n]) for every n, but vectorised
  // and evaluated on the log scale, which avoids overflow of exp() for large
  // linear predictors.
  y ~ poisson_log(intercept + fix_eff);

}

"""

stan_file = "./priors.stan"

# print(prior_elic, file=f) used to write the program followed by one newline;
# keep exactly that file content.
stan_source = prior_elic + "\n"

# Only touch the file when the program text actually changed. Rewriting it on
# every run makes the source newer than the compiled executable, which triggers
# a full recompilation each time the cell is executed.
try:
    with open(stan_file) as f:
        source_is_current = f.read() == stan_source
except FileNotFoundError:
    source_is_current = False

if not source_is_current:
    with open(stan_file, "w") as f:
        f.write(stan_source)

priors = CmdStanModel(stan_file=stan_file)

10:06:01 - cmdstanpy - INFO - compiling stan file C:\Users\leoma\OneDrive\DOCUME~1\PoliMi\BAYESI~1\BAYESI~1\main\Modello\priors.stan to exe file C:\Users\leoma\OneDrive\Documents\PoliMi\Bayesian statistics\Bayesian_Project\main\Modello\priors.exe
10:06:51 - cmdstanpy - INFO - compiled model executable: C:\Users\leoma\OneDrive\Documents\PoliMi\Bayesian statistics\Bayesian_Project\main\Modello\priors.exe


In [33]:
# Parse the counts as numbers, then store them with pandas' nullable integer dtype.
Y['Count_120'] = (
    Y['Count_120']
    .pipe(pd.to_numeric)
    .astype('Int64')
)

In [34]:
# Grouping columns, pulled out of X before they are removed from the design matrix.
stations, years = X['Station'], X['Year']

In [46]:
print(stations.unique())

# Replace each station code by a dense 1-based index, assigned in ascending
# order of the code (smallest code -> 1).
unique_values = sorted(set(stations))
mapping_dict = dict(zip(unique_values, range(1, len(unique_values) + 1)))
transformed_stations = list(map(mapping_dict.__getitem__, stations))
stations = transformed_stations

[ 5707  5710  5717  5718  5719  5721  5725  5730  5732  5735  5738  5739
  5742  5749  5750  5952  5960  6582  6665  6691  6804  6818  6832  6884
  6904  9856  9861  9882  9899  9925  9972  9991  9997 10025 10041 10081
 10171 10270 10282 10288 10437 10454 10463 10584 12020]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3

In [36]:
# Quick look at the calendar-year column (2010-2022 in the recorded output).
print(years)

0       2010
1       2010
2       2010
3       2010
4       2010
        ... 
3835    2022
3836    2022
3837    2022
3838    2022
3839    2022
Name: Year, Length: 3840, dtype: int64


In [37]:
# Turn calendar years into the 1-based index used for the Stan `year` array.
# Anchoring on the observed minimum instead of the literal 2009 gives the same
# 1..13 result for the 2010-2022 data, and makes the cell safe to re-run: an
# already 1-based series is left unchanged instead of being shifted again
# (which would push the indices below 1).
years = years - years.min() + 1
print(years)

0        1
1        1
2        1
3        1
4        1
        ..
3835    13
3836    13
3837    13
3838    13
3839    13
Name: Year, Length: 3840, dtype: int64


In [38]:
# The grouping columns are not predictors: keep them out of the design matrix.
X = X.drop(columns=['Year', 'Station'])

In [54]:
# Report how many Quota values are missing, then remove the column from the
# design matrix (presumably because of those missing values — TODO confirm).
# The count used to be computed as a bare expression in the middle of the cell,
# so it was never displayed; the membership guard lets the cell be re-run
# without a KeyError once the column is gone.
if 'Quota' in X.columns:
    print(f"Missing Quota values: {X['Quota'].isna().sum()}")
    X = X.drop(columns='Quota')

In [None]:
# All per-observation inputs must describe the same N rows.
if not (len(X) == len(Y) == len(stations) == len(years)):
    raise ValueError(
        "Inconsistent number of observations: "
        f"X={len(X)}, Y={len(Y)}, station={len(stations)}, year={len(years)}"
    )

# Derive the number of group levels from the index arrays instead of
# hard-coding 13 and 45, so the Stan data stay consistent if the dataset
# changes. Both arrays are 1-based, so the largest index is the number of
# slots Stan needs for xi / eta.
n_years = int(np.max(years))
n_stations = int(np.max(stations))

data = {
    "N": len(Y),
    "P": X.shape[1],
    "nyears": n_years,
    "nstations": n_stations,
    "station": stations,
    "year": years,
    # Plain int64 array: Stan needs integers, and this raises immediately if a
    # missing count is still present.
    "y": Y['Count_120'].to_numpy(dtype='int64'),
    "X": X
}

# NOTE(review): the model block contains the likelihood for y, so these are
# posterior draws even though the objects are called `priors` / `prior_az`.
# A fixed seed makes the draws reproducible across runs.
fit = priors.sample(data=data, chains=4, parallel_chains=4,
    iter_warmup=1000, iter_sampling=1000, seed=42)
prior_az = az.from_cmdstanpy(fit)

10:26:00 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status