# Importing the packages and data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
import sys
sys.path.insert(1, '../sar_dirichlet')
import dirichlet_regression
import dirichlet_regression_opti

In [4]:
from func_test import cos_similarity, create_features_matrices

In [5]:
from statsmodels.stats.correlation_tools import cov_nearest

# With two features

In [6]:
n_features = 2
n_classes = 3

In [7]:
np.random.seed(21)

beta = np.array([[0.  , 0. , .1],
                 [0.  , 1., -2.],
                 [0.  , -1., -2. ]])

gamma_var = np.array([2.,3.])

In [8]:
n_repeat = 100
list_n_samples = [50,200,1000]

In [9]:
cov_matrix = np.array([[1., 0.2], [0.2, 1.]])

# Estimation of the parameters

In [10]:
def diagonalize(M):
    eigenvalues, P = np.linalg.eig(M)
    D = np.diag(eigenvalues)
    return(P, D)

In [262]:
n_samples=1000
I = np.identity(n_samples)

If $W$ is diagonalizable, then $W = PDP^{-1}$ with $D$ a diagonal matrix.

\begin{align*}
    (I - \rho W)^{-1} &= (I - \rho PDP^{-1})^{-1}\\
    &= P P^{-1} (I - \rho PDP^{-1})^{-1} P P^{-1}\\
    &= P \left( P^{-1} (I - \rho PDP^{-1}) P \right)^{-1} P^{-1}\\
    &= P (I - \rho D)^{-1} P^{-1}
\end{align*}

We just need to compute the inverse of $(I - \rho D)$, which is straightforward because it is a diagonal matrix.

In [263]:
rho=0.9

X,Z,W = create_features_matrices(n_samples,n_features,choice_W='random_distance',nneighbors=10,cov_mat=cov_matrix)
#X,Z,W = create_features_matrices(n_samples,n_features,choice_W='X_dependant',nneighbors=10,cov_mat=cov_matrix)
Z[:,0] = 1
M = np.identity(n_samples) - rho*W

mu = dirichlet_regression.compute_mu_spatial(X, beta, M)
#phi = np.exp(np.matmul(Z,gamma_var))
phi = 15*np.ones(n_samples)
alpha = mu*phi[:,None]

Y = np.array([np.random.dirichlet(alpha_i) for alpha_i in alpha])
Y = (Y*(n_samples-1)+1/n_classes)/n_samples

In [149]:
%%time
P, D = diagonalize(W)

Wall time: 17.5 ms


In [150]:
np.count_nonzero(D - np.diag(np.diagonal(D)))

0

In [250]:
X,Z,W = create_features_matrices(100,n_features,choice_W='random_distance',nneighbors=10,cov_mat=cov_matrix)
P, D = diagonalize(W)

In [251]:
%%time
for _ in range(10):
    _ = np.linalg.solve(W,X)

Wall time: 7.58 ms


In [252]:
%%time
for _ in range(10):
    _ = scipy.sparse.linalg.spsolve(scipy.sparse.csc_matrix(W),X)

Wall time: 5.95 ms


In [247]:
scipy.sparse.linalg.spsolve(W,X)

array([[ 1.00000000e+00,  1.69085362e-01,  1.83208429e+01],
       [ 1.00000000e+00,  4.16044940e+02,  1.62099400e+02],
       [ 1.00000000e+00,  2.34437882e+01, -2.08899893e+02],
       ...,
       [ 1.00000000e+00,  4.37445091e+02,  2.70346265e+02],
       [ 1.00000000e+00,  2.96413876e+00, -4.52302075e+01],
       [ 1.00000000e+00, -1.67055035e+02, -6.45088825e+01]])

In [228]:
%%time
for _ in range(100):
    _ = np.linalg.solve(D,X)

Wall time: 3.31 s


In [229]:
real_D = np.real(D)

In [230]:
%%time
for _ in range(100):
    _ = np.linalg.solve(real_D,X)

Wall time: 1.39 s


In [157]:
np.unique(list_diag)

array([0])

In [74]:
%%time
P_inv = np.linalg.inv(P)

Wall time: 6.74 ms


In [44]:
np.linalg.inv(I - rho*W)

array([[2.59849232, 0.11146171, 0.01440941, ..., 0.02620055, 0.33420755,
        0.09144548],
       [0.31872717, 1.34901474, 0.03606924, ..., 0.02446734, 0.25647855,
        0.40477143],
       [0.08237921, 0.0925271 , 1.16257419, ..., 0.07508788, 0.22105144,
        0.27340195],
       ...,
       [0.02390978, 0.0103739 , 0.01099595, ..., 3.12372319, 0.07750869,
        0.01965522],
       [0.20136214, 0.08485114, 0.03056866, ..., 0.05898922, 1.54950461,
        0.11880582],
       [0.17490744, 0.29893785, 0.07426248, ..., 0.03225786, 0.24858217,
        1.40496413]])

In [45]:
np.real(np.matmul(P, np.matmul(np.linalg.inv(I-rho*D), P_inv)))

array([[2.59849232, 0.11146171, 0.01440941, ..., 0.02620055, 0.33420755,
        0.09144548],
       [0.31872717, 1.34901474, 0.03606924, ..., 0.02446734, 0.25647855,
        0.40477143],
       [0.08237921, 0.0925271 , 1.16257419, ..., 0.07508788, 0.22105144,
        0.27340195],
       ...,
       [0.02390978, 0.0103739 , 0.01099595, ..., 3.12372319, 0.07750869,
        0.01965522],
       [0.20136214, 0.08485114, 0.03056866, ..., 0.05898922, 1.54950461,
        0.11880582],
       [0.17490744, 0.29893785, 0.07426248, ..., 0.03225786, 0.24858217,
        1.40496413]])

In [23]:
%%time
D_inv = np.zeros(I.shape)
np.fill_diagonal(D_inv, 1/np.real(I-rho*D).diagonal())

Wall time: 0 ns


In [25]:
np.real(np.matmul(P, np.matmul(D_inv, P_inv)))

array([[ 1.39074331e+00,  4.19670823e-03,  1.21706347e-03, ...,
         1.15409611e-03,  1.53409768e-04,  5.99685601e-04],
       [ 2.46488888e-02,  1.37202962e+00,  1.30032154e-03, ...,
         2.61596271e-03,  1.84238182e-04,  9.40255350e-03],
       [-1.18978253e-03,  7.86939625e-03,  1.17415109e+00, ...,
         4.19075976e-01,  4.92451210e-02,  3.96931298e-02],
       ...,
       [-1.32839153e-03,  3.07444496e-03,  2.35786110e-01, ...,
         1.39529542e+00,  8.22684646e-02,  4.21870380e-02],
       [ 2.80791209e-04,  4.55745818e-05,  1.18203729e-02, ...,
         3.59092397e-02,  1.78380281e+00,  2.54673308e-02],
       [ 5.25149647e-05,  1.55550874e-03,  4.93138944e-03, ...,
         1.20492303e-02,  9.94326421e-03,  3.99793702e+00]])

In [24]:
%%time
for _ in range(100):
    _ = np.linalg.solve(D,P)

Wall time: 464 ms


In [31]:
%%time
D_inv = np.zeros(D.shape)
np.fill_diagonal(D_inv, 1/D.diagonal())

Wall time: 7.01 ms


  a.flat[:end:step] = val


In [31]:
%%time
for _ in range(100):
    MX1 = np.linalg.solve(M,X)

Wall time: 486 ms


In [45]:
P_inv_X = np.matmul(P_inv, X)

In [52]:
%%time
for _ in range(100):
    D_inv = np.zeros(I.shape)
    np.fill_diagonal(D_inv, 1/np.real(I-rho*D).diagonal())
    #MX2 = np.matmul( np.real(np.matmul(P, np.matmul(D_inv, P_inv))), X)
    MX2 = np.real(np.matmul(np.matmul(P, D_inv), P_inv_X))

Wall time: 126 ms


In [77]:
np.sum(Y,axis=0)

array([38.38067552, 76.35362904, 85.26569544])

### Tests with a non-symmetric matrix

In [136]:
W = pd.read_csv('Data Dirichlet/maupiti_W.csv', header=None)

In [137]:
%%time
P, D = diagonalize(W)

Wall time: 14.2 s


In [138]:
np.count_nonzero(D - np.diag(np.diagonal(D)))

0

In [117]:
np.matmul(P,np.matmul(D,np.linalg.inv(P)))

array([[-2.08166817e-16-2.18358318e-16j, -1.97758476e-16+1.34441069e-16j,
         1.00000000e-01-2.84060969e-17j, ...,
        -2.01525629e-08-2.04256825e-08j, -3.38271078e-16-3.24393290e-16j,
        -7.25114413e-16-1.83880688e-16j],
       [-1.06338549e-15-2.78368908e-16j,  1.38777878e-17+3.93294338e-16j,
         1.00000000e-01+1.24683250e-18j, ...,
        -1.30168076e-08+6.99113073e-09j,  1.94289029e-16-2.55004351e-16j,
        -5.26922256e-16-5.46492105e-16j],
       [-3.10515502e-16+6.25178078e-17j,  1.00000000e-01+1.40126355e-16j,
         6.38378239e-16-9.26450756e-17j, ...,
        -3.62242622e-09+1.28210816e-08j, -7.77156117e-16-2.93195372e-16j,
        -5.57713598e-16+1.44883292e-16j],
       ...,
       [-1.70002901e-16+2.03830008e-16j, -1.52113565e-16+6.41847686e-17j,
         4.47558657e-16-1.14708590e-16j, ...,
         2.78208640e-08-7.96901886e-09j,  8.15320034e-16+6.86299975e-16j,
         1.00000000e-01+4.92661467e-16j],
       [-1.77375475e-16+2.86446214e-16j, -3.

# Tests time for n=50

In [255]:
%%time
for _ in range(10):
    reg_spatial_opti = dirichlet_regression_opti.dirichletRegressor(spatial=True, maxfun=5000)
    reg_spatial_opti.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=0)

Wall time: 1.66 s


In [254]:
%%time
for _ in range(10):
    reg_spatial = dirichlet_regression.dirichletRegressor(spatial=True, maxfun=5000)
    reg_spatial.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=0)

Wall time: 3.62 s


# Tests time for n=200

In [259]:
%%time
for _ in range(10):
    reg_spatial_opti = dirichlet_regression_opti.dirichletRegressor(spatial=True, maxfun=5000)
    reg_spatial_opti.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=0)

Wall time: 9.78 s


In [260]:
%%time
#without sparse
for _ in range(10):
    reg_spatial = dirichlet_regression.dirichletRegressor(spatial=True, maxfun=5000)
    reg_spatial.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=0)

Wall time: 14.2 s


In [261]:
%%time
#with sparse
for _ in range(10):
    reg_spatial = dirichlet_regression.dirichletRegressor(spatial=True, maxfun=5000)
    reg_spatial.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=0)

Wall time: 9.77 s


# Tests time for n=1000

In [264]:
%%time
reg_spatial_opti = dirichlet_regression_opti.dirichletRegressor(spatial=True, maxfun=5000)
reg_spatial_opti.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=1)

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Wall time: 1min 22s


In [266]:
%%time
#without sparse
reg_spatial = dirichlet_regression.dirichletRegressor(spatial=True, maxfun=5000)
reg_spatial.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=1)


CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Wall time: 21 s


In [265]:
%%time
#with sparse
reg_spatial = dirichlet_regression.dirichletRegressor(spatial=True, maxfun=5000)
reg_spatial.fit(X, Y, parametrization='alternative', Z=Z, W=W, fit_intercept=False, verbose=1)


CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Wall time: 14.4 s


# Elections

In [13]:
from sklearn.preprocessing import StandardScaler

In [6]:
Y_occitanie = pd.read_csv('Data Dirichlet/occitanie/Y_occitanie.csv', sep=';')

In [7]:
X_occitanie = pd.read_csv('Data Dirichlet/occitanie/X_occitanie.csv', sep=';')

In [8]:
X_occitanie = X_occitanie.iloc[:,1:]
X_occitanie = X_occitanie.iloc[:,:-1]

In [9]:
coordinates = pd.read_csv('Data Dirichlet/occitanie/coordinates_cendroids.csv', sep=';')
distance_matrix = scipy.spatial.distance_matrix(coordinates,coordinates)

In [10]:
X = np.array(X_occitanie)
Y = np.array(Y_occitanie)

In [11]:
Z = np.ones((207,1))
gamma_0 = [0.]

In [14]:
X = StandardScaler().fit(X).transform(X)

In [15]:
n,K = X.shape
J = Y.shape[1]

In [16]:
W = np.copy(distance_matrix)
W[W > 35000] = 0

In [17]:
# inverse distance
W[W>0] = 1/W[W>0]

# row-normalize
W = W/W.sum(axis=1)[:,None]

In [21]:
X.shape

(207, 25)

In [None]:
%%time
dirichRegressor_s1 = dirichlet_regression.dirichletRegressor(spatial=True, maxiter=5000, maxfun=500000)
dirichRegressor_s1.fit(X, Y, W=W, parametrization='alternative', gamma_0=gamma_0, Z=Z)

In [25]:
%%time
dirichRegressor_s1 = dirichlet_regression_opti.dirichletRegressor(spatial=True, maxiter=5000, maxfun=500000)
dirichRegressor_s1.fit(X, Y, W=W, parametrization='alternative', gamma_0=gamma_0, Z=Z)

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Wall time: 1h 3min 30s
