In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import scipy
import seaborn
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from scipy.stats import gaussian_kde
import sklearn as skl
from sklearn.preprocessing import StandardScaler

## Question 1 : generation des vecteurs aléatoires

In [2]:
n = 100
d = 2
x = np.random.rand(d,n)
residus = np.array([np.random.normal(0,1,size=n)]).T
theta_0 = np.ones((n,1))
theta_1 = np.ones((d,1))  
y = theta_0 + np.dot(x.transpose(),theta_1) + residus
pd.DataFrame(residus).plot.density() # on remarque que la distributiond es residus est bien gaussienne

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a143f4f98>

## Question 2 : affichage de la fonction $\rho(\alpha)$ pour trois valeurs de alpha

In [3]:
def ro(x, alpha):
    if abs(x)>alpha:
        return alpha*abs(x)-(alpha**2)/2
    else : return (x**2)/2
# Remarquons que la fonction est continue en alpha pour alpha >0

In [4]:
abscisses = np.linspace(-5,5)
ordonnees_alpha_05 = list(map(lambda x : ro(x,0.5), abscisses))
ordonnees_alpha_2 = list(map(lambda x : ro(x,2), abscisses))
ordonnees_alpha_5 = list(map(lambda x : ro(x,5), abscisses))

In [5]:
fig = plt.figure(figsize=(8, 6))
plt.plot(abscisses,ordonnees_alpha_05, label='alpha = 0.5')
plt.plot(abscisses,ordonnees_alpha_2, label='alpha = 2')
plt.plot(abscisses,ordonnees_alpha_5, label='alpha = 5')
plt.title('Fonction Rho de alpha')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

La fonction alpha est continue et derivable en 0 et alpha, ainsi prendre le gradient sera possible afin de trouver le $\theta$ minimisant l'estimateur du bruit $\rho$

## Question 3

In [6]:
alpha = 2
#y_pour_thetachapeau = y 
# Nous ne passons pas ici y, alpha en variable locale afin de permettre l'optimisation avec la fonction optimize
def theta_chapeau(theta_concatenated, alpha, y):
    theta0 = np.ones((n,1))*theta_concatenated[0]
    theta1 = np.ones((d,1))*np.array([[theta_concatenated[1]],[theta_concatenated[2]]])
    residus = list(y - theta0 - np.dot(x.transpose(), theta1))
    return float(sum(list(map(lambda t : ro(t, alpha), residus))))

def deconcatenation(theta_concatenated):
    print("partie constante =", theta_concatenated[0], "   partie variable =", theta_concatenated[1:])

In [7]:
def minimize_theta_chapeau(alpha, y):
    return minimize(theta_chapeau,(0,0,0), method='nelder-mead', args= (alpha, y) ,options={'xtol': 1e-5}).x

deconcatenation(minimize_theta_chapeau(2,y))

partie constante = 1.016702409    partie variable = [ 1.04839922  0.76279265]


## Question 4

In [8]:
# On crée un nouvel échatillon bootstrap de residus (tirés alléatoirements dans les residus precedement generés)
bootstrap0 = np.array([np.random.choice(residus.ravel(),n, replace=True)])

In [9]:
# On crée un nouvel y à partir du nouvel echantillon de residus bootstrap
y1 = theta_0 + np.dot(x.transpose(), theta_1) + bootstrap0.transpose()

In [10]:
deconcatenation(minimize_theta_chapeau(2,y1))

partie constante = 0.544670600181    partie variable = [ 1.54700454  0.97609888]


## Question 5

In [11]:
B = 20
V = np.zeros((B, 3))
for i in range(0, B):
    boot = np.array([np.random.choice(residus.ravel(), n, replace=True)])
    ylocal = theta_0 + np.dot(x.transpose(), theta_1) + boot.transpose()
    V[i] = minimize_theta_chapeau(2,ylocal)

In [12]:
print(np.cov(V).shape)

(20, 20)


In [14]:
V_alpha = np.ones((B, 3)) * theta_chapeau_alpha
b_boot = (1 / B) * sum(V - V_alpha)
eqmBoot = (1 / B) * np.trace(np.dot(V - V_alpha, (V - V_alpha).T))
# On fait la trace car le produit de la matrice porte 
print("Bboot = ", b_boot, "   EQMboot = ", eqmBoot)

NameError: name 'theta_chapeau_alpha' is not defined

## Question 6

In [None]:
def eqmboot(alpha_local):
    alpha = alpha_local
    V = np.zeros((B, 3))
    for i in range(0, B):
        boot = np.array([np.random.choice(residus.ravel(), n, replace=True)])
        ylocal = theta_0 + np.dot(x.transpose(), theta_1) + boot.transpose()
        V[i] = minimize_theta_chapeau(alpha_local, ylocal)
    return (1 / B) * np.trace(np.dot(V - V_alpha, (V - V_alpha).T))

In [None]:
x_alpha = np.array(range(0,20))*0.5
y_alpha = list(map(eqmboot, x_alpha))

In [None]:
plt.figure()
plt.plot(x_alpha, y_alpha)
plt.title('Erreur quadratique en fontion de alpha \n avec residus gaussiens')
plt.show()
np.amin(y_alpha)
# On remarque que pour un alpha supérieur à 0.5 on atteint une certaine stabilité.

## Question 7

In [None]:
residus_cauchy = np.random.standard_cauchy(n)
pd.DataFrame(residus_cauchy).plot.density() 
# une distribution plus dense au niveau de la moyenne, avec plus de valeurs extrèmes.

In [None]:
def eqmboot_cauchy(alpha_local):
    alpha = alpha_local
    V = np.zeros((B, 3))
    for i in range(0, B):
        boot = np.array([np.random.choice(residus_cauchy.ravel(), n, replace=True)])
        ylocal = theta_0 + np.dot(x.transpose(), theta_1) + boot.transpose()
        V[i] = minimize_theta_chapeau(2,ylocal)
    return (1 / B) * np.trace(np.dot(V - V_alpha, (V - V_alpha).T))

In [None]:
x_alpha_cauchy = np.array(range(0,20))*0.5
y_alpha_cauchy = list(map(eqmboot, x_alpha))

In [None]:
plt.figure()
plt.plot(x_alpha_cauchy, y_alpha_cauchy)
plt.title('Erreur quadratique en fonction de alpha avec residus cauchiens')
plt.show()
print(np.amin(y_alpha_cauchy))
# Il semble que avec une repartition des residus chauchienne il faille un alpha plus gran pour optimiser la regression

## Question 8

In [None]:
from sklearn.datasets.mldata import fetch_mldata
dataset_name = 'diabetes'
data = fetch_mldata(dataset_name)
xd = data.data.astype(float)
yd = data.target.astype(float)

In [None]:
scaler = StandardScaler()
scaler.fit(xd)
xd = scaler.transform(xd)
yd = np.array(pd.DataFrame(yd))

In [None]:
d = 8
n = 768
def somme_erreur_d(theta_concatenated, alpha, x, y):
    theta0 = np.ones((n,1))*theta_concatenated[0]
    theta1 = np.ones((1, d))*np.array([theta_concatenated[1:]])
    residus = yd - theta0 - np.dot(xd, theta1.T)
    return float(sum(list(map(lambda t : ro(t, alpha), residus.ravel()))))

def minimize_theta_chapeau_d(alpha, x, y):
    return minimize(somme_erreur_d,[0,0,0,0,0,0,0,0,0], method='nelder-mead',
                    args= (alpha, x, y) ,options={'xtol': 1e-5}).x

def eqmboot_d(alpha_local, x, theta, residus):
    V_alpha = np.ones((B, 9)) * theta_d
    theta_0 = theta_d[0]
    theta_1 = np.array(pd.DataFrame(theta_d[1:]))
    V = np.zeros((B, 9))
    for i in range(0, B):
        boot = np.array([np.random.choice(residus.ravel(), n, replace=True)])
        ylocal = theta_0 + np.dot(x, theta_1) + boot.transpose()
        V[i] = minimize_theta_chapeau_d(alpha_local, x, ylocal)
    return (1 / B) * np.trace(np.dot(V - V_alpha, (V - V_alpha).T))

In [None]:
theta_d = minimize_theta_chapeau_d(2, xd, yd) # on trouve un premier theta pour alpha = 2
residus = yd - theta_d[0] - np.dot(xd, np.array(pd.DataFrame(theta_d[1:])))
#on crée la base de residus servant au bootstrap

In [None]:
x_alpha_diabete = np.array(range(0,20))*0.2
y_alpha_diabete = list(map(lambda t : eqmboot_d(t, xd, theta_d, residus), x_alpha_diabete))

In [None]:
plt.figure()
plt.plot(x_alpha_diabete, y_alpha_diabete)
plt.title('Erreur quadratique en fonction de alpha à partir de la BDD de diabete')
plt.show()
print(np.amin(y_alpha_diabete))

# Partie 2

## Quesiton 9

In [None]:
import statsmodels.datasets as sd
data = sd.get_rdataset('airquality').data
data = data.dropna(axis=0, how='any')
y = np.array(data['Ozone'])
x = data.drop('Ozone', 1)
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)
data.head()

## Question 10

In [None]:
import numpy as np
from sklearn.linear_model.base import LinearModel
from sklearn.base import RegressorMixin

def stpforward(X, y, M):
    """Orthogonal Matching Pursuit model (OMP).
    X: Array-like, shape (n_samples, n_features).
    Training data.
    y: Array-like, shape (n_samples, ).
    Target values.
    M: Integer, in [1,n_features]
    """
    selected_variables = []
    residual = y
    p = X.shape[1]
    coef_selected = np.zeros(p)
    for i in range(1, M + 1):
        tab_alphaj = np.zeros(p)
        for j in range(0, p):
            if(j not in selected_variables):
                # Compute Alphaj and add it in tab_Alphaj
                Xj = X[:, j]
                # XXX: Get alpha_j value here
                valeur_alphaj = np.abs(Xj.dot(residual))
                tab_alphaj[j] = valeur_alphaj
        jmax = np.argmax(tab_alphaj)
        selected_variables.append(jmax)
        X_selected = X[:, selected_variables]
        # XXX: perform OLS over selected variables
        skl_linmod = skl.linear_model.LinearRegression()
        skl_linmod.fit(X_selected,residual)
        # Store coefficients
        coef_selected[selected_variables] = skl_linmod.coef_
        # XXX: Update residual
        residual= y - skl_linmod.predict(X_selected)
    return coef_selected, selected_variables

stpforward(x,y,2)

## Question 11

In [None]:
class MYOMP(LinearModel, RegressorMixin):
    """Orthogonal Matching Pursuit model (OMP).
    Parameters
    ----------
    n_nonzero_coefs : int, optional
    Desired number of non-zero entries in the solution. If None (by
    default) this value is set to 10% of n_features.
    """
    def __init__(self, n_nonzero_coefs=None, fit_intercept=False, 
                 normalize=False, precompute='auto'):
        self.fit_intercept = False
        self.normalize = normalize
        self.precompute = precompute
        self.n_nonzero_coefs = n_nonzero_coefs
        
    def fit(self, X, y):
        """Fit the model using X, y as training data.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        Training data.
        y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Target values.
        Returns
        -------
        self : object
        returns an instance of self.
        """
        if not self.n_nonzero_coefs :
            M = int(np.round(X.shape[1]*0.1))
        else:
            M = self.n_nonzero_coefs
        self.coef_ = np.zeros([X.shape[1], ])
        self.coef_ = stpforward(X,y,M)
        self.intercept_ = 0.
        return self

## Question 12

In [None]:
for i in range(1,6):
    myomp = MYOMP(n_nonzero_coefs=i)
    myomp.fit(x,y)
    print("MYOP pour M =", i, ": ", list(myomp.coef_))

## Question 13

In [None]:
from sklearn.linear_model import OrthogonalMatchingPursuit as OMP
OMP1=OMP(n_nonzero_coefs=1, fit_intercept=False)
OMP1.fit(x,y).coef_
OMP2=OMP(n_nonzero_coefs=2)
OMP2.fit(x,y).coef_
OMP3=OMP(n_nonzero_coefs=3)
OMP3.fit(x,y).coef_
OMP4=OMP(n_nonzero_coefs=4)
OMP4.fit(x,y).coef_
OMP5=OMP(n_nonzero_coefs=5)
OMP5.fit(x,y).coef_

## Question 14

In [None]:
from sklearn.linear_model import OrthogonalMatchingPursuitCV
ompcv = OrthogonalMatchingPursuitCV(cv = 3)
ompcv.fit(x, y)
ompcv.coef_

In [None]:
from sklearn.model_selection import GridSearchCV
M = np.array([1, 2, 3, 4, 5])
tuned_parameters = [{'n_nonzero_coefs': M}]
clf = GridSearchCV(OMP(), tuned_parameters, cv=3)
clf.fit(x,y)
print(clf.best_params_)