# k plus proches voisins - en régression

Données : auto-mpg

disponible sur https://raw.githubusercontent.com/ContinuumIO/cdx/master/cdx/remotedata/auto-mpg.csv

In [None]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors,KNeighborsRegressor
from sklearn.preprocessing import scale
from sklearn.cross_validation import cross_val_score
from sklearn.utils import shuffle
from sklearn.decomposition import PCA,RandomizedPCA
from sklearn.linear_model import Ridge

In [None]:
# Fonctions

In [None]:
# Load the auto-mpg table from a CSV file expected in the working directory.
data = pd.read_csv('auto-mpg.csv')

In [None]:
# Source: https://raw.githubusercontent.com/ContinuumIO/cdx/master/cdx/remotedata/auto-mpg.csv
# Discard the 'origin' and 'name' columns, then rename 'displ' -> 'cc'
# and 'cyl' -> 'ncyl'.
data = (
    data
    .drop(['origin', 'name'], axis=1)
    .rename(columns={'displ': 'cc', 'cyl': 'ncyl'})
)

In [None]:
# Preview the first two rows of the cleaned table.
data.head(2)

In [None]:
# Display pandas' descriptive summary of the table.
data.describe()

In [None]:
# Draw the histograms of `data` on a 15x10 figure.
# The trailing `pass` keeps the cell from ending on the hist() expression
# (presumably to avoid echoing its return value in the notebook).
data.hist(figsize=(15, 10))
pass

In [None]:
# Heatmap of the pairwise correlation matrix of the columns of `data`.
import seaborn as sea

sea.set(font_scale=1.5)
plt.figure(figsize=(8, 8))
sea.heatmap(data.corr())

In [None]:
# Feature matrix: the 'weight' and 'hp' columns as floats; target: 'mpg'.
# `.values` and the builtin `float` replace `.as_matrix()` and `np.float`,
# which were removed from pandas 1.0 and NumPy 1.24 respectively; both
# replacements also work on the older versions.
X = data[['weight', 'hp']].values.astype(float)
y = data['mpg'].values
# Shuffle the rows of X and y together (unseeded, so the order changes per run).
X, y = shuffle(X, y)

# One-column arrays for each feature; indexing with a list keeps them 2-D.
X0 = X[:, [0]]  # weight
X1 = X[:, [1]]  # hp

In [None]:
# Weight against horsepower, each point coloured by its mpg value.
plt.scatter(X0, X1, c=y, cmap='jet')
plt.xlabel('weight')
plt.ylabel('hp')
plt.colorbar()

In [None]:
# Side-by-side scatter plots of the target against each single feature.
# The y-axis labels previously read 'mgp'; the plotted quantity is 'mpg'.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

ax1.scatter(X0, y)
ax1.set_xlabel('weight')
ax1.set_ylabel('mpg')

ax2.scatter(X1, y)
ax2.set_xlabel('hp')
ax2.set_ylabel('mpg')

### Avant de commencer: Ridge Regression


In [None]:
# Baseline: mean cross-validated score (default scoring) of a ridge model
# with alpha=0 on the two raw features.
# cross_val_score fits its own clone of the estimator on every fold, so
# calling .fit() beforehand was wasted work and has been dropped.
print(np.mean(cross_val_score(Ridge(alpha=0.0), X, y)))

# Same model on an augmented design matrix: the raw features, their cubes,
# and the weight * hp interaction term.
X_augment = np.hstack((X, X**3, X0*X1))
print(np.mean(cross_val_score(Ridge(alpha=0.0), X_augment, y)))

### Comparaison des y réels avec y prédits par régression ridge augmentée

In [None]:
# Left panel: observed mpg in the (weight, hp) plane.
# Right panel: mpg predicted by a default-parameter Ridge fitted on the
# augmented features, drawn in the same plane.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

ax1.scatter(X0, X1, c=y, cmap='jet')
ax1.set_xlabel('weight')
ax1.set_ylabel('hp')

ri = Ridge().fit(X_augment, y)
ax2.scatter(X0, X1, c=ri.predict(X_augment), cmap='jet')
ax2.set_xlabel('weight')
ax2.set_ylabel('hp')

### Prédiction du 1-ppv

In [None]:
def show_1d_model(model, x, y, n_points=80):
    """Plot 1-D data and the prediction curve of a fitted model.

    Draws a scatter plot of ``y`` against ``x``, then overlays in red the
    values predicted by ``model`` on an evenly spaced grid spanning the
    range of ``x``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``; it is called with an array
        of shape ``(n_points, 1)``.
    x : array-like
        Values of the single explanatory variable.
    y : array-like
        Target values, plotted against ``x``.
    n_points : int, optional
        Number of grid points used to draw the curve (default 80).
    """
    plt.scatter(x, y)
    # np.min / np.max return scalars whatever the shape of x, unlike the
    # builtins, which iterate row by row and return 1-element arrays when
    # x is a column matrix.
    grid = np.linspace(np.min(x), np.max(x), n_points).reshape(n_points, 1)
    plt.plot(grid, model.predict(grid), c='r')

# 1-nearest-neighbour regressor fitted on weight alone, with its prediction curve.
show_1d_model(KNeighborsRegressor(n_neighbors=1).fit(X0, y), X0, y)

### Prédiction du 10-ppv

In [None]:
# Same plot with 10 neighbours.
show_1d_model(KNeighborsRegressor(n_neighbors=10).fit(X0, y), X0, y)

In [None]:
# Same plot with 100 neighbours.
show_1d_model(KNeighborsRegressor(n_neighbors=100).fit(X0, y), X0, y)

In [None]:
# Same plot with 250 neighbours.
show_1d_model(KNeighborsRegressor(n_neighbors=250).fit(X0, y), X0, y)

### Scores des plus proches voisins

In [None]:
# Mean cross-validated score of a 30-nearest-neighbour regressor on weight
# only, on hp only, and on both features.
# The first line previously used X1 like the second, so the weight-only
# score was never computed. The .fit() calls were dropped because
# cross_val_score fits its own clone of the estimator on every fold.
print(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=30), X0, y)))
print(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=30), X1, y)))
print(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=30), X, y)))

### Avec normalisation

In [None]:
# Rescale the features with sklearn.preprocessing.scale before measuring
# neighbour distances. This rebinds X only; X0, X1 and X_augment keep the
# unscaled values.
X = scale(X)
# No prior .fit(): cross_val_score fits its own clone of the estimator per fold.
print(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=30), X, y)))

## Utilisation de l'ACP

In [None]:
# Illustration of PCA on a synthetic, strongly correlated 2-D cloud:
# raw points on the left, PCA-transformed points on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

Z = np.random.normal(size=(1000, 1))
Z = np.hstack((10.0 * Z, 100 + 15.0 * Z + 5 * np.random.normal(size=(1000, 1))))
ax1.scatter(Z[:, 0], Z[:, 1])

# Note: whiten=True makes the transformed components have unit variance.
zx, zy = PCA(whiten=True).fit(Z).transform(Z).T
ax2.scatter(zx, zy)

print("moyennes =", zx.mean(), "et", zy.mean())
print("variances=", zx.var(), "et", zy.var())

### Score en validation croisée sur des données après ACP

In [None]:
# Project the features onto their first whitened principal component,
# then score the 30-nearest-neighbour regressor on that single coordinate.
X_pca = RandomizedPCA(n_components=1, whiten=True).fit(X).transform(X)
# No prior .fit(): cross_val_score fits its own clone of the estimator per fold.
print(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=30), X_pca, y)))

## Recherche du meilleur scaling

In [None]:
def local_PCA(X, y, k=10, percent=50):
    """Re-express X in a whitened PCA basis learnt from local difference vectors.

    For every sample, its ``k`` nearest neighbours in ``X`` are looked up.
    Among them, only the neighbours whose target differs from the sample's
    target by strictly less than the ``percent``-th percentile of those
    absolute differences are kept. The vectors ``sample - neighbour`` for
    all kept pairs are stacked, a whitened ``RandomizedPCA`` is fitted on
    that stack, and ``X`` transformed by it is returned.

    Parameters
    ----------
    X : ndarray, 2-D
        Sample matrix, one row per sample.
    y : ndarray, 1-D
        Target value of each row of ``X``.
    k : int, optional
        Number of nearest neighbours examined per sample (default 10).
    percent : float, optional
        Percentile (0-100) of the absolute target differences used as the
        strict upper cut-off for keeping a neighbour (default 50).

    Returns
    -------
    ndarray
        ``X`` transformed by the PCA fitted on the local difference vectors.
    """
    nn = NearestNeighbors(n_neighbors=k).fit(X)
    # Called without a query: returns the neighbours of the fitted samples.
    neighbor_idx = nn.kneighbors(return_distance=False)

    # Collect the per-sample blocks and stack them once at the end; calling
    # np.vstack inside the loop re-copied the whole accumulated array on
    # every iteration. The leading empty (0, n_features) block keeps the
    # result well-shaped even when no neighbour is kept.
    chunks = [np.empty((0, X.shape[1]))]
    for p, yp, idx in zip(X, y, neighbor_idx):
        target_gap = np.abs(y[idx] - yp)
        kept = idx[target_gap < np.percentile(target_gap, percent)]
        chunks.append(p - X[kept])
    local_vectors = np.vstack(chunks)

    pca = RandomizedPCA(whiten=True).fit(local_vectors)
    return pca.transform(X)

# Testing local_PCA on synthetic data: the target is a sine of a linear
# combination of two Gaussian features. The 10-NN cross-validated score is
# recorded before each of 10 successive local_PCA re-projections.
Z = np.random.normal(size=(5000, 2))
zy = np.sin(15 * Z[:, 0] - 8 * Z[:, 1])

scores = []
for i in range(10):
    # No prior .fit(): cross_val_score fits its own clone of the estimator per fold.
    scores.append(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=10), Z, zy)))
    Z = local_PCA(Z, zy)
plt.plot(scores)

In [None]:
# Apply local_PCA repeatedly to the scaled auto-mpg features, printing the
# 30-NN cross-validated score before each of the 20 re-projections.
X_local = scale(X)
for i in range(20):
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3; this form runs on both.
    # No prior .fit(): cross_val_score fits its own clone of the estimator per fold.
    print(np.mean(cross_val_score(KNeighborsRegressor(n_neighbors=30), X_local, y)))
    X_local = local_PCA(X_local, y, k=10, percent=30)
