In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Read the photo-z sample from disk; the cells below index it as a 2-D array
# (row filtering with axis=1, column slicing with data[:, k]).
data = np.load(file='../data/sdss_photoz.npy')

In [None]:
# Discard every row that contains at least one negative entry.
data = data[np.logical_not(np.any(data < 0, axis=1))]
N = data.shape[0]

# Differences of adjacent columns 0..4, written into a float64 buffer:
#   X[:, 0] = u - g,  X[:, 1] = g - r,  X[:, 2] = r - i,  X[:, 3] = i - z
X = np.zeros((N, 4))
X[...] = data[:, 0:4] - data[:, 1:5]

# Column 5 is the regression target.
z = data[:, 5] # redshift

In [None]:
# Write two .npz archives into the current working directory:
#   * every column of `data` except the last one, plus the redshift vector
#   * the four colour features, plus the redshift vector
np.savez(
    'sdss_photoz_ugrizz.npz',
    photom=data[:, :-1],
    redshift=z,
)
np.savez(
    'sdss_photoz_colorsz.npz',
    colors=X,
    redshift=z,
)

In [None]:
# `train_test_split` is not imported by any earlier cell, so on a fresh kernel
# this cell raised NameError. Import it here, next to its first use.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, z_train, z_test = train_test_split(
    X, z, test_size=0.25, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree on the colour features.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits could otherwise be
# resolved differently on each run.
clf = DecisionTreeRegressor(max_depth=20, random_state=42)
clf.fit(X_train, z_train)

# Predicted redshift for the held-out rows.
z_pred = clf.predict(X_test)

In [None]:
# Root-mean-square error of the predictions on the held-out rows.
rms = np.sqrt(np.mean((z_test - z_pred) ** 2))

# print() with a single argument works on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
print(rms)
print(len(z_test))                          # number of held-out rows
print(np.sum(abs(z_test - z_pred) > 1))     # predictions off by more than 1 in z

In [None]:
ax = plt.axes()
axis_lim = np.array([-0.01, 0.8])

# Predicted vs. true redshift, with a dashed black y = x reference line.
ax.scatter(z_test, z_pred, s=10)
ax.plot(axis_lim, axis_lim, '--k')

ax.set_title('Photo-z: Decision Tree Regression')
ax.set_xlabel(r'$\mathrm{z_{true}}$', fontsize=14)
ax.set_ylabel(r'$\mathrm{z_{phot}}$', fontsize=14);

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Forest of 100 trees. The tree count is passed by keyword for readability,
# and random_state is pinned so the bootstrap samples and feature draws (and
# therefore the metrics reported below) are reproducible across runs.
clf = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Train on the training rows, then predict the held-out rows.
clf.fit(X_train, z_train)
z_pred = clf.predict(X_test)

In [None]:
# Root-mean-square error of the predictions on the held-out rows.
rms = np.sqrt(np.mean((z_test - z_pred) ** 2))

# print() with a single argument works on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
print(rms)
print(len(z_test))                          # number of held-out rows
print(np.sum(abs(z_test - z_pred) > 1.))    # predictions off by more than 1 in z

In [None]:
# Score of the currently fitted `clf` on the held-out rows, shown as the cell's
# output. For scikit-learn regressors `score` is documented as the coefficient
# of determination (R^2).
clf.score(X_test, z_test)

In [None]:
ax = plt.axes()
axis_lim = np.array([-0.01, 0.8])

# Predicted vs. true redshift, with a dashed black y = x reference line.
ax.scatter(z_test, z_pred, s=10)
ax.plot(axis_lim, axis_lim, '--k')

ax.set_title('Photo-z: Random Forest Regression')
ax.set_xlabel(r'$\mathrm{z_{true}}$', fontsize=14)
ax.set_ylabel(r'$\mathrm{z_{phot}}$', fontsize=14);

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Interactive help lookup: the trailing `?` is IPython syntax that displays the
# object's docstring. It is not valid plain Python, so this cell only runs
# inside IPython/Jupyter.
GradientBoostingRegressor?

In [None]:
from sklearn.decomposition import PCA
# `train_test_split` is not imported by any earlier cell; import it here so this
# cell runs on a fresh kernel.
from sklearn.model_selection import train_test_split

# Split BEFORE fitting the projection. Fitting PCA on all of X lets the
# held-out rows influence the preprocessing (data leakage). The seed and sample
# count match the first split in this notebook, so the same rows are held out.
X_train_raw, X_test_raw, z_train, z_test = train_test_split(
    X, z, test_size=0.25, random_state=42)

# Learn the 4-component projection from the training rows only, then apply the
# same fitted transform to the held-out rows.
pca = PCA(n_components=4)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Projection of the full sample, kept so the name `X_pca` remains defined.
X_pca = pca.transform(X)

In [None]:
# Gradient-boosted trees: 100 boosting stages with a shrinkage of 0.1.
# random_state is pinned so the per-tree seeding and feature permutation at
# splits are repeatable, keeping the metrics below reproducible.
clf = GradientBoostingRegressor(
    learning_rate=0.1, n_estimators=100, random_state=42)

In [None]:
# Train on the training rows, then predict the held-out rows.
clf.fit(X_train, z_train)
z_pred = clf.predict(X_test)

In [None]:
# Score of the currently fitted `clf` on the held-out rows, shown as the cell's
# output. For scikit-learn regressors `score` is documented as the coefficient
# of determination (R^2).
clf.score(X_test, z_test)

In [None]:
# Root-mean-square error of the predictions on the held-out rows.
rms = np.sqrt(np.mean((z_test - z_pred) ** 2))

# print() with a single argument works on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
print(rms)
print(len(z_test))                          # number of held-out rows
print(np.sum(abs(z_test - z_pred) > 1.))    # predictions off by more than 1 in z

In [None]:
ax = plt.axes()
axis_lim = np.array([-0.01, 0.8])

# Predicted vs. true redshift, with a dashed black y = x reference line.
ax.scatter(z_test, z_pred, s=10)
ax.plot(axis_lim, axis_lim, '--k')

ax.set_title('Photo-z: Gradient Boosting Regression')
ax.set_xlabel(r'$\mathrm{z_{true}}$', fontsize=14)
ax.set_ylabel(r'$\mathrm{z_{phot}}$', fontsize=14);