In [None]:
# List the contents of the ../data directory that the exercises below load their files from
!ls ../data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn.apionly as sns
%matplotlib inline

# Exercise 1

- load star/quasar photometry data
- train and evaluate using different classifiers:
    - *sklearn.neighbors.KNeighborsClassifier*
    - *sklearn.ensemble.RandomForestClassifier*
    - *sklearn.linear_model.LogisticRegression*
    - *sklearn.svm.SVC*
    - *sklearn.tree.DecisionTreeClassifier*
    
- try different hyperparameters, e.g. *penalty='l2'* in Logistic Regression; number of estimators, depth of a tree and criterion method in RF; k in KNN; C and kernel='rbf' with gamma in SVC.

Use `model?` to open the help and see which hyperparameters are available.

How do these perform on the training set vs the test set? Which one is the best on the training set, and which one is the best on the test set? What about KNN with k=1 or a deep DecisionTreeClassifier with max_depth=None?

- What works better for training? Filters, color indices or all together?
- Try to scale your data to zero mean and unit standard deviation. Does the result change?

In [None]:
from pandas import read_csv
# Load the photometry table from ../data/sdss_photo.csv into a pandas DataFrame.
# Reminder: convert the columns you need to a numpy array before fitting,
# e.g. data = np.array(dataframe)
data = read_csv('../data/sdss_photo.csv')
data.head(3)  # preview the first three rows

In [None]:
# Feature matrix built from two columns of the table: 'mag_u' and 'mag_g'.
X=np.array(data[['mag_u','mag_g']])
# Raw 'target' column.  Note it is stored as `yy`, while the later cells use `y`
# (presumably the 0/1-encoded version of this array — see the hints in the next cell).
yy = np.array(data['target'])
np.unique(yy)  # show the distinct target values

In [None]:
# Keep in mind that scikit-learn understands matrices and vectors of numbers
# Don't forget to convert targets to 0, 1
# you can use pandas, numpy, or sklearn.preprocessing.LabelEncoder (see the cheat sheet)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split features and targets into train and test subsets using train_test_split's defaults.
# NOTE(review): `y` is not defined in the cells above (only `yy` is); it is presumably
# the 0/1-encoded target you create in the conversion step — confirm before running.
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [1]:
# import model (template: fill in the module and class, e.g. one of the classifiers listed in Exercise 1)
from sklearn ...

In [None]:
# initialise the model (template: fill in the right-hand side with an instance of the imported classifier)
model = 

In [None]:
# train on the train set
# (template: `model.fit` on its own only references the method — call it with the training data)
model.fit

In [None]:
# what does the model predict for the test set
# (template: as written this binds the method itself to y_pred — call it with the test features)
y_pred = model.predict

In [None]:
# accuracy by hand: fraction of test samples whose predicted label equals the true label
np.mean(y_pred == y_test)

In [None]:
# the same check through the estimator's own `score` method
# (for scikit-learn classifiers, `score` reports the mean accuracy on the given data)
model.score(X_test, y_test)

# Exercise 2

- Is your model really good, or were you just lucky? Try cross-validation, e.g. StratifiedKFold cross-validation.
- Check different accuracy metrics

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold, train_test_split

In [None]:
# LogisticRegression is not imported by any earlier cell of this template
# (the model import above is a placeholder), so bring it into scope here to
# keep this cell runnable.
from sklearn.linear_model import LogisticRegression

# 10-fold cross-validation on the training set, scored with two metrics at once:
# ROC AUC and average precision.  class_weight='balanced' re-weights the classes
# inversely to their frequency in y_train.
scores = cross_validate(LogisticRegression(class_weight='balanced'),
                        X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'))
# mean of each metric over the folds
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()

# Exercise 3

- load spectral lines dataset
- run your favourite classifier and check the performance per class
- what if you want to emphasise precise identification of the type 4 spectral line (the rarest case)?

Check the weight keywords (e.g. `class_weight`)

In [None]:
# Load the spectral-lines archive; note that this rebinds `data`, `X` and `y` from the earlier exercise.
data = np.load('../data/spectral_lines.npz')
X = data['spec']  # array stored under the 'spec' key, used as the features
y = data['target']  # array stored under the 'target' key, used as the labels

In [None]:
from sklearn.metrics import average_precision_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Exercise 4

- Use the spectral lines dataset
- Run PCA to shrink the dimensionality of the dataset to keep 80% of the information (in the sense of variance)
- Run a classifier of your choice on the new dataset in lower dim space
- Compare the classification scores between the high-dim dataset (n_components=all) and the low-dim one (n_components=5)
- For visualisation purposes transform the data to 2D and use a scatter plot to visualise your classification result

- Then try *sklearn.manifold.TSNE* instead of PCA, both for classification and 2D visualisation
- Plot the learning curve. What does the result mean for learning?

In [None]:
# to plot the 2D scatter plot
# `X_2d` must hold the 2-D projection of the data (computed by you beforehand);
# points are coloured by their class label `y` using 4 discrete colours.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, lut) arguments.
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', 4))
plt.colorbar();

In [None]:
from sklearn.model_selection import learning_curve

def rms_error(model, X, y):
    """Scorer: root-mean-square difference between ``y`` and ``model.predict(X)``.

    The ``(estimator, X, y)`` signature lets it be passed as a ``scoring`` callable.
    """
    residuals = y - model.predict(X)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

def plot_with_err(x, data, **kwargs):
    """Draw the mean of ``data`` along axis 1 against ``x`` with a shaded +/- 1 std band.

    Extra keyword arguments are forwarded to ``plt.plot`` for the mean line;
    the band reuses that line's colour.
    """
    center = data.mean(1)
    spread = data.std(1)
    mean_line = plt.plot(x, center, '-', **kwargs)[0]
    band_color = mean_line.get_color()
    plt.fill_between(x, center - spread, center + spread,
                     edgecolor='none', facecolor=band_color, alpha=0.2)
    
def plot_learning_curve(clf, X, y, cv=5):
    """Plot training and validation RMS error against training-set size.

    Parameters
    ----------
    clf : estimator
        scikit-learn estimator handed to ``learning_curve``.
    X, y : array-like
        Features and targets.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``learning_curve``.

    Draws on the current matplotlib figure; returns nothing.
    """
    # Relative training-set sizes: 10 steps from 5% to 100% of the data available
    # for training.  Previously this grid was built but never passed on, so
    # learning_curve silently used its own default sizes instead.
    train_sizes = np.linspace(0.05, 1., 10)
    N_train, val_train, val_test = learning_curve(
        clf, X, y, train_sizes=train_sizes, cv=cv,
        scoring=rms_error, shuffle=True)
    # One row per training size, one column per CV fold; plot_with_err
    # averages over the folds and shades the fold-to-fold spread.
    plot_with_err(N_train, val_train, label='training scores')
    plot_with_err(N_train, val_test, label='validation scores')
    plt.xlabel('Training Set Size'); plt.ylabel('rms error')
    plt.legend()

# Exercise 5
- Try to find 4 clusters in the dataset with a clustering algorithm of your choice (e.g. *sklearn.cluster.KMeans*)
- Try to enhance it with T-SNE to 2D and visualise the ground truth and predicted clusters

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Cluster ids are arbitrary: they can be a permutation of the true class labels.
# We hope the majority of each cluster is grouped correctly, so we assign every
# cluster the most common true label among its members.
from scipy.stats import mode

labels = np.zeros_like(clusters)  # same shape and dtype as `clusters`
for i in range(4):  # cluster ids 0..3
    mask = (clusters == i)  # points assigned to cluster i
    labels[mask] = mode(y[mask])[0]  # [0] picks the mode value out of scipy's (mode, count) result

In [None]:
# Raw cluster assignments from the fitted estimator, kept under their own name.
# The original rebound `labels` here, which overwrote the cluster-to-class mapping
# computed in the cell above and made the accuracy / confusion-matrix cells below
# compare arbitrary cluster ids with the true classes.
predicted_clusters = est.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=predicted_clusters, s=50, cmap='rainbow')
plt.title('Prediceted clusters')
# second figure: the same points coloured by their true class
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow')
plt.title('True classes');

In [None]:
from sklearn.metrics import accuracy_score
# fraction of samples whose entry in `labels` matches the true class in `y`
accuracy_score(y, labels)

In [None]:
from sklearn.metrics import confusion_matrix

# Compute the matrix once and reuse it for both the printout and the image
# (it was previously evaluated twice on the same inputs).
conf_mat = confusion_matrix(y, labels)
print(conf_mat)

# Rows are true classes, columns are predicted classes.
plt.imshow(conf_mat, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted');

# Exercise 6
- Use brute force (a grid search) to find the best classifier hyperparameters
- Check `SVC?` to see what the hyperparameters mean

In [None]:
from sklearn.svm import SVC # You may use this one or DecisionTree because they are fast
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the SVC search; both axes are sampled on a log scale.
svc_params = dict(
    C=np.logspace(-2, 2, num=5),      # 10**-2, 10**-1, ..., 10**2
    gamma=np.logspace(-4, 0, num=5),  # 10**-4, 10**-3, ..., 10**0
)

# Exercise 7
- load SDSS photometric redshift dataset (color indices or magnitudes in filters)
- create a regression model to calculate the best redshift estimator, *sklearn.ensemble.RandomForestRegressor*, *sklearn.ensemble.GradientBoostingRegressor*
- use cross-validation to evaluate the result

In [None]:
# Load the photometric-redshift archive (this rebinds `data` and `X`).
data = np.load('../data/sdss_photoz_colorsz.npz')
X = data['colors']  # features: the array stored under 'colors'
z = data['redshift']  # regression target: the array stored under 'redshift'
# Alternative feature set: swap in the 'photom' array instead of 'colors'
# X = data['photom']
# z = data['redshift']

In [None]:
# root-mean-square difference between the true and predicted redshifts of the test set
rms = np.sqrt(np.square(z_test - z_pred).mean())

In [None]:
# plotting result: predicted vs. true redshift, with the y = x line for reference

axis_lim = np.array([-0.01, 0.8])  # end points of the diagonal reference line
plt.scatter(z_test, z_pred, s=10)
plt.plot(axis_lim, axis_lim, '--k')  # dashed black one-to-one line

# NOTE(review): the title names a decision tree; adjust it if you fit a different regressor
plt.title('Photo-z: Decision Tree Regression')
plt.xlabel(r'$\mathrm{z_{true}}$', fontsize=14)
plt.ylabel(r'$\mathrm{z_{phot}}$', fontsize=14);

# Exercise 8
- choose any dataset, run any of e.g. RandomForest, DecisionTree Regressor/Classifier, LinearRegression or LogisticRegression, and check the feature importance


In [None]:
# GradientBoostingClassifier is not imported by any earlier cell of this notebook,
# so import it here to keep the cell runnable.
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with a small learning rate and a fixed seed.
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gbrt.fit(X_train, y_train)

print("accuracy on training set: %f" % gbrt.score(X_train, y_train))
print("accuracy on test set: %f" % gbrt.score(X_test, y_test))

# One horizontal bar per feature.  The bar count comes from the fitted model
# rather than from the global `X`, which an earlier exercise may have rebound
# to a dataset with a different number of columns than X_train.
importances = gbrt.feature_importances_
plt.barh(range(len(importances)), importances)
ax = plt.gca()
ax.set_position([0.4, .2, .9, .9])