# ANU ASTR4004 2024 - Week 8 (24+26 September 2024)

Author: Dr Sven Buder (sven.buder@anu.edu.au)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Dimensionality-Reduction" data-toc-modified-id="Dimensionality-Reduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dimensionality Reduction</a></span></li><li><span><a href="#sklearn" data-toc-modified-id="sklearn-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>sklearn</a></span><ul class="toc-item"><li><span><a href="#sklearn.tree" data-toc-modified-id="sklearn.tree-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>sklearn.tree</a></span></li><li><span><a href="#sklearn.neighbors" data-toc-modified-id="sklearn.neighbors-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>sklearn.neighbors</a></span></li></ul></li></ul></div>

In [1]:
try:
    %matplotlib inline
    %config InlineBackend.figure_format='retina'
except:
    pass

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import patches

# Larger default fonts so that figures stay readable in this presentation
plt.rcParams.update({
    'font.size': 15,
    'legend.fontsize': 12,
})

## Dimensionality Reduction

In [None]:
# Companion material for this section (file names, not executable code):
#   DimensionalityReduction_Spectra_Images.ipynb
#   Images.tar

## sklearn

In [None]:
import pandas as pd
from scipy import stats

# Source noted by the author: https://exoplanet.eu/catalog/csv/
# NOTE(review): the file name refers to a "phl" exoplanet catalog - confirm that
# the URL above is really where this CSV came from.
catalog_path = 'data/phl_exoplanet_catalog.csv'
LearningSet = pd.read_csv(catalog_path, sep=',')

# Show the first rows as a quick sanity check of the loaded table
LearningSet.head()

In [None]:
# Columns used as input features for the classifiers
feature_columns = ['S_MASS', 'P_PERIOD', 'P_DISTANCE']

# Binary labels: every P_HABITABLE value above 1 is merged into class 1
targets = np.array(LearningSet['P_HABITABLE'], dtype=int)
targets[targets > 1] = 1

# Drop every row that has a NaN in at least one of the feature columns
final_features = LearningSet[feature_columns].dropna(axis=0)

# Eliminate 5-sigma outliers: keep rows whose |z-score| is below 5 in all columns
within_5_sigma = (np.abs(stats.zscore(final_features)) < 5).all(axis=1)
final_features = final_features[within_5_sigma]

# The surviving rows still carry their original LearningSet row labels,
# which are used here to pick the matching entries of the targets array
targets = targets[final_features.index]

# Renumber the rows 0..n-1 so features and targets line up positionally
final_features = final_features.reset_index(drop=True)

final_features.describe()

In [None]:
# Scatter plot of stellar mass against orbital period, coloured by class label.
fig, ax = plt.subplots(figsize=(10, 6))

# Two-colour map: the lowest target value is drawn in teal, the highest in magenta
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#20B2AA', '#FF00FF'])

ax.scatter(final_features['S_MASS'], final_features['P_PERIOD'], marker='o',
           c=targets, s=50, cmap=cmap)

ax.set_yscale('log')
ax.set_xlabel('Mass of Parent Star (Solar Mass Units)')
ax.set_ylabel('Period of Orbit (days)')

# A scatter coloured through `c=` has no per-class legend entries, so the
# legend is built from proxy patches carrying the two class colours.
bluepatch = patches.Patch(color='#20B2AA', label='Not Habitable')
magentapatch = patches.Patch(color='#FF00FF', label='Habitable')

ax.legend(handles=[magentapatch, bluepatch],
          loc='lower right', fontsize=14);

In [None]:
# Select a training and test set:
from sklearn.model_selection import train_test_split

# Hold out part of the sample for testing; random_state is fixed so that
# re-running the cell reproduces the same split.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    final_features,
    targets,
    random_state=2,
)

# Sizes of both subsets (by default the split is 0.75 train : 0.25 test)
Xtrain.shape, Xtest.shape

# We are now ready to fit the data with different models

### sklearn.tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [None]:
# Pick the method and its parameters when constructing the estimator.
model = DecisionTreeClassifier(random_state=3)

# Building (training) the model is a single call in sklearn.
model.fit(Xtrain, ytrain)

# How accurate is this model on the data it was trained on and on unseen data?
train_accuracy = metrics.accuracy_score(ytrain, model.predict(Xtrain))
test_accuracy = metrics.accuracy_score(ytest, model.predict(Xtest))

print(train_accuracy)  # train score
print(test_accuracy)  # test score

In [None]:
# The score above looks pretty high, but how does it compare with a lazy
# classifier that places everything in the "not habitable" category (label 0)?
all_not_habitable = np.zeros(len(ytest))

# Performance of this dummy classifier on the test set
print(metrics.accuracy_score(ytest, all_not_habitable))

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the y axis ('True label') and
        columns along the x axis ('Predicted label').
    classes : sequence of str
        Tick labels, used for both axes.
    normalize : bool, optional
        If True, every row is divided by its sum before printing/plotting.
        Rows that sum to zero are left at zero instead of becoming NaN.
    title : str, optional
        Title of the figure.
    cmap : matplotlib colormap, optional
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The matrix is printed and drawn into a new matplotlib figure.
    """
    cm = np.asarray(cm)

    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # Avoid 0/0 for rows without any samples: dividing by 1 keeps them at 0.
        row_sums[row_sums == 0] = 1
        cm = cm.astype('float') / row_sums
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.figure(figsize=(7,6))
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as integers; anything else (normalized or
    # float input) with two decimals, so a float matrix no longer fails on 'd'.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # Diagonal cells (true == predicted) in green, all others in red
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center", verticalalignment="center",
                     color="green" if i == j else "red", fontsize = 30)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the labels exist so that they are part of the layout computation.
    plt.tight_layout()
    
# Confusion matrix of the current model on the held-out test set
ytest_predicted = model.predict(Xtest)
cm = metrics.confusion_matrix(ytest, ytest_predicted)

plot_confusion_matrix(cm, ['Not Hab', 'Hab'], cmap=plt.cm.Pastel2)

### sklearn.neighbors

In [None]:
from sklearn import neighbors

In [None]:
# k-nearest-neighbours classifier configured with 3 neighbours.
# Note: this rebinds `model`, replacing any model stored under that name before.
model = neighbors.KNeighborsClassifier(
    n_neighbors=3,
)
model

In [None]:
# Fit and predict, using only the first two feature columns.

# Fitting stores the training data; the fitted model can then be used to
# predict labels for new instances.
model.fit(Xtrain.iloc[:, :2], ytrain)

# Predict the labels of every object in the test set with the fitted model.
ytestpred = model.predict(Xtest.iloc[:, :2])

In [None]:
# Calculate the accuracy on both subsets (first two feature columns only).

# True labels of the train set versus the labels predicted for the train set
train_predictions = model.predict(Xtrain.iloc[:, :2])
print(metrics.accuracy_score(ytrain, train_predictions))

# True labels of the test set versus the labels predicted for the test set
# (the same prediction as in the fit-and-predict step)
test_predictions = model.predict(Xtest.iloc[:, :2])
print(metrics.accuracy_score(ytest, test_predictions))

In [None]:
# Look up the nearest neighbours of every test object (first two feature columns).
# The first element of the result gives the distances, the second the index of each neighbor.
model.kneighbors(Xtest.iloc[:,:2])