In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats

# Star/quasar classification

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the SDSS photometry table from disk and preview its first three rows.
csv_path = '../data/sdss_photo.csv'
data = pd.read_csv(csv_path)
data.head(n=3)

In [None]:
# Split the table into a float feature matrix and a vector of class labels.
# NOTE(review): the positional slices assume column 5 holds the class label and
# columns 6 onward hold only numeric features -- confirm against the CSV header.
dataArray = np.array(data)
# features & targets
X = dataArray[:, 6:].astype(float)
labels = dataArray[:, 5]
# Formatted into a single string so the output is the same whether `print` is
# the Python 2 statement or the Python 3 function (the old `print a, b` form is
# a syntax error on Python 3).
print('{} {}'.format(X.shape, labels.shape))

In [None]:
# Tabulate how often each class label occurs as rows of [label, count].
# scipy.stats.itemfreq is deprecated and absent from current SciPy releases;
# np.unique(..., return_counts=True) is the replacement its documentation
# recommends and yields the same sorted unique values and counts.
label_values, label_counts = np.unique(labels, return_counts=True)
yFreq = np.column_stack((label_values, label_counts))
print(yFreq)

In [None]:
# Map the string class labels onto integer codes (0, 1) in a single step.
# NOTE(review): the plots below treat code 0 as QSO and code 1 as stars --
# check le.classes_ to confirm the mapping.
le = LabelEncoder()
y = le.fit_transform(labels)

In [None]:
# Colour-colour diagram: first feature column against the second, drawn as one
# scatter series per encoded class so each gets its own colour and legend entry.
plt.figure(figsize=(9, 6))
for class_code, colour, series_label in ((0, 'g', 'QSO'), (1, 'b', 'stars')):
    in_class = (y == class_code)
    plt.scatter(X[in_class, 0], X[in_class, 1], c=colour, s=30,
                linewidths=0, edgecolors='none', label=series_label)
plt.xlim(-0.5, 3.0)
plt.ylim(-0.3, 1.4)
plt.xlabel('$u - g$')
plt.ylabel('$g - r$')
plt.legend();

In [None]:
# Pairwise scatter matrix of the four colour indices, coloured by class.
# seaborn.apionly is deprecated and no longer present in current seaborn
# releases; since seaborn 0.8 a plain `import seaborn` does not alter
# matplotlib's global style, so the regular entry point is used here.
import seaborn as sns
g = sns.pairplot(data[['u-g', 'g-r', 'r-i', 'i-z', 'target']], hue='target')
g.fig.set_size_inches(9, 6)
# The upper triangle mirrors the lower one, so hide those panels.
for i, j in zip(*np.triu_indices_from(g.axes, 1)):
    g.axes[i, j].set_visible(False)

In [None]:
# Standardize every feature column to zero mean and unit standard deviation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Perform PCA to enhance the result: project the standardized features onto
# two principal components (despite its name, pca3 keeps 2 components).
pca3 = PCA(n_components=2)
X_trans = pca3.fit_transform(X_scaled)

In [None]:
# Partition the PCA-projected samples into training and test subsets; the
# fixed seed makes the partition repeatable.
split_parts = train_test_split(X_trans, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape)

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# RBF-kernel support vector classifier, fitted on the training split and then
# used to predict the test split. Note that `clf` and `y_pred` are reassigned
# by the random-forest cell that follows.
clf = SVC(kernel='rbf', C=0.1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Random forest of 200 trees, fitted on the training split and then used to
# predict the test split. The forest is seeded (same seed as the train/test
# split) so that accuracy figures and the decision surface are reproducible
# across kernel restarts; without a seed every run grows different trees.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Hand-computed accuracy: the fraction of test samples predicted correctly
# (the mean of the boolean match array). Written as a print() call with a
# single argument so it runs unchanged on both Python 2 and Python 3.
print("%f" % np.mean(y_test == y_pred))

In [None]:
from sklearn.metrics import accuracy_score
# Library accuracy for comparison with the manual figure; keyword arguments
# make it explicit which array is the ground truth.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
# Raw confusion counts, with y_test passed as the reference labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Row-normalized version: each row is divided by its own total.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm.astype(float) / row_totals

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split.
# classification_report expects (y_true, y_pred); passing the predictions first
# swaps the precision and recall columns, so the ground truth goes first here.
print(classification_report(y_test, y_pred))

In [None]:
# Decision surface of the current classifier (`clf`) over the 2-D PCA plane,
# with the training and test samples drawn on top.
from matplotlib.colors import ListedColormap

h = 0.02  # mesh step size
# NOTE: this rebinds `cm`, which the confusion-matrix cell used for its counts.
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

# The classifier was fitted on the PCA-projected data (X_trans), and the points
# plotted below live in that same space, so the evaluation grid must be built
# from X_trans as well -- building it from the raw columns of X would evaluate
# the model at coordinates it was never trained on.
x_min, x_max = X_trans[:, 0].min() - .5, X_trans[:, 0].max() + .5
y_min, y_max = X_trans[:, 1].min() - .5, X_trans[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Prefer a signed decision score when the estimator has one; otherwise fall
# back to the predicted probability of the class encoded as 1.
if hasattr(clf, "decision_function"):
    Z = clf.decision_function(grid_points)
else:
    Z = clf.predict_proba(grid_points)[:, 1]

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8)

# Plot also the training points
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, s=200)
# and testing points
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
            alpha=0.6, s=200)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(());

In [None]:
# Scatter of the two principal components, one series per encoded class.
# The PCA output in this notebook is named X_trans (X_pca is never defined, so
# the previous version raised NameError on a fresh kernel), and both axes must
# come from that same projection rather than mixing it with raw columns of X.
# The colour-index axis labels and limits were dropped because they do not
# apply to principal-component coordinates.
plt.figure(figsize=(9, 6))
plt.scatter(X_trans[y == 0, 0], X_trans[y == 0, 1], c='g', s=30,
            linewidths=0, edgecolors='none', label='QSO')
plt.scatter(X_trans[y == 1, 0], X_trans[y == 1, 1], c='b', s=30,
            linewidths=0, edgecolors='none', label='stars')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend();