# 3. hands-on session: **Classification problem: from *Data* to *Inference***

## **Contents**

1. Preprocess the data
1. Select features & reduce dimensions
1. Find best hyperparameters
1. Cross-validate
1. Compare classifiers
1. Combine classifiers
1. Predict

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
!pip install corner
import corner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## **Our dataset**

SDSS data of point sources: stars vs Quasi-stellar objects

<img src="https://cdn.mos.cms.futurecdn.net/HgaCHZDNppE6e52yeDACo6-970-80.jpg.webp" height=200>

<img src="https://earthsky.org/upl/2021/01/supermassive-black-hole-artist-e1610556964639.jpg" height=200 align=right>



In [19]:
!wget -c "https://drive.google.com/uc?id=1IoQfGFo13ZP2wTyp-xvzQvguPYhE8TWB" -O "sdss_photo.csv"

--2022-10-11 18:04:49--  https://drive.google.com/uc?id=1IoQfGFo13ZP2wTyp-xvzQvguPYhE8TWB
Resolving drive.google.com (drive.google.com)... 173.194.218.102, 173.194.218.113, 173.194.218.139, ...
Connecting to drive.google.com (drive.google.com)|173.194.218.102|:443... connected.
HTTP request sent, awaiting response... 416 Requested range not satisfiable

    The file is already fully retrieved; nothing to do.



In [20]:
data = pd.read_csv("sdss_photo.csv")

## **Data preprocessing**

### What to do with the data:
- look at the data
- check data quality
- understand the data
- think of selection biases

In [21]:
data

Unnamed: 0,u,g,r,i,z,target
0,19.240999,17.525999,16.840000,16.613001,16.492001,star
1,19.732000,18.339001,17.767000,17.580000,17.481001,star
2,16.120001,15.232000,14.933000,14.845000,14.827000,star
3,18.995001,18.150999,17.877001,17.747000,17.766001,star
4,19.032000,18.112000,17.848000,17.709999,17.712000,star
...,...,...,...,...,...,...
862,19.247000,19.047001,18.663000,18.627001,18.671000,QSO
863,26.433001,21.429001,20.261000,20.108000,20.073000,QSO
864,20.535999,20.200001,20.292999,19.910000,19.726000,QSO
865,20.627001,20.583000,20.270000,20.229000,20.190001,QSO


In [22]:
data.describe().round(2)

Unnamed: 0,u,g,r,i,z
count,867.0,867.0,867.0,867.0,867.0
mean,19.29,18.17,17.72,17.54,17.45
std,1.39,1.33,1.35,1.37,1.38
min,15.67,14.36,14.12,13.95,13.89
25%,18.5,17.29,16.74,16.54,16.45
50%,19.33,18.33,17.88,17.69,17.62
75%,20.02,19.09,18.76,18.61,18.52
max,26.43,25.17,21.14,20.72,20.91


In [23]:
sum(data.target== "star"), sum(data.target == "QSO")

(655, 212)

#### task 1: **create `X` and `y`**

```python
data[["u","g","r","i","z"]] -> X
data.target -> y
"QSO" -> 0
"star" -> 1
```


In [25]:
filters = ["u","g","r","i","z"]
X = data[filters]
X

Unnamed: 0,u,g,r,i,z
0,19.240999,17.525999,16.840000,16.613001,16.492001
1,19.732000,18.339001,17.767000,17.580000,17.481001
2,16.120001,15.232000,14.933000,14.845000,14.827000
3,18.995001,18.150999,17.877001,17.747000,17.766001
4,19.032000,18.112000,17.848000,17.709999,17.712000
...,...,...,...,...,...
862,19.247000,19.047001,18.663000,18.627001,18.671000
863,26.433001,21.429001,20.261000,20.108000,20.073000
864,20.535999,20.200001,20.292999,19.910000,19.726000
865,20.627001,20.583000,20.270000,20.229000,20.190001


In [33]:
# Boolean mask -> integers: rows whose target is "star" become 1, all other
# rows (the QSOs in this table) become 0.
y = np.array(data["target"] == "star").astype(int)
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [35]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# fit_transform(...) is shorthand for the two-step form:
#   le.fit(data.target)
#   y = le.transform(data.target)
# The encoder is kept in `le` so that integer labels can be mapped back to
# class names later with le.inverse_transform(...).
y = le.fit_transform(data.target)
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Build the feature table X from the magnitude columns.
# The original selection kept only columns whose name contains "mag" and then
# took the part after the underscore as the band name (presumably written for
# a catalogue export with "<something>mag_<band>"-style headers -- TODO confirm).
# The file loaded above uses the bare band names u, g, r, i, z, for which that
# filter matches nothing and would silently leave X with zero columns, so fall
# back to the band names in that case.
cols = [c for c in data.columns if "mag" in c]
if cols:
    X = data[cols].copy()
    X.columns = [c.split("_")[1] for c in cols]
else:
    cols = ["u", "g", "r", "i", "z"]
    X = data[cols].copy()

from sklearn.preprocessing import LabelEncoder

# Encode the class names as integers; `le` is kept so predictions can be
# converted back to names with le.inverse_transform(...).
le = LabelEncoder()
le.fit(data["target"])
y = le.transform(data["target"])

In [None]:
X

#### task 2: **plot the data**

In [None]:
# Split the feature table by class using the integer labels in y:
# label 0 selects the QSO subset, label 1 the star subset.
X_QSO = X[y == 0]
X_star = X[y == 1]

In [None]:
# Corner plot of the QSO subset (colour "C0") ...
fig = corner.corner(X_QSO, color="C0")
# ... with the star subset (colour "C1") drawn onto the same figure.
# The trailing semicolon suppresses the repr of the returned object.
corner.corner(X_star, fig=fig, color="C1");

#### task 3: **classify with SVC & test score**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

model = SVC(kernel="linear")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
# Accuracy by hand: the mean of the element-wise "prediction == truth" mask
# is the fraction of correctly classified test objects.
np.mean(y_pred == y_test)

In [None]:
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, y_pred, digits=3))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, digits=3))

#### task 4: **rescale the data -> `X_scaled` & test score**

In [None]:
# Standardise each band separately: subtract the column mean and divide by the
# column standard deviation (np.std, i.e. ddof=0).
X_scaled = X.apply(lambda band: (band - np.mean(band)) / np.std(band))

In [None]:
X_scaled.describe().round(2)

In [None]:
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)
X_scaled.describe().round(2)

In [None]:
from sklearn.pipeline import make_pipeline

# Chain the scaler and the classifier: fitting the pipeline fits the scaler on
# the training subset and the same scaling is re-applied to whatever is scored.
model = make_pipeline(StandardScaler(),
                      SVC(kernel="linear"))

model.fit(X_train, y_train)

# The task asks for the *test* score, so evaluate on the held-out subset
# (scoring on X_train would only report how well the training data is fitted).
model.score(X_test, y_test)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Scaler + linear SVC in one estimator: the scaler is fitted on the training
# subset only and re-applied automatically when predicting or scoring.
model = make_pipeline(StandardScaler(),
                      SVC(kernel="linear"))

model.fit(X_train, y_train)

# Report accuracy on the held-out test subset, as the task asks for the test
# score; the training-set score does not measure generalisation.
model.score(X_test, y_test)

## **Feature selection & dimensionality reduction**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
clf = ExtraTreesClassifier(random_state=42).fit(X,y)
clf.feature_importances_

In [None]:
# One bar per feature column; derive the count from X instead of hard-coding 5
# so the plot stays correct if the feature set changes.
plt.bar(np.arange(len(X.columns)), clf.feature_importances_, 0.5)
plt.xticks(np.arange(len(X.columns)), X.columns);

#### task 5: **calculate spectral indices -> `X_new` & test importance**

In [None]:
X_new = X.copy()
X_new["new"] = np.ones(len(X_new.u))
X_new

In [None]:
X_new = X.copy()
X_new["u-g"] = X.u - X.g
X_new["u-r"] = X.u - X.r
X_new["u-z"] = X.u - X.z
X_new["i-z"] = X.i - X.z

X_new

In [None]:
clf = ExtraTreesClassifier(random_state=42).fit(X_new,y)
plt.bar(np.arange(len(X_new.columns)), clf.feature_importances_, 0.5)
plt.xticks(np.arange(len(X_new.columns)), X_new.columns);

In [None]:
X_new = X.copy()
X_new["u-g"] = X.u - X.g
X_new["u-r"] = X.u - X.r
X_new["u-z"] = X.u - X.z
X_new["i-z"] = X.i - X.z

clf = ExtraTreesClassifier(random_state=42).fit(X_new,y)
plt.bar(np.arange(len(X_new.columns)), clf.feature_importances_, 0.5)
plt.xticks(np.arange(len(X_new.columns)), X_new.columns);

#### task 6: **test score for *u-r* spectral index**

In [None]:
# Use the single column "u-r" of X_new as the only feature
# (double brackets keep it a one-column DataFrame).
X_train, X_test, y_train, y_test = train_test_split(X_new[["u-r"]], y, 
                                                    stratify=y, 
                                                    random_state=42)

# NOTE: `model` is not defined in this cell; it is whichever estimator was
# assigned last by an earlier cell and is re-fitted here on the new split.
model.fit(X_train, y_train)

model.score(X_test, y_test)

#### task 7: **create dummy column & test importance**

In [None]:
X_new3 = X.copy()

# "dummy": standard-normal noise that carries no information about the class.
# A seeded generator makes the column (and everything derived from it)
# identical on every run, in line with the fixed seeds used elsewhere.
X_new3["dummy"] = np.random.default_rng(42).normal(0, 1, size=len(X.r))
# "dummy2": a constant column of ones.
X_new3["dummy2"] = np.ones_like(X.r)

In [None]:
# Fixed random_state, as in the other ExtraTreesClassifier cells, so the
# importances shown in the plot are reproducible between runs.
clf = ExtraTreesClassifier(random_state=42).fit(X_new3,y)
plt.bar(np.arange(len(X_new3.columns)), clf.feature_importances_, 0.5)
plt.xticks(np.arange(len(X_new3.columns)), X_new3.columns);

In [None]:
X_new3 = X.copy()

# Two uninformative control features: seeded standard-normal noise and a
# constant column of ones. Seeding keeps the result reproducible.
X_new3["dummy"] = np.random.default_rng(42).normal(0, 1, size=len(X.r))
X_new3["dummy2"] = np.ones_like(X.r)

# Fixed random_state, consistent with the other ExtraTreesClassifier cells.
clf = ExtraTreesClassifier(random_state=42).fit(X_new3,y)
plt.bar(np.arange(len(X_new3.columns)), clf.feature_importances_, 0.5)
plt.xticks(np.arange(len(X_new3.columns)), X_new3.columns);

In [None]:
# Train on the random-noise "dummy" column alone, as a sanity check of what a
# feature without class information achieves.
# NOTE: this split uses random_state=0, unlike the 42 used in the other cells.
X_train, X_test, y_train, y_test = train_test_split(X_new3[["dummy"]], y, 
                                                    stratify=y, 
                                                    random_state=0)

# `model` is the estimator left over from an earlier cell, re-fitted here.
model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
sum(data["target"] == "star") / len(y), sum(data["target"] == "QSO") / len(y)

### **Principal component analysis**

In [None]:
from sklearn.decomposition import PCA

# Reduce the feature table to its first three principal components;
# X_n is the transformed data with one column per component.
pca = PCA(n_components=3)
X_n = pca.fit_transform(X)

In [None]:
# Two point clouds share the axes, so label them; otherwise the figure cannot
# be read on its own. The trailing semicolon hides the repr of the return value.
plt.plot(X.u, X.g, ".", label="original: u vs g")
plt.plot(X_n[:,0], X_n[:,1], ".", label="PCA: component 1 vs component 2")
plt.legend();

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_n, y, 
                                                    stratify=y, 
                                                    random_state=42)

model.fit(X_train, y_train)

model.score(X_test, y_test)

## **Tune hyperparameters**

In [None]:
SVC?

#### task 8: **find SVC hyperparameters with best test score**

In [None]:
def classify(X, y, classifier):
    """Fit ``classifier`` on standardised features and print its test accuracy.

    The classifier is wrapped in a pipeline behind a ``StandardScaler``, the
    data are split into stratified train/test subsets with a fixed seed
    (``random_state=42``), the pipeline is fitted on the training subset and
    the score on the test subset is printed, rounded to three decimals.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``
    y : class labels, also used for stratification
    classifier : unfitted estimator placed at the end of the pipeline

    Returns
    -------
    None
        The score is printed, not returned.
    """
    model = make_pipeline(StandardScaler(), classifier)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42
    )

    model.fit(X_train, y_train)

    # Built-in round() works for numpy scalars and for plain Python floats
    # alike; a plain float has no .round() method, so calling it on the score
    # fails whenever score() returns a built-in float.
    print(round(model.score(X_test, y_test), 3))

In [None]:
classify(X, y, SVC(kernel="linear"))

In [None]:
classify(X, y, SVC(kernel="linear", C=10))

In [None]:
classify(X, y, SVC(kernel="linear", C=0.1))

In [None]:
classify(X, y, SVC(kernel="poly", degree=1))

In [None]:
classify(X, y, SVC(kernel="rbf", C=100))

## **Crossvalidate**

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# RBF-kernel SVC applied directly to X (no scaling step in this cell)
model = SVC(kernel="rbf", C=1000)

# 10-fold cross-validation; `res` is a dict that includes the per-fold test scores
res = cross_validate(model, X, y, cv=10)

# mean and standard deviation of the test score over the folds
np.mean(res["test_score"]), np.std(res["test_score"])

### **Grid-search + crossvalidation**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
clf = SVC()
# Search grid: every combination of the C and kernel values listed below.
# ("gamma" : ["scale", "auto"] is deliberately left out of the grid here.)
params = {"C" : [0.01, 0.1, 1, 10, 100],
          "kernel" : ["linear", "poly", "rbf"]}
# Each grid point is evaluated with 5-fold cross-validation, using 8 parallel jobs.
model = GridSearchCV(clf, params, cv=5, n_jobs=8)
model.fit(X, y)

In [None]:
model.cv_results_

In [None]:
# Pull the tried parameter combinations and their mean cross-validated test
# scores out of the grid-search results.
# NOTE: this rebinds `params`, which until now held the search-grid dict.
params, score = model.cv_results_["params"], model.cv_results_["mean_test_score"]

# argsort is ascending, so the best-scoring combination is printed last
indices = np.argsort(score)

for i in indices:
    print(params[i], score[i].round(3))

In [None]:
model.best_estimator_

## **Compare classifiers**

In [None]:
from sklearn.neural_network import MLPClassifier # multi-layer perceptron classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Candidate classifiers to compare. The stochastic ones get a fixed
# random_state so that the printed scores are reproducible between runs
# (the stacking cells further down already seed the MLP the same way).
classifiers = [MLPClassifier(max_iter=1000, random_state=42),
               GaussianNB(),
               DecisionTreeClassifier(random_state=42),
               KNeighborsClassifier(),
               SVC(kernel="rbf", C=10)]

# classify() prints one test score per classifier, in list order
for classifier in classifiers:
    classify(X, y, classifier)

In [None]:
clf = MLPClassifier(max_iter=1000)
params = {"hidden_layer_sizes" : [5, 10, 50, 100],
          "activation" : ["identity", "logistic", "tanh", "relu"],
          "solver" : ["sgd", "adam"]}
model = GridSearchCV(clf, params, cv=5, n_jobs=8)
model.fit(X, y)

In [None]:
model.best_estimator_, model.best_score_

In [None]:
%time MLPClassifier(hidden_layer_sizes=10, max_iter=1000, solver='sgd').fit(X_train, y_train).score(X_test, y_test)

In [None]:
%time SVC(C=10).fit(X_train, y_train).score(X_test, y_test)

## **Ensemble methods**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier()

res = cross_validate(model, X, y, cv=10)
np.mean(res["test_score"]), np.std(res["test_score"])

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Base estimators for the stack, each given as a (name, estimator) pair
classifiers = [("MLP", MLPClassifier(max_iter=1000, random_state=42)),
               ("Bayes", GaussianNB()),
               ("RFC", RandomForestClassifier()),
               ("KNN", KNeighborsClassifier()),
               ("SVC", SVC(C=10))]

# Only the base estimators are passed; all other StackingClassifier settings
# are left at their defaults.
model = StackingClassifier(classifiers)

# 10-fold cross-validation directly on X (no scaling step in this cell)
res = cross_validate(model, X, y, cv=10)
np.mean(res["test_score"]), np.std(res["test_score"])

In [None]:
# Base estimators for the stack, each given as a (name, estimator) pair
classifiers = [("MLP", MLPClassifier(max_iter=1000, random_state=42)),
               ("Bayes", GaussianNB()),
               ("DTC", DecisionTreeClassifier()),
               ("KNN", KNeighborsClassifier()),
               ("SVC", SVC(C=10))]

# combine the base estimators into one stacked classifier (not fitted yet)
clf = StackingClassifier(classifiers)

# put a StandardScaler in front so the scaling is fitted inside every fold
model = make_pipeline(StandardScaler(),
                      clf)

# 10-fold cross-validation of the whole pipeline: mean and spread of the test score
res = cross_validate(model, X, y, cv=10)
np.mean(res["test_score"]), np.std(res["test_score"])

## **Conclusion**

In [None]:
# Base estimators for the stack, each given as a (name, estimator) pair
classifiers = [("MLP", MLPClassifier(max_iter=1000, random_state=42)),
               ("Bayes", GaussianNB()),
               ("DTC", DecisionTreeClassifier()),
               ("KNN", KNeighborsClassifier()),
               ("SVC", SVC(C=10))]

# combine the base estimators into one stacked classifier (not fitted yet)
clf = StackingClassifier(classifiers)

# scaler + stacked classifier as a single estimator
model = make_pipeline(StandardScaler(),
                      clf)

# Final model: fitted on all rows, using the single column "u-g" as feature
model.fit(X_new[["u-g"]], y)

In [None]:
# Magnitudes of the object to classify
u = 15.914
g = 15.500

# The model was fitted on a one-column DataFrame named "u-g". Passing the new
# sample as a DataFrame with the same column name (rather than a bare array)
# keeps the feature names consistent and avoids scikit-learn's
# "X does not have valid feature names" warning.
pred = model.predict(pd.DataFrame({"u-g": [u - g]}))

# Map the integer label back to the original class name
le.inverse_transform(pred)