In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
!pip install tslearn
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from keras.optimizers import Adagrad

from keras.utils import to_categorical

from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics as metrics

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
nb_of_passband = 6
dataset_folder = "../input/PLAsTiCC-2018"
!ls {dataset_folder}

In [None]:
metadata = pd.read_csv(f"{dataset_folder}/training_set_metadata.csv")
metadata.head(5)

In [None]:
X_train = pd.read_csv("../input/completed-lc/completed_light_curves.csv")
X_train.head()

In [None]:
X_train.sort_values(by="mjd", inplace=True)

In [None]:
max_mjd = X_train.mjd.max()
max_mjd

In [None]:
step = 2
nb_of_passband=6

In [None]:
times = np.arange(0, max_mjd+1, step)
X_train = X_train[X_train.mjd.isin(times)]

In [None]:
def format_time_series(data):
    X = []
    X_err = []
    for i, row in metadata[["object_id", "target"]].iterrows():
#         print(row.object_id)
        ds = data[data.object_id==row.object_id]
        s0 = int(ds.shape[0] / nb_of_passband)
        x, x_err = [], []
        for p in range(nb_of_passband):
            x = np.append(x, ds[ds.passband==p].flux.values)
            x_err = np.append(x_err, ds[ds.passband==p].flux_err.values)
        X.append(x.reshape(s0, nb_of_passband))
        X_err.append(x.reshape(s0, nb_of_passband))
    X = np.array(X)
    X_err = np.array(X_err)
    return X, X_err

In [None]:
X_ts, X_ts_err = format_time_series(X_train)
print(X_ts.shape)
print(X_ts_err.shape)

In [None]:
y = metadata.target.values
print(y.shape)

In [None]:
X_ts /= (X_ts_err + 1) # +1 to avoid zero division
X_ts = np.nan_to_num(X_ts)
X_ts = TimeSeriesScalerMinMax().fit_transform(X_ts)
X_ts

In [None]:
y

In [None]:
num_classes = np.unique(y).shape[0]
num_classes

In [None]:
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=X_ts.shape[0],
                                                       ts_sz=X_ts.shape[1],
                                                       n_classes=num_classes,
                                                       l=0.1,
                                                       r=2)
shapelet_sizes

In [None]:
def get_base_model():
    return ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        batch_size=128,
                        max_iter=20,
                        verbose_level=1)

In [24]:
def compute_score(y_true, y_pred):
    return metrics.log_loss(y_true=y_true, y_pred=y_pred)

In [None]:
def get_best(X, y):
    skf = StratifiedKFold(n_splits=4)
    splits = skf.split(X, y)
    split = 0
    best_score = np.finfo(np.float64).max
    best_model = None
    splits = skf.split(X, y)
    model = get_base_model()
    for train_index, test_index in splits:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        score = compute_score(y_test, model.predict_proba(X_test))
        if score < best_score:
            best_score = score
            best_model = model
        print(f"Split-{split} score is:{score}")
        split += 1
    print(f"BEST SCORE: {best_score}")
    return best_model

In [None]:
# shp_clf = get_base_model().fit(X_ts, y)

In [None]:
shp_clf = get_best(X_ts, y)

In [None]:
pred_probas = shp_clf.predict_proba(X_ts)

In [None]:
preds = shp_clf.predict(X_ts)

categorical_accuracy: 0.3210 - categorical_crossentropy: 2.0561  
1964/1964 [==============================] - 1s 705us/step  
Split-1 score is:2.061876335903972