In [221]:
import numpy as np
from keras.optimizers import Adagrad
import matplotlib.pyplot as plt

from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

from sklearn.model_selection import cross_validate

In [182]:
nb_of_passband = 6
dataset_folder = "../all"
!ls {dataset_folder}

data_note.pdf	       test_set_metadata.csv  training_set_metadata.csv
sample_submission.csv  test_set_sample.csv
test_set.csv	       training_set.csv


In [22]:
train = pd.read_csv(f"{dataset_folder}/training_set.csv")
train.head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [23]:
metadata = pd.read_csv(f"{dataset_folder}/training_set_metadata.csv")
metadata.head(5)

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [37]:
train.mjd = train.mjd.astype(np.int32)
train.mjd = train.mjd - train.mjd.min()
train.head(10)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,170,2,-544.810303,3.622952,1
1,615,170,1,-816.434326,5.55337,1
2,615,170,3,-471.385529,3.801213,1
3,615,170,4,-388.984985,11.395031,1
4,615,172,2,-681.858887,4.041204,1
5,615,172,1,-1061.457031,6.472994,1
6,615,172,3,-524.95459,3.552751,1
7,615,172,4,-393.480225,3.599346,1
8,615,172,5,-355.88678,10.421921,1
9,615,187,2,-548.01355,3.462291,1


In [36]:
max_mjd = train.mjd.max()
max_mjd

1094

In [57]:
times = train.groupby(["object_id", "passband"]).apply(lambda row: row.mjd.values).reset_index().rename(columns={0: 'times'})
times.head(5)

Unnamed: 0,object_id,passband,times
0,615,0,"[239, 240, 241, 242, 243, 271, 294, 295, 296, ..."
1,615,1,"[170, 172, 187, 190, 199, 202, 217, 220, 227, ..."
2,615,2,"[170, 172, 187, 190, 199, 202, 217, 220, 227, ..."
3,615,3,"[170, 172, 187, 190, 199, 202, 217, 220, 227, ..."
4,615,4,"[170, 172, 187, 190, 199, 202, 217, 220, 227, ..."


In [122]:
features_set = pd.read_csv("../data-transformation/feature_set.csv", index_col=[0, 1])
features_set.head(10)

Unnamed: 0,Unnamed: 1,amplitude,maximum,median,med_err,minimum,avg_err,mean,weighted_average
615,0,121.048016,125.182808,-10.015225,3.86638,-116.913223,3.823448,-3.254554,-17.061118
615,1,880.533203,660.626343,-488.057968,4.980931,-1100.440063,5.029014,-385.699911,-212.397193
615,2,646.921722,611.984558,-265.686005,3.387887,-681.858887,3.380299,-134.146566,-102.220639
615,3,488.190826,445.737061,-162.170945,3.388196,-530.644592,3.340116,-121.103501,-101.206639
615,4,402.069122,381.953735,-103.541367,3.625775,-422.184509,3.798761,-55.954592,-54.744845
615,5,400.501618,378.188141,-85.524307,7.310321,-422.815094,7.636021,-47.449847,-59.688379
713,0,14.622504,14.509829,-3.096805,2.273303,-14.735178,2.3314,-2.720398,-3.500958
713,1,10.422385,9.129021,-0.561735,1.115409,-11.715749,1.416982,-1.019804,-1.322397
713,2,10.29848,10.529041,-0.117976,1.062866,-10.067919,1.192936,-0.794238,-1.030469
713,3,11.862454,11.330316,-0.073896,1.629944,-12.394593,1.640155,-0.986966,-1.382941


In [236]:
def fill_missing_values(times, features_set, max_times=max_mjd, step=1):
    timeset = set(np.arange(0, max_times, step))
    X = []
    for i, row in times.iterrows():
        missing = timeset - set(row.times)
        features = features_set.loc[row.object_id, row.passband]
        for t in missing:
            X.append([row.object_id, t, row.passband, features["mean"], features["avg_err"], 1])
    return pd.DataFrame(columns=["object_id", "mjd", "passband", "flux", "flux_err", "detected"], data=X)

In [None]:
missing_data = fill_missing_values(times, features_set, max_mjd)
missing_data.head()

In [170]:
X_train = train.append(missing_data, ignore_index=True)

In [171]:
X_train.sort_values(by="mjd", inplace=True)

In [211]:
def format_time_series():
    X = []
    X_err = []
    for i, row in metadata[["object_id", "target"]][:1].iterrows():
        print(row.object_id)
        ds = X_train[X_train.object_id==row.object_id]
        x, x_err = [], []
        for p in range(nb_of_passband):
            x = np.append(x, ds[ds.passband==p].flux.values)
            x_err = np.append(x_err, ds[ds.passband==p].flux_err.values)
        X.append(x.reshape(max_mjd, nb_of_passband))
        X_err.append(x.reshape(max_mjd, nb_of_passband))
    X = np.array(X)
    X_err = np.array(X_err)
    return X, X_err

In [214]:
X_ts, X_ts_err = format_time_series()
y = metadata.target.values
print(X_ts.shape)
print(X_ts_err.shape)
print(y.shape)

615
(1, 1094, 6)
(1, 1094, 6)
(7848,)


In [213]:
X_ts /= (X_ts_err + 1) # +1 to avoid zero division

In [210]:
X_ts = TimeSeriesScalerMinMax().fit_transform(X_ts)
X_ts

array([[[1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        ],
        [1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        ],
        [1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        ],
        ...,
        [0.26255672, 0.13936003, 0.1691817 , 0.39488098, 0.22270791,
         0.26022717],
        [0.26255672, 0.13936003, 0.1691817 , 0.39488098, 0.22270791,
         0.26022717],
        [0.26255672, 0.13936003, 0.1691817 , 0.39488098, 0.22270791,
         0.26022717]]])

In [218]:
y

array([92, 88, 42, ..., 16, 65,  6])

In [220]:
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=X_ts.shape[0],
                                                       ts_sz=X_ts.shape[1],
                                                       n_classes=len(set(y)),
                                                       l=0.1,
                                                       r=2)
shapelet_sizes

{109: 4, 218: 4}

In [219]:
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=50,
                        verbose_level=0)

In [235]:
def get_best(model, X, y):
    cv_result = cross_validate(model, X, y, cv=10,return_estimator=True, scoring="neg_log_loss")
    best_index = np.argmax(cv_result["test_score"])
    print(f"Log Error: {-cv_result['test_score'][best_index]}")
    return cv_result["estimator"][best_index]

In [234]:
shp_clf = get_best(shp_clf, X_ts, y)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.