In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
training_set_metadata = pd.read_csv("all/training_set_metadata.csv")
training_set_metadata.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [4]:
training_set = pd.read_csv("all/training_set.csv")
training_set.describe()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
count,1421705.0,1421705.0,1421705.0,1421705.0,1421705.0,1421705.0
mean,33926080.0,60179.21,2.783108,24.68855,25.48932,0.1666168
std,42584390.0,309.2379,1.703946,3684.506,3814.492,0.3726336
min,615.0,59580.03,0.0,-1149388.0,0.463753,0.0
25%,184340.0,59899.05,1.0,-2.789418,2.11867,0.0
50%,4548783.0,60193.31,3.0,0.733199,4.708105,0.0
75%,67962800.0,60487.22,4.0,8.830932,12.97371,0.0
max,130779800.0,60674.36,5.0,2432809.0,2234069.0,1.0


In [6]:
targets = training_set_metadata.target.unique()
n_classes = targets.size # number of known classes
n_classes

14

In [2]:
class TWED:
    def __init__(self, lamda, stiffness, p = 1):
        self.lamda = lamda
        self.stiffness = stiffness
        self.p = p
    
    def lp_norm(self, a, b):
        c = np.abs(a - b)
        return np.power(c, self.p).sum() ** (1/self.p)
    
    def similarity(self, A, B, time_stamp_A, time_stamp_B):
        seq_A = np.append([0], A)
        seq_B = np.append([0], B)
        ts_A = np.append([0], time_stamp_A)
        ts_B = np.append([0], time_stamp_B)
        n = len(ts_A)
        m = len(ts_B)
        twed = np.full((n, m), np.inf)
        twed[0, 0] = 0
        for i in range(1, n):
            for j in range(1, m):
                del_A = (twed[i-1, j] + 
                         self.lp_norm(seq_A[i-1], seq_A[i]) + 
                         self.stiffness * self.lp_norm(ts_A[i], ts_A[i-1]) + 
                         self.lamda)
                del_B = (twed[i, j-1] + 
                         self.lp_norm(seq_B[j], seq_B[j-1]) + 
                         self.stiffness * self.lp_norm(ts_B[j], ts_B[j-1]) + 
                         self.lamda)
                match = (twed[i-1, j-1] +
                         self.lp_norm(seq_A[i], seq_B[j]) +
                         self.stiffness * self.lp_norm(ts_A[i], ts_B[j]) + 
                         self.lp_norm(seq_A[i-1], seq_B[j-1]) + 
                         self.stiffness * self.lp_norm(ts_A[i-1], ts_B[j-1]))
                
                twed[i, j] = np.min([del_A, del_B, match])
        return twed[n-1, m-1]

In [64]:
def preprocessing(aDataFrame):
    object_ids = aDataFrame.object_id.unique()
    df = pd.concat([aDataFrame[aDataFrame.object_id == oid] for oid in object_ids], ignore_index=True)
    flux_mean = df.flux.mean()
    flux_std = df.flux.std()
    normalized_df = df.copy()
    normalized_df["flux"] = df.flux / df.flux_err
    normalized_df["flux"] = (df.flux - flux_mean) / flux_std
    return normalized_df

In [65]:
normalized_df = preprocessing(training_set)

In [69]:
sample_targets = targets[:4]
oids = pd.concat(
    [training_set_metadata[training_set_metadata.target==t][["object_id", "target"]][:5] for t in sample_targets]
)
sample_data = pd.concat([normalized_df[normalized_df.object_id == oid] for oid in oids.object_id], ignore_index=True)
sample_data.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-0.154566,3.622952,1
1,615,59750.4306,1,-0.228286,5.55337,1
2,615,59750.4383,3,-0.134638,3.801213,1
3,615,59750.445,4,-0.112274,11.395031,1
4,615,59752.407,2,-0.191762,4.041204,1


In [73]:
oids.head()

Unnamed: 0,object_id,target
0,615,92
69,12695,92
150,26161,92
156,26783,92
162,28391,92


In [50]:
twed = TWED(lamda=1, stiffness=1)

In [78]:
def twed_to_a_class(twed, aSeries, class_df):
    oids = class_df.object_id.unique()
    tweds = np.array([twed.similarity(
                aSeries, 
                class_df[class_df==oid].flux.values, 
                aSeries.index, 
                class_df[class_df==oid].mjd.values) for oid in oids])
    return tweds.mean()

In [80]:
aSeries = normalized_df[normalized_df.object_id==615].object_id
aSeries.index = normalized_df[normalized_df.object_id==615].mjd.values
oid_in_class = oids[oids.target==92].object_id.values[:5]
class_df = pd.concat([normalized_df[normalized_df.object_id==i] for i in oid_in_class])
twed_to_a_class(twed, aSeries, class_df)

nan