## Sketch an end-to-end pipeline to run classification models on new data and produce predictions.
1. Get the light curve
    1. get light curve from DR2 set
        1. set parameters
        1. form url using ZTF API format
        1. query URL
    1. get light curve from Xiao Dian's dataset
        1. set parameters
        1. form URL
        1. query URL
    1. plot light curve per filter
1. Calculate features
    1. Take light curve
    1. calculate all features
    1. return all features
1. Function to run all the models on the data
    1. Load all models
    1. pass feature data to all models and get prediction probabilities
    1. Return prediction probabilities for each classifier



In [1]:
#Load libraries
%matplotlib inline
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pickle # allows to save differnt trained models of the same classifier object
import seaborn as sns

In [None]:
def query_lightcurve_XD(SourceID, variable_type=None, ID=None):
    """
    Download the light curve of a single source from Xiao Dian's website.

    Parameters
    ----------
    SourceID : int or str
        Identifier of the source; it is appended to the query URL.
    variable_type : optional
        When given, it is stored in a constant 'Type' column of the result.
    ID : optional
        When given, it is stored in a constant 'ID' column of the result.

    Returns
    -------
    pandas.DataFrame
        The CSV response parsed by pandas, or an empty DataFrame when the
        download or the parsing fails (best-effort behaviour).
    """
    url = 'http://variables.cn:88/seldataz.php?SourceID=' + str(SourceID)
    try:
        lc = pd.read_csv(url, header='infer')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Network failures (URLError/HTTPError are OSError subclasses) and
        # unparsable responses yield an empty light curve instead of a crash.
        return pd.DataFrame()

    # The labels used to be read from undefined globals inside the try block,
    # so a NameError silently discarded every successful download. They are
    # now explicit, optional arguments.
    if variable_type is not None:
        lc['Type'] = variable_type
    if ID is not None:
        lc['ID'] = ID
    return lc

In [None]:
def query_lightcurve_DR(parameters):
    """
    Download a light curve from the ZTF DR2 set (not implemented yet).

    Planned steps, per the outline at the top of this notebook: set the query
    parameters, form the URL using the ZTF API format, and query that URL.

    Parameters
    ----------
    parameters
        Query parameters for the request; the expected structure has not
        been defined yet.

    Raises
    ------
    NotImplementedError
        Always, until the DR2 query is written.
    """
    raise NotImplementedError('query_lightcurve_DR is not implemented yet')

In [2]:
def weighted_mean(mag, mag_err):
    """
    Inverse-variance weighted mean of a set of magnitudes.

    Each magnitude is weighted by 1 / mag_err**2. The computation is purely
    positional: building the weights as a new Series with a fresh 0..n-1
    index made `mag * w` align on index labels, which gives wrong results
    for any `mag` whose index is not 0..n-1 (e.g. a band-filtered subset of
    a light curve).

    Parameters
    ----------
    mag : array-like
        Magnitudes.
    mag_err : array-like
        Magnitude uncertainties, same length as `mag`.

    Returns
    -------
    float
        The weighted mean, or NaN when no point has a finite magnitude and a
        finite, non-zero uncertainty.
    """
    values = np.asarray(mag, dtype=float)
    errors = np.asarray(mag_err, dtype=float)

    # Skip points that cannot contribute a finite weight or value.
    usable = np.isfinite(values) & np.isfinite(errors) & (errors != 0)
    if not usable.any():
        return np.nan

    weights = 1.0 / (errors[usable] * errors[usable])
    return np.sum(values[usable] * weights) / np.sum(weights)

In [3]:
# welsh J, K statistics
def welsh_staton(mag_series, wmean):
    """
    Compute the J and K variability statistics of a magnitude series.

    Parameters
    ----------
    mag_series : array-like
        Magnitudes, in observation order.
    wmean : float
        Reference (weighted) mean subtracted from every magnitude.

    Returns
    -------
    (J, K) : tuple of float
        J is the sum of sign(P) * sqrt(|P|) over the products P of each
        scaled residual with the next one (the last residual is paired
        with 0). K is sum(|d| / N) * sqrt(sum(d**2) / N) over the scaled
        residuals d. Both are NaN when fewer than two points are given,
        because the N / (N - 1) scaling is undefined there.

    NOTE(review): this K multiplies the mean absolute residual by the RMS
    residual, and the residuals are scaled by N/(N-1) without dividing by
    the photometric errors; the published Welch-Stetson indices appear to
    differ on both points. The arithmetic is kept as-is because previously
    trained models may depend on it - TODO confirm.
    """
    values = np.asarray(mag_series, dtype=float)
    N = values.size
    if N < 2:
        # Previously raised ZeroDivisionError.
        return np.nan, np.nan

    d_i = N / (N - 1) * (values - wmean)
    # Pair every residual with its successor; the last one is paired with 0.
    d_next = np.append(d_i[1:], 0.0)
    products = d_i * d_next
    J = np.sum(np.sign(products) * np.sqrt(np.abs(products)))

    mean_abs_terms = np.abs(d_i) / N
    rms = np.sqrt(1 / N * np.sum(d_i * d_i))
    K = np.sum(mean_abs_terms * rms)
    return J, K

In [4]:
def _band_features(band_lc, prefix):
    """Return the 12 summary statistics of one band as a {column: value} dict."""
    names = ['mean', 'wmean', 'MAD', 'IQR', 'f60', 'f70', 'f80', 'f90',
             'skew', 'kurtosis', 'welsh_J', 'welsh_K']
    feats = {prefix + '_' + name: np.nan for name in names}
    if len(band_lc) <= 1:
        # Not enough points in this band: every statistic stays NaN.
        return feats

    mag = band_lc['mag']
    wmean = weighted_mean(mag, band_lc['e_mag'])
    stat_first, stat_second = welsh_staton(mag, wmean)

    feats[prefix + '_mean'] = mag.mean()
    feats[prefix + '_wmean'] = wmean
    feats[prefix + '_MAD'] = abs(mag - mag.median()).median()
    feats[prefix + '_IQR'] = mag.quantile(0.75) - mag.quantile(0.25)
    feats[prefix + '_f60'] = mag.quantile(0.80) - mag.quantile(0.2)
    feats[prefix + '_f70'] = mag.quantile(0.85) - mag.quantile(0.15)
    feats[prefix + '_f80'] = mag.quantile(0.9) - mag.quantile(0.10)
    feats[prefix + '_f90'] = mag.quantile(0.95) - mag.quantile(0.05)
    feats[prefix + '_skew'] = mag.skew()
    feats[prefix + '_kurtosis'] = mag.kurtosis()
    # NOTE(review): welsh_staton returns (J, K), but this code has always
    # unpacked the pair as (K, J), so the '_welsh_J' column receives the
    # second returned value and '_welsh_K' the first. The assignment is kept
    # unchanged because previously trained models may rely on it - confirm
    # against the training code before swapping.
    feats[prefix + '_welsh_J'] = stat_second
    feats[prefix + '_welsh_K'] = stat_first
    return feats


def calculate_features(lc):
    """
    Calculate features for a light curve passed as a dataframe.

    Parameters
    ----------
    lc : pandas.DataFrame
        Light curve with 'band', 'mag' and 'e_mag' columns; an optional
        'Type' column is copied into the result.

    Returns
    -------
    pandas.DataFrame
        One row with 12 statistics for the g band, 12 for the r band
        (mean, weighted mean, MAD, IQR, four inter-quantile ranges, skew,
        kurtosis and the two welsh_staton statistics) followed by 'Type'.
        Statistics of a band with fewer than two points, and every column
        when the light curve itself has fewer than two rows, are NaN.

    The function is stateless. It used to append to module-level lists and
    return their first element, so repeated calls kept returning the
    features of the first light curve ever processed.
    """
    has_data = len(lc) > 1
    row = {}
    for band in ('g', 'r'):
        # An empty slice makes the helper emit an all-NaN set of statistics.
        band_lc = lc.loc[lc['band'] == band] if has_data else lc.iloc[0:0]
        row.update(_band_features(band_lc, band))

    if has_data and 'Type' in lc.columns:
        row['Type'] = lc['Type'].iloc[0]
    else:
        row['Type'] = np.nan
    return pd.DataFrame([row])


In [5]:
def prediction_probabilty(features):
    """
    Predict class probabilities for every variable type and classifier.

    For each variable type in the module-level `label` and each classifier
    name in `clf_names`, the model pickled as '<name>_<variable_type>.pkl'
    is loaded and asked for `predict_proba` on the feature row.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table; only its first row is used.

    Returns
    -------
    dict
        {variable_type: {classifier_name: predict_proba output}}. Earlier
        code stored a single entry per variable type, so every classifier
        but the last one was overwritten.
    """
    # NOTE(review): 'Type' is the label column that calculate_features
    # attaches, not a model input. This assumes the models were trained on
    # the remaining columns in this order - TODO confirm.
    inputs = features.drop(columns=['Type'], errors='ignore')
    # First (and only) row, as a 2-D single-sample array. The previous code
    # indexed row 1 of a one-row frame and passed the literal 1 to
    # predict_proba instead of the features.
    X = np.asarray(inputs.iloc[0], dtype=float).reshape(1, -1)

    prob = {}
    for variable_type in label:
        per_classifier = {}
        # The objects in `classifiers` are never used (the model is loaded
        # from disk); zip is kept so the iteration length is unchanged.
        for name, _ in zip(clf_names, classifiers):
            filename = name + '_' + variable_type + '.pkl'
            # SECURITY: pickle.load executes arbitrary code; only load model
            # files from a trusted location.
            with open(filename, 'rb') as model_file:
                clf = pickle.load(model_file)
            per_classifier[name] = clf.predict_proba(X)
        prob[variable_type] = per_classifier
    return prob
            