In [2]:
import numpy as np


#!pip install torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader 

In [3]:
%matplotlib notebook
import os
from collections import Counter, OrderedDict
import numpy as np
from operator import itemgetter
import matplotlib.pyplot as plt
from astropy.table import Table
from cesium.time_series import TimeSeries
from tqdm import tnrange, tqdm_notebook
import sklearn 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [4]:
import light_curve as licu


In [5]:
# Passband index -> single-letter band name, kept in index order.
pbmap = OrderedDict(enumerate(('u', 'g', 'r', 'i', 'z', 'y')))

# Passband index -> colour name to associate with that band
# (same index order as pbmap).
pbcols = OrderedDict(enumerate(
    ('blueviolet', 'green', 'red', 'orange', 'black', 'brown')
))

# Band names alone, in passband-index order.
pbnames = [pbmap[index] for index in pbmap]

In [6]:
import pandas as pd
# Read the test-set metadata table from disk; nobjects is its row count.
metafilename = 'test_set_metadata.csv'
metadata = pd.read_csv(filepath_or_buffer=metafilename)
nobjects = metadata.shape[0]
# Bare last expression so the notebook renders the table.
metadata

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv
0,13,34.453125,-5.229529,169.987075,-59.956185,1,0.3048,0.3193,0.0542,41.1123,0.019
1,14,33.398438,-4.331149,167.226341,-59.936551,1,,0.6323,0.0179,42.8774,0.018
2,17,348.529419,-61.755440,321.293980,-51.763351,1,,0.8297,0.0605,43.6000,0.016
3,23,34.804688,-5.829153,171.307861,-60.174401,1,,0.6533,0.1479,42.9640,0.023
4,34,351.321442,-64.198746,317.458993,-50.429931,1,0.4557,0.4617,0.0122,42.0540,0.023
...,...,...,...,...,...,...,...,...,...,...,...
3492885,130787966,67.500000,-23.806295,221.904509,-40.940434,0,,0.4493,0.9954,41.9836,0.036
3492886,130787971,98.789062,-32.974850,241.585054,-17.528223,0,,0.6729,0.0614,43.0419,0.083
3492887,130787974,133.945312,-21.542267,247.349359,15.069447,0,,0.1211,0.0093,38.7604,0.136
3492888,130788053,199.160156,-0.895283,316.152852,61.327851,0,,0.4287,0.2616,41.8625,0.028


In [7]:
# Read one batch of light-curve observations from disk.
lcfilename = 'test_set_batch2.csv'
lcdata = pd.read_csv(filepath_or_buffer=lcfilename)
# Distinct object ids present in this batch (order is whatever the set yields).
obj_ids = [*set(lcdata['object_id'].tolist())]
# Bare last expression so the notebook renders the table.
lcdata

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,1000183,59583.2493,0,-4.635374,9.278234,0
1,1000183,59585.3547,4,6.479336,11.734409,0
2,1000183,59586.3595,4,-24.911951,13.357832,0
3,1000183,59590.3573,5,-67.180305,21.674242,0
4,1000183,59597.1711,5,78.801926,65.485069,0
...,...,...,...,...,...,...
44281690,13952424,60663.2247,3,-3.304427,2.799608,0
44281691,13952424,60664.2162,2,-6.012751,4.671264,0
44281692,13952424,60666.1202,1,-0.776305,2.129328,0
44281693,13952424,60666.2058,2,5.584902,3.180095,0


In [8]:
# Group the light-curve rows by object once and reuse that grouping for every
# column. Calling lcdata.groupby('object_id') separately for each column
# rebuilds the same grouping over the whole table five times.
observations_by_object = lcdata.groupby('object_id')

# Each *_all dict maps object_id -> list of that column's values for the
# object (one list entry per observation row of that object).
mjd_all = observations_by_object['mjd'].apply(list).to_dict()
passband_all = observations_by_object['passband'].apply(list).to_dict()
flux_all = observations_by_object['flux'].apply(list).to_dict()
flux_err_all = observations_by_object['flux_err'].apply(list).to_dict()
detected_all = observations_by_object['detected'].apply(list).to_dict()

In [9]:
def make_obj_table(obj_id):
    """Assemble the observation table of a single object.

    Looks ``obj_id`` up in the module-level per-object dictionaries
    (``mjd_all``, ``passband_all``, ``flux_all``, ``flux_err_all``,
    ``detected_all``) and returns a DataFrame with one column per
    dictionary: 'mjd', 'passband', 'flux', 'flux_err', 'detected'.

    Raises ``KeyError`` if ``obj_id`` is missing from any of the dictionaries.
    """
    # (column name, lookup table) pairs; tuple order fixes the column order.
    column_sources = (
        ('mjd', mjd_all),
        ('passband', passband_all),
        ('flux', flux_all),
        ('flux_err', flux_err_all),
        ('detected', detected_all),
    )
    columns = {name: lookup[obj_id] for name, lookup in column_sources}
    return pd.DataFrame(data=columns)

In [10]:
#six bands
def one_band_table(band, df):
    """Return the rows of ``df`` whose 'passband' value equals ``band``.

    The selected rows keep their original index labels and all columns.
    """
    in_band = df['passband'] == band
    return df[in_band]

In [11]:
def fit_one(obj_id):
    """Extract the features of one object, band by band.

    The object's table is fetched with ``make_obj_table(int(obj_id))``. For
    each passband 0..5 the rows of that band are ordered by 'mjd', and the
    time, flux and flux-error arrays (as floats) are passed to the
    module-level ``extractor``. The per-band outputs are concatenated in
    band order.

    Returns
    -------
    list
        Flat list holding the extractor output for band 0, then band 1, ...,
        then band 5.
    """
    table = make_obj_table(int(obj_id))
    collected = []
    for band in range(6):
        rows = one_band_table(band, table)

        mjd = np.asarray(rows['mjd'].tolist())
        # One permutation, derived from the times, reorders all three arrays
        # so they stay aligned observation by observation.
        order = np.argsort(mjd)
        t, m, sigma = (
            np.asarray(values)[order].astype(float)
            for values in (mjd, rows['flux'].tolist(), rows['flux_err'].tolist())
        )

        band_features = extractor(t, m, sigma,
                                  fill_value=-999,
                                  sorted=True,
                                  check=False)
        collected.extend(band_features.tolist())

    return collected

In [12]:
# do we want to add redshift???

def fit_all(obj_ids):
    """Run ``fit_one`` on every id in ``obj_ids``.

    Returns a list with one feature list per object, in the iteration order
    of ``obj_ids``.
    """
    # we can definitely split this between the cores
    return [fit_one(obj) for obj in obj_ids]
        
        

In [13]:
# Feature evaluators combined into the extractor, in this order.
feature_evaluators = [
    licu.AndersonDarlingNormal(),
    licu.InterPercentileRange(0.05),
    licu.ReducedChi2(),
    licu.StetsonK(),
    licu.WeightedMean(),
    licu.Duration(),
    licu.OtsuSplit(),
    licu.LinearFit(),
]

# Single callable used by fit_one on each band's (time, flux, flux_err) arrays.
extractor = licu.Extractor(*feature_evaluators)

In [14]:
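# The features were already extracted and saved to features_batch_2.csv, so this
# cell does not need to be re-run; uncomment it only to regenerate that file.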

#all_features=fit_all(obj_ids)
#all_obj_ids = obj_ids

#save features to csv file
#import csv
#for line in all_features:
#    with open('features_batch_2.csv', 'a', newline='') as f:
#        writer = csv.writer(f)
#        writer.writerow(line)

In [22]:
# Read the saved feature rows back from features_batch_2.csv.
# csv is imported here because no executed cell visible above imports it
# (its only import sits inside the commented-out save cell), so the original
# code raised NameError on a fresh kernel.
import csv

# The with-block closes the file as soon as all rows are read; newline='' is
# the mode the csv module documents for files passed to csv.reader.
with open('features_batch_2.csv', 'r', newline='') as datafile:
    datareader = csv.reader(datafile, delimiter=',')
    # Each row is a list of strings, exactly as csv.reader yields it.
    data = list(datareader)

In [23]:
# Build the feature tensor from `data`, the rows read from
# features_batch_2.csv. The original referenced `all_features`, which is only
# assigned inside the commented-out extraction cell, so it raised NameError on
# a fresh kernel. csv.reader yields strings, hence the explicit float32
# conversion before handing the array to torch (float32 matches what
# torch.FloatTensor produced).
# NOTE(review): assumes every row has the same number of columns, so the
# array is 2-D (objects x features) -- confirm against the saved file.
dataset = torch.tensor(np.asarray(data, dtype=np.float32))

N_train = 100000
N_test = 100000
N_val = 100000

# Consecutive, non-overlapping row ranges: train, then validation, then test.
# Slicing past the end does not raise, so a dataset with fewer than
# N_train + N_val + N_test rows silently yields short or empty splits.
train_dataset = dataset[:N_train, :]
val_dataset = dataset[N_train:N_train + N_val, :]
test_dataset = dataset[N_train + N_val:N_train + N_val + N_test, :]