## Preprocess data / build features for RNN/LSTM/GRU

In [499]:
import copy
import os

# import lib for plot
import matplotlib.pyplot as plt
import numpy as np
# import pandas for data processing
import pandas as pd
import torch
import torch.nn as nn

## Read training data into pandas DataFrames

In [500]:
# directory that holds the CSV files; the trailing separator lets file names be
# appended directly
# NOTE(review): absolute, machine-specific path — adjust to the local data copy
data_path = "/Users/georgyguryev/Documents/repos/6.867/PLAsTiCC-Astronomical-Classification/data/"

# file names of the time-series table and of the per-object metadata table
train_ts_fname = "training_set.csv"
train_md_fname = "training_set_metadata.csv"

# load both tables, time series first and metadata second
train_time_series, train_meta_data = [
    pd.read_csv(data_path + fname)
    for fname in (train_ts_fname, train_md_fname)
]

## Generate features

In [442]:
# preview the first rows of the raw time-series table
train_time_series.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [443]:
# number of passbands assumed for every object
N_pass = 6

# unique object ids, in order of first appearance in the time-series table
Obj_list = train_time_series['object_id'].unique()

# number of distinct objects and total number of observation rows
N_obj = Obj_list.shape[0]
N_raw = train_time_series.shape[0]

# length of the longest series: the largest number of rows sharing one object id
max_Length = train_time_series['object_id'].value_counts().max()

print(Obj_list)


def generate_features(data, n_passbd = 6):
    """Summarise flux and flux error for every (object, passband) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least the columns ``object_id``, ``passband``, ``flux``
        and ``flux_err``.
    n_passbd : int, optional
        Not used by the computation; kept so existing calls stay valid.

    Returns
    -------
    pandas.DataFrame
        One row per ``(object_id, passband)`` group, with the mean and the
        standard deviation of ``flux`` followed by those of ``flux_err``.
    """
    # the same two statistics are requested for both measurement columns
    summary_stats = ['mean', 'std']
    per_column = {column: list(summary_stats) for column in ('flux', 'flux_err')}

    grouped = data.groupby(['object_id', 'passband'])
    return grouped.agg(per_column)
    
    
# call statistics generator
obj_stat = generate_features(train_time_series)

# convert DataFrame to np array: one row per (object_id, passband) group, sorted
# by object and then by passband; the four columns are
# [flux mean, flux std, flux_err mean, flux_err std]
stat_data = obj_stat.values

# Reshape each statistic into an (object, passband) table. This only works when
# every object has observations in all N_pass passbands; otherwise the row count
# differs from N_obj * N_pass and reshape raises ValueError.

# extract flux data [mean, std]
flux_mean = stat_data[:, 0].reshape((N_obj, N_pass))
flux_std  = stat_data[:, 1].reshape((N_obj, N_pass))

# extract flux error [mean, std]
flux_mean_err = stat_data[:, 2].reshape((N_obj, N_pass))
flux_std_err  = stat_data[:, 3].reshape((N_obj, N_pass))

# generate column names: "<statistic>_<passband>"
flux_mean_col = ['flux' + '_' + str(band) for band in range(N_pass)]
flux_std_col  = ['flux_std' + '_' + str(band) for band in range(N_pass)]

# the prefix carries no trailing underscore, so the names come out as
# flux_err_0 ... rather than flux_err__0
flux_err_mean_col = ['flux_err' + '_' + str(band) for band in range(N_pass)]
flux_err_std_col  = ['flux_err_std' + '_' + str(band) for band in range(N_pass)]

# create labelled data structures (default integer row index 0 .. N_obj - 1)
df_flux_mean = pd.DataFrame(flux_mean, columns=flux_mean_col)
df_flux_std  = pd.DataFrame(flux_std, columns=flux_std_col)

df_flux_err_mean = pd.DataFrame(flux_mean_err, columns=flux_err_mean_col)
df_flux_err_std  = pd.DataFrame(flux_std_err, columns=flux_err_std_col)

# merge side by side: 4 * N_pass statistic columns per object
frames = pd.concat([df_flux_mean, df_flux_std, df_flux_err_mean, df_flux_err_std], axis = 1)


[      615       713       730 ... 130762946 130772921 130779836]


## Produce time series input for RNN 


In [498]:
def create_series_features(raw_train_series, stat_train_series, n_pass=6):
    """Build zero-padded, per-observation feature rows for a sequence model.

    Every observation becomes one feature row made of
    ``[object_id, mjd, detected]`` followed by a copy of the object's row from
    ``stat_train_series`` in which two entries are replaced by the values of
    that observation: entry ``passband`` receives ``flux`` and entry
    ``passband + 2 * n_pass`` receives ``flux_err``.

    The output is organised in equally sized blocks, one block per object, in
    order of first appearance of the object id. Each block has as many rows as
    the longest series in the input; rows beyond an object's own observations
    stay zero.

    Parameters
    ----------
    raw_train_series : pandas.DataFrame
        Observations with the columns ``object_id``, ``mjd``, ``detected``,
        ``passband``, ``flux`` and ``flux_err``.
    stat_train_series : pandas.DataFrame
        Numeric per-object features. Row ``i`` is used for the ``i``-th distinct
        object id of ``raw_train_series`` (order of first appearance) — the
        caller is responsible for that alignment. Needs at least
        ``3 * n_pass`` columns.
    n_pass : int, optional
        Number of passbands; fixes the offset of the flux-error slot.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_objects * max_series_length,
        3 + stat_train_series.shape[1])``.

    Raises
    ------
    ValueError
        If ``stat_train_series`` has too few rows or columns, or if a passband
        value lies outside ``[0, n_pass)``.
    """
    n_static = stat_train_series.shape[1]
    n_features = 3 + n_static

    if n_static < 3 * n_pass:
        raise ValueError(
            "stat_train_series needs at least 3 * n_pass columns, got %d" % n_static)

    # nothing to lay out for an empty table
    if raw_train_series.shape[0] == 0:
        return np.zeros((0, n_features))

    by_object = raw_train_series.groupby('object_id', sort=False)

    # index of the object each observation belongs to (order of first appearance)
    obj_pos = by_object.ngroup().values
    # running index of the observation inside its own object's series
    step = by_object.cumcount().values

    n_obj = int(obj_pos.max()) + 1
    max_length = int(step.max()) + 1

    if stat_train_series.shape[0] < n_obj:
        raise ValueError(
            "stat_train_series has %d rows but the series contain %d objects"
            % (stat_train_series.shape[0], n_obj))

    passband = raw_train_series['passband'].values.astype(int)
    if passband.min() < 0 or passband.max() >= n_pass:
        raise ValueError("passband values must lie in [0, n_pass)")

    # allocate the zero-padded output; untouched rows remain padding
    resulting_features = np.zeros((n_obj * max_length, n_features))

    # destination row of every observation: start of its object's block + step
    rows = obj_pos * max_length + step

    static_values = np.asarray(stat_train_series.values, dtype=float)

    # leading columns: identity, time stamp and detection flag
    resulting_features[rows, 0] = raw_train_series['object_id'].values
    resulting_features[rows, 1] = raw_train_series['mjd'].values
    resulting_features[rows, 2] = raw_train_series['detected'].values

    # static per-object features, repeated for every observation of the object
    resulting_features[rows, 3:] = static_values[obj_pos]

    # overwrite the slots of the observed passband with the actual measurement
    resulting_features[rows, 3 + passband] = raw_train_series['flux'].values
    resulting_features[rows, 3 + passband + 2 * n_pass] = raw_train_series['flux_err'].values

    # return resulting features
    return resulting_features
        

# NOTE(review): train_static_data is only assigned in a later cell of this
# notebook, so on a fresh kernel this call fails with NameError unless that
# cell has been run first.
X_train = create_series_features(train_time_series, train_static_data)
# display the (rows, columns) shape of the feature matrix
X_train.shape

(2762496, 36)

## Prepare metadata: targets, class weights, and unused columns

In [457]:
# extract target classes (copied, so the array survives the column removal below)
target = train_meta_data['target'].values.copy()

# define weight vector for unique label classes: unit weight for every label
weights = {x: 1 for x in np.unique(target)}

# remember which objects have no distmod value before the gaps are filled
train_mask = train_meta_data['distmod'].isnull().values

# Fill missing distmod with 0. The result is assigned back to the column:
# calling fillna(inplace=True) on the selected column is a chained in-place
# update, which is not guaranteed to reach train_meta_data under pandas
# copy-on-write.
train_meta_data['distmod'] = train_meta_data['distmod'].fillna(0)

# double the weight of classes 15 and 64 (original note: "reweight galaxies")
weights[15] = 2
weights[64] = 2

# remove target and hostgal_specz columns (and the object id) from metadata;
# the membership check keeps this loop harmless when it is run again
remove_cols = ['hostgal_specz', 'target', 'object_id']
for c in remove_cols:
    if c in train_meta_data.columns:
        del train_meta_data[c]
# NOTE: the same columns still have to be removed from the test metadata once
# that table is loaded.



# call function        
# X_train = create_series_features(train_time_series, frames, train_meta_data)
    
# X_train_df = pd.DataFrame(X_train)


In [463]:
# Join the per-object flux statistics with the remaining metadata columns side
# by side. `sort` has to be a real boolean: the string 'False' is truthy and is
# rejected by recent pandas versions.
# NOTE(review): rows are matched on the index, so this presumes both frames list
# the objects in the same order — confirm.
train_static_data = pd.concat([frames, train_meta_data], sort=False, axis=1)
train_static_data.head()

# call function        
# X_train = create_series_features(train_time_series, train_static_data)

Unnamed: 0,flux_0,flux_1,flux_2,flux_3,flux_4,flux_5,flux_std_0,flux_std_1,flux_std_2,flux_std_3,...,flux_err_std_5,ra,decl,gal_l,gal_b,ddf,hostgal_photoz,hostgal_photoz_err,distmod,mwebv
0,-3.254554,-385.699911,-134.146566,-121.103501,-55.954592,-47.449847,83.944735,601.787302,455.121346,335.425053,...,1.317882,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,0.017
1,-2.720398,-1.019804,-0.794238,-0.986966,-0.900262,-1.794175,7.113509,5.712334,5.770738,6.450413,...,1.187223,53.085938,-27.784405,223.525509,-54.460748,1,1.6267,0.2552,45.4063,0.007
2,-0.04808,0.141057,2.40087,3.236164,4.308728,4.539396,1.828872,1.807229,5.559483,8.191987,...,1.309769,33.574219,-6.579593,170.455585,-61.548219,1,0.2262,0.0157,40.2561,0.021
3,1.797523,5.717394,9.711532,14.412924,13.134436,10.746138,4.374445,25.964659,31.957997,34.967698,...,6.924616,0.189873,-45.586655,328.254458,-68.969298,1,0.2813,1.1523,40.7951,0.007
4,0.660948,4.634637,10.243968,11.086555,9.906102,6.896742,2.360084,8.107525,21.319854,26.270649,...,1.439684,352.711273,-63.823658,316.922299,-51.059403,1,0.2415,0.0176,40.4166,0.024


# Create and Train RNN

### Define RNN class with pytorch

In [8]:
# define a simple RNN class ( to be replaced in future implementations)
class RNN(nn.Module):
    """Minimal recurrent cell built from two linear layers.

    At every step the current input and the previous hidden state are
    concatenated; one linear layer maps the result to the next hidden state and
    a second one maps it to class scores, which are returned as
    log-probabilities.
    """

    # define constructor
    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: number of features in one input step.
            hidden_size: number of units in the hidden state.
            output_size: number of output classes.
        """
        super().__init__()

        self.hidden_size = hidden_size

        # both layers consume the concatenation [input, hidden]
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    # define feed forward
    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: tensor of shape (batch, input_size). The name shadows the
                builtin but is kept so keyword callers keep working.
            hidden: tensor of shape (batch, hidden_size).

        Returns:
            Tuple ``(output, hidden)``: log-probabilities of shape
            (batch, output_size) and the new hidden state of shape
            (batch, hidden_size).
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    # define initialization of hidden state
    def initHidden(self, batch_size=1):
        """Return an all-zero hidden state.

        Args:
            batch_size: number of sequences processed together; the default of
                1 reproduces the single-sequence state.

        Returns:
            Zero tensor of shape (batch_size, hidden_size), created on the same
            device and with the same dtype as the layer weights so it can be
            concatenated with the inputs after the module has been moved or
            cast.
        """
        reference = self.i2h.weight
        return torch.zeros(batch_size, self.hidden_size,
                           device=reference.device, dtype=reference.dtype)



In [10]:
# instantiate rnn for simple test
# NOTE(review): these sizes look like placeholders — the feature matrix shown
# earlier in the notebook has 36 columns, not 20; confirm n_input and
# n_categories before feeding X_train to this model.
n_hidden = 128
n_input  = 20
n_categories = 15


rnn = RNN(n_input, n_hidden, n_categories)