# Create samples from the training data

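The goal of this notebook is to build one sample per object from the training data provided by the organizers: each sample combines the metadata of the object with its light curves (observation dates, fluxes, flux errors and detection flags), stored separately for each passband.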

In [227]:
import os, sys
import pandas as pd
import numpy as np
from util import *

In [228]:
# Load the two training tables:
#   df  -- the observations (one row per measurement)
#   dfm -- the metadata (one row per object)
datapath = "data/training_set.csv"
metadata = "data/training_set_metadata.csv"

df = pd.read_csv(datapath)
dfm = pd.read_csv(metadata)

# Single-argument print(...) gives the same output on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(dfm.tail())

      object_id          ra       decl       gal_l      gal_b  ddf  \
7843  130739978   26.718750 -14.940303  172.342697 -72.255675    0   
7844  130755807  120.101349 -62.696659  275.742955 -16.509746    0   
7845  130762946  203.108109 -55.682144  308.728904   6.727511    0   
7846  130772921   79.101562 -35.501846  239.172243 -33.827844    0   
7847  130779836  301.992188 -17.426323   25.102988 -24.511101    0   

      hostgal_specz  hostgal_photoz  hostgal_photoz_err  distmod  mwebv  \
7843         0.0000          0.0000              0.0000      NaN  0.013   
7844         0.1725          2.5606              1.1146  46.6108  0.136   
7845         0.0000          0.0000              0.0000      NaN  0.430   
7846         0.0000          0.0000              0.0000      NaN  0.034   
7847         0.0000          0.0000              0.0000      NaN  0.091   

      target  
7843      65  
7844      90  
7845      16  
7846      65  
7847       6  


In [240]:
# identifiers of the objects and of the passbands present in the observations
indexes = np.unique(df["object_id"])
bands = np.unique(df["passband"])

# in case we want to map back to the real filters
bands_mapping = {0: "u", 1: "g", 2: "r", 3: "i", 4: "z", 5: "y"}
# df["passband"].map(bands_mapping)


# First row of each table, transposed so that every column is visible.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(dfm.head(1).T)
print("\n\n")
print(df.head(1).T)


                             0
object_id           615.000000
ra                  349.046051
decl                -61.943836
gal_l               320.796530
gal_b               -51.753706
ddf                   1.000000
hostgal_specz         0.000000
hostgal_photoz        0.000000
hostgal_photoz_err    0.000000
distmod                    NaN
mwebv                 0.017000
target               92.000000



                      0
object_id    615.000000
mjd        59750.422900
passband       2.000000
flux        -544.810303
flux_err       3.622952
detected       1.000000


In [230]:
from tqdm import tqdm  # fancy progress prints

# Split the observations once by (object_id, passband).  The previous version
# rebuilt two full-length boolean masks over df for every object, band and
# column, i.e. it re-scanned the whole table thousands of times.  Rows keep
# their original relative order inside each group, so the arrays extracted
# below are the same as the ones obtained through the masks.
curves = {
    key: group
    for key, group in df.groupby(["object_id", "passband"], sort=False)
}

# Zero-row frame with the columns (and dtypes) of df: it stands in for an
# (object, band) pair without any observation, so that such a pair still gets
# empty arrays instead of raising a KeyError.
no_observation = df.iloc[:0]

series_list = []
for i in tqdm(range(len(dfm))):
    dfitem = dfm.iloc[i]
    # The metadata row is a single Series, so the id may come back as a float
    # (see the integer cast at the end of the cell); the group keys are
    # integers, hence the explicit conversion before the lookup.
    object_id = int(dfitem["object_id"])

    # Appending to a Series or a DataFrame row by row is awkward, so each
    # sample is assembled as a dictionary and turned into a Series afterwards.
    sample = dfitem.to_dict()

    # Light curves: one array per passband and per measured quantity.
    for b in bands:
        curve = curves.get((object_id, b), no_observation)
        sample["mjds_%s" % b] = curve["mjd"].values
        sample["fluxes_%s" % b] = curve["flux"].values
        sample["fluxerrs_%s" % b] = curve["flux_err"].values
        sample["detected_%s" % b] = curve["detected"].values

    # create a Series with the dict and put that in a big list
    series_list.append(pd.Series(sample))

# create a dataframe out of the list: one row per object
samples = pd.concat(series_list, axis=1).T

# The metadata values went through per-row Series and are stored as floats;
# restore the columns that are integers by nature (object ids, class labels).
samples[["object_id", "target"]] = samples[["object_id", "target"]].astype(int)



Wrote training_samples.pkl


In [231]:
# Pickle the assembled samples to "training_samples.pkl" for later use.
# NOTE(review): writepickle is not defined in this notebook; presumably it is
# provided by `from util import *` -- confirm.
writepickle(samples, "training_samples.pkl")

Wrote training_samples.pkl


In [243]:
# Reload the pickled samples (this rebinds `samples`) and display the row at
# position 654 as a quick look at what one sample contains.
# NOTE(review): readpickle is not defined in this notebook; presumably it is
# provided by `from util import *` -- confirm.
samples = readpickle("training_samples.pkl")
samples.iloc[654]

Read training_samples.pkl


ddf                                                                   1
decl                                                           -46.7685
detected_0            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
detected_1            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
detected_2            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
detected_3            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
detected_4            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
detected_5            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
distmod                                                         41.6466
fluxerrs_0            [1.688356, 1.931612, 1.795236, 2.735116, 2.843...
fluxerrs_1            [0.619115, 2.848189, 1.178133, 0.971095, 3.389...
fluxerrs_2            [0.630288, 2.086514, 1.240831, 1.002995, 2.331...
fluxerrs_3            [1.065594, 2.487419, 2.088372, 1.705569, 2.485...
fluxerrs_4            [1.825245, 3.297901, 2.579784, 2.62245, 2.