In [3]:
import time
start_time = time.time()
import os.path as op
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

#path_to_data = op.relpath("/modules/cs342/Assignment2/")
path_to_data = "./data"

light_curves = pd.read_csv(path_to_data + "/training_set.csv")
metadata     = pd.read_csv(path_to_data + "/training_set_metadata.csv")

target = metadata["target"]
metadata = metadata.drop("target", axis=1)

metadata = metadata.drop(["ra", "decl", "gal_l", "gal_b", "distmod", "hostgal_specz"], axis=1)

classes=np.array([6,15,16,42,52,53,62,64,65,67,88,90,92,95])

kf = KFold(n_splits=10, shuffle=True)

main_bands = ["u_band", "g_band", "r_band", "i_band", "z_band", "y_band"]

# drop flux error and detected flag for now
light_curves = light_curves.drop(["flux_err", "detected"], axis=1)

print time.time() - start_time

1.53622889519


In [5]:
def feature_expansion(metadata, light_curves):
    start_time = time.time()

    data_set = metadata.copy().reset_index().drop("index", axis=1)

    curves_index = light_curves.index.min()

    start_time = time.time()

    curves_subdf = pd.DataFrame(columns=main_bands, index=xrange(len(data_set)))

    for i, current_id in enumerate(metadata["object_id"]):
        if i % 1000 == 0: print "--" + str(i)
        start_index = curves_index
        # remove gaps + set start to 0
        #print light_curves
        shift = light_curves.at[start_index,"mjd"]
        prev_row_mjd = shift
        last_index = light_curves.index.max()
        while light_curves.at[curves_index,"object_id"] == current_id:
            this_row_mjd = light_curves.at[curves_index,"mjd"]
            if this_row_mjd - prev_row_mjd > 85:
                shift += this_row_mjd - prev_row_mjd-1
            light_curves.at[curves_index,"mjd"] -= shift
            curves_index += 1
            prev_row_mjd = this_row_mjd 
            if curves_index == last_index:
                break

        sub_curves = light_curves.loc[start_index:curves_index-1, :].drop("object_id", axis=1)

        for k, band in enumerate(main_bands):
            curves_subdf.at[i,band] = sub_curves[sub_curves.passband == k].drop("passband", axis=1)


    data_set = pd.concat([
        data_set,
        curves_subdf
    ], sort=False, axis=1).drop("object_id", axis=1)
    print time.time() - start_time
    return data_set

training_set = feature_expansion(metadata, light_curves)

--0
--1000
--2000
--3000
--4000
--5000
--6000
--7000
196.32391119


In [6]:
features_per_band = 60

def interpolate(time_series):
    places = np.linspace(0, time_series["mjd"].max(), features_per_band)
    ret = np.empty(features_per_band)
    i = 0
    for j, row in time_series.iterrows():
        if row["mjd"] >= places[i]:
            ret[i] = row["flux"]
            i += 1
    return ret


def interpolate_bands(data_set):
    start_time = time.time()
    X = data_set[["ddf", "hostgal_photoz", "hostgal_photoz_err", "mwebv"]].values
    print "--",
    for band in main_bands:
        print band[0],
        processed_X = np.empty((len(data_set), features_per_band))
        for i, row in data_set.iterrows():
            processed_X[i] = interpolate(row[band])
        X = np.append(X, processed_X, axis=1)
    print    
    print time.time() - start_time
    return X

X = interpolate_bands(training_set)
y = target.values

-- u g r i z y
98.9375021458


In [13]:
start_time = time.time()
#print light_curves.columns

naive_extraction = light_curves.drop("mjd", axis=1).groupby(["object_id", "passband"]).agg(["mean", "max", "min", "std"]).unstack("passband").values

X = np.append(metadata.values, naive_extraction, axis=1)
y = target.values

print time.time() - start_time

0.284232854843


In [7]:
start_time = time.time()

model = RandomForestClassifier(n_estimators=140, max_depth=16)
model.fit(X,y)

print time.time() - start_time


27.2858309746


In [8]:
test_metadata_backup = pd.read_csv(path_to_data + "/test_set_metadata.csv").drop(["ra", "decl", "gal_l", "gal_b", "distmod", "hostgal_specz"], axis = 1)

In [14]:
test_metadata = test_metadata_backup.copy()

left_over = pd.DataFrame()
for partial_light_curves in pd.read_csv(path_to_data + "/refor_test_data.csv", chunksize=1500000):
    print "new chunck"
    partial_light_curves = left_over.append(partial_light_curves.drop(["flux_err", "detected"], axis=1))
    
    last_id = partial_light_curves["object_id"].max()
    left_over = partial_light_curves.loc[partial_light_curves["object_id"] == last_id]
    partial_light_curves = partial_light_curves.drop(left_over.index)

    # can be made more efficent
    partial_metadata = test_metadata.loc[test_metadata["object_id"] < last_id]
    test_metadata = test_metadata.drop(partial_metadata.index)
    
    print "reformating"
    test_set = feature_expansion(partial_metadata, partial_light_curves)

    print "interpolating"
    X = interpolate_bands(test_set)

    print "predicting"
    probs = model.predict_proba(X)
    probs = np.append(probs, np.zeros((len(probs), 1)), axis=1)
    probs = np.append(np.array([partial_metadata["object_id"].values]).T, probs, axis=1)
    probs = pd.DataFrame(probs)
    probs[0] = probs[0].astype(int)
    print "writing"
    with open("./submissions/submission_random_forest_raw.csv", "a") as ofh:
        pd.DataFrame(probs).to_csv(ofh, index=False, header=False)
    

KeyboardInterrupt: 

In [161]:
partial_metadata

Unnamed: 0,object_id,ddf,hostgal_photoz,hostgal_photoz_err,mwebv
92,883,1,0.8622,0.0603,0.006
93,886,1,0.4476,0.7778,0.019
94,887,1,1.2642,0.2378,0.009
95,888,1,1.2159,0.1488,0.009
96,905,1,0.6630,0.0275,0.007
97,916,1,0.4393,0.0217,0.016
98,917,1,1.3559,0.2676,0.009
99,943,1,0.8663,0.0291,0.011
100,946,1,1.5205,0.0959,0.010
101,960,1,0.6480,0.0129,0.023


In [162]:
test_set

Unnamed: 0,ddf,hostgal_photoz,hostgal_photoz_err,mwebv,u_band,g_band,r_band,i_band,z_band,y_band
0,,,,,mjd flux 29848 25.9406 -1.0...,mjd flux 29834 0.0076 -0.8...,mjd flux 29833 0.0000 -0...,mjd flux 29835 0.0152 3...,mjd flux 29836 0.0262 5...,mjd flux 29837 0.0371 7...
1,,,,,mjd flux 30193 19.9535 0...,mjd flux 30184 0.0076 -0...,mjd flux 30183 0.0000 -2...,mjd flux 30185 0.0152 2...,mjd flux 30186 0.0261 -1...,mjd flux 30187 0.0371 -0...
2,,,,,mjd flux 30553 47.8557 0.0...,mjd flux 30514 0.0078 -0.1...,mjd flux 30513 0.0000 0.0...,mjd flux 30515 0.0155 0.3...,mjd flux 30516 0.0266 0...,mjd flux 30517 0.0377 -9...
3,,,,,mjd flux 30904 47.8557 -1.9...,mjd flux 30865 0.0078 -0.0...,mjd flux 30864 0.0000 -0.2...,mjd flux 30866 0.0155 0...,mjd flux 30867 0.0266 0...,mjd flux 30868 0.0377 1...
4,,,,,mjd flux 31255 47.8557 -0.7...,mjd flux 31216 0.0078 0.0...,mjd flux 31215 0.0000 0.4...,mjd flux 31217 0.0155 0.4...,mjd flux 31218 0.0266 1...,mjd flux 31219 0.0377 1...
5,,,,,mjd flux 31566 0.0000 -0...,mjd flux 31572 5.8984 -1...,mjd flux 31571 5.8908 ...,mjd flux 31573 5.9061 ...,mjd flux 31574 5.9170 ...,mjd flux 31575 5.9280 ...
6,,,,,mjd flux 31861 47.8557 3.1...,mjd flux 31822 0.0078 0.5...,mjd flux 31821 0.0000 0...,mjd flux 31823 0.0155 0...,mjd flux 31824 0.0266 0...,mjd flux 31825 0.0377 -3...
7,,,,,mjd flux 32212 47.8557 0.8...,mjd flux 32173 0.0078 -0.1...,mjd flux 32172 0.0000 -0...,mjd flux 32174 0.0155 1...,mjd flux 32175 0.0266 -1...,mjd flux 32176 0.0377 ...
8,,,,,mjd flux 32563 47.8557 1.1...,mjd flux 32524 0.0078 0.2...,mjd flux 32523 0.0000 -0.4...,mjd flux 32525 0.0155 2...,mjd flux 32526 0.0266 2...,mjd flux 32527 0.0377 7...
9,,,,,mjd flux 32928 68.7303 -1.3...,mjd flux 32875 0.0077 0.7...,mjd flux 32874 0.0000 -0...,mjd flux 32876 0.0154 0...,mjd flux 32877 0.0221 -4...,mjd flux 32882 2.0206 -15...


In [96]:
len(classes)

14

In [101]:
target.unique()

array([92, 88, 42, 90, 65, 16, 67, 95, 62, 15, 52,  6, 64, 53])