In [38]:
import time
start_time = time.time()
import os.path as op
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve,auc
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


#path_to_data = op.relpath("/modules/cs342/Assignment2/")
path_to_data = "./data"

scaler_small = MinMaxScaler((-1,1))

kf = KFold(n_splits=10, shuffle=True)

light_curves = pd.read_csv(path_to_data + "/training_set.csv")
metadata     = pd.read_csv(path_to_data + "/training_set_metadata.csv")

target = metadata["target"]
metadata = metadata.drop("target", axis=1)

metadata = metadata.drop(["ra", "decl", "gal_l", "gal_b", "distmod", "hostgal_specz"], axis=1)

# drop flux error and detected flag for now
#light_curves = light_curves.drop(["flux_err", "detected"], axis=1)


print time.time() - start_time

0.936877012253


In [70]:
start_time = time.time()

tanh_scaler = StandardScaler()
tanh_scaler.fit(np.expand_dims(light_curves["flux"].values, 1))


def process(metadata, light_curves, transform_domain):
    flux = light_curves["flux"]
    
    tanh_flux = np.tanh(transform_domain*tanh_scaler.transform(np.expand_dims(flux,1)))
    tanh_curves = light_curves.copy().assign(tanh_flux = tanh_flux)

    naive_extraction = tanh_curves.drop("mjd", axis=1).groupby(["object_id", "passband"]).agg(["mean", "max", "min", "std"]).unstack("passband")

    X = np.append(metadata.values, naive_extraction, axis=1)
    X = scaler_small.fit_transform(X)
    return X


X = process(metadata, light_curves, 11)
y = target.values

print time.time() - start_time


1.42520880699


In [52]:
cross_scores = []

n_estimators = range(80,140,10)

for i, n_estimator in enumerate(n_estimators):

    model = RandomForestClassifier(n_estimators=n_estimator)
    scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

    cross_scores.append(np.mean(scores))
    print n_estimators[i], np.mean(scores)
    
print "="*20
print n_estimators[np.argmax(cross_scores)], max(cross_scores)
    

80 0.7511448719615235
90 0.7517870141687248
100 0.7519085532302092
110 0.7480873846353828
120 0.7525432211101001
130 0.7491071428571427
120 0.7525432211101001


In [58]:
cross_scores = []

min_samples_splits = np.linspace(0, 0.0005, num=5)[1:]

for i, min_samples_split in enumerate(min_samples_splits):

    model = RandomForestClassifier(n_estimators=120, min_samples_split=min_samples_split)
    scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

    cross_scores.append(np.mean(scores))
    print min_samples_split, np.mean(scores)
    
print "="*20
print min_samples_splits[np.argmax(cross_scores)], max(cross_scores)
    

0.000125 0.7492304692577669
0.00025 0.7521678798908098


KeyboardInterrupt: 

In [59]:
start_time = time.time()

model = RandomForestClassifier(n_estimators=120, min_samples_split=0.0025 )
model.fit(X,y)

print time.time() - start_time


22.6877129078


In [60]:
test_metadata_backup = pd.read_csv(path_to_data + "/test_set_metadata.csv").drop(["ra", "decl", "gal_l", "gal_b", "distmod", "hostgal_specz"], axis = 1)
test_metadata_backup.head(5)

Unnamed: 0,object_id,ddf,hostgal_photoz,hostgal_photoz_err,mwebv
0,13,1,0.3193,0.0542,0.019
1,14,1,0.6323,0.0179,0.018
2,17,1,0.8297,0.0605,0.016
3,23,1,0.6533,0.1479,0.023
4,34,1,0.4617,0.0122,0.023


In [74]:
test_metadata = test_metadata_backup.copy()

left_over = pd.DataFrame()
for partial_light_curves in pd.read_csv(path_to_data + "/refor_test_data.csv", chunksize=2000000):
    print "new chunck ",
    partial_light_curves = left_over.append(partial_light_curves)
    
    last_id = partial_light_curves["object_id"].max()
    left_over = partial_light_curves.loc[partial_light_curves["object_id"] == last_id]
    partial_light_curves = partial_light_curves.drop(left_over.index)

    partial_metadata = test_metadata.loc[test_metadata["object_id"] < last_id].fillna(-1)
    test_metadata = test_metadata.drop(partial_metadata.index)
    print len(test_metadata), "remaining"
    
    X = process(partial_metadata, partial_light_curves, 11)

    probs = model.predict_proba(np.nan_to_num(X))
    # predict class 99 as always 0
    probs = np.append(probs, np.zeros((len(probs), 1)), axis=1)
    # include ids
    probs = np.append(np.array([partial_metadata["object_id"].values]).T, probs, axis=1)
    probs = pd.DataFrame(probs)
    probs[0] = probs[0].astype(int)
    
    with open("./submissions/submission_random_forest_raw2.csv", "a") as ofh:
        probs.to_csv(ofh, index=False, header=False)
    

 new chunck  3486831 remaining
new chunck  3480774 remaining
new chunck  3474720 remaining
new chunck  3468645 remaining
new chunck  3462570 remaining
new chunck  3451032 remaining
new chunck  3435408 remaining
new chunck  3419801 remaining
new chunck  3404182 remaining
new chunck  3388574 remaining
new chunck  3372969 remaining
new chunck  3357321 remaining
new chunck  3341680 remaining
new chunck  3326056 remaining
new chunck  3310427 remaining
new chunck  3294801 remaining
new chunck  3279192 remaining
new chunck  3263543 remaining
new chunck  3247905 remaining
new chunck  3232267 remaining
new chunck  3216640 remaining
new chunck  3201019 remaining
new chunck  3185405 remaining
new chunck  3169765 remaining
new chunck  3154135 remaining
new chunck  3138500 remaining
new chunck  3122851 remaining
new chunck  3107221 remaining
new chunck  3091598 remaining
new chunck  3075956 remaining
new chunck  3060323 remaining
new chunck  3044706 remaining
new chunck  3029064 remaining
new chunc

In [116]:
model.predict_proba(X)

array([[0.        , 0.6       , 0.        , 0.19090909, 0.        ,
        0.        , 0.03636364, 0.02727273, 0.03636364, 0.00909091,
        0.00909091, 0.06363636, 0.        , 0.02727273]])

In [127]:
test_metadata

Unnamed: 0,object_id,ddf,hostgal_photoz,hostgal_photoz_err,mwebv
3492889,130788054,0,0.3625,0.7335,0.013
