In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import os
import matplotlib.pyplot as plt
import seaborn as sns 
from catboost import Pool, CatBoostClassifier
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import extract_features



In [2]:
data_folder = "/Users/georgyguryev/Documents/repos/6.867/PLAsTiCC-Astronomical-Classification/data/class_data/"
train = pd.read_csv(data_folder+'class_training_set.csv')
meta_train = pd.read_csv(data_folder+'class_training_set_meta.csv')
test = pd.read_csv(data_folder+'class_test_set.csv')
meta_test = pd.read_csv(data_folder+'class_test_set_meta.csv')

In [3]:
fcp = {'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},{'coeff': 1, 'attr': 'abs'}],'kurtosis' : None, 'skewness' : None,'absolute_sum_of_changes':None,'ar_coefficient':[{'coeff': 1, 'k': 5},{'coeff': 2, 'k': 5},{'coeff': 3, 'k': 5},{'coeff': 4, 'k': 5},{'coeff': 5, 'k': 5}],'partial_autocorrelation':[{'lag':5}],'linear_trend':[{'attr':'slope'}],'fft_aggregated':[{'aggtype':'centroid'}],'sample_entropy':None,'abs_energy':None,'last_location_of_maximum':None,'last_location_of_minimum':None,}
def our_feature(df):
    part1 = df[['object_id','mjd','passband','flux']]
    part2 = df[['object_id','mjd','passband','flux_err']]
    part3 = df[['object_id','mjd','passband','detected']]
    part1 = pd.melt(part1,id_vars=['object_id','mjd','passband'],value_name='flux').pivot_table(index =['object_id','mjd'],columns='passband')
    part2 = pd.melt(part2,id_vars=['object_id','mjd','passband'],value_name='flux_err').pivot_table(index =['object_id','mjd'],columns='passband')
    part3 = pd.melt(part3,id_vars=['object_id','mjd','passband'],value_name='detected').pivot_table(index =['object_id','mjd'],columns='passband')
    
    time_series_df = (part1.join(part2)).join(part3)
    
    def diff_1(series):
        series.max() - series.min()
    
    def diff_2(series):
        (series.max() - series.min())/series.mean()
        
    simple_features = time_series_df.groupby('object_id').agg(['mean','std','min','max','skew',diff_1,diff_2])
    
    agg_df_ts = extract_features(df, column_id='object_id', column_sort='mjd', column_kind='passband', column_value = 'flux', default_fc_parameters = fcp, n_jobs=4)
    
    output = pd.merge(simple_features,agg_df_ts,left_index=True,right_index=True)
    
    return output

In [4]:
agg_train= featurize(train)

Feature Extraction: 100%|██████████| 25/25 [00:05<00:00,  6.95it/s]
Feature Extraction: 100%|██████████| 25/25 [00:00<00:00, 65.97it/s]


In [5]:
full_train = agg_train.reset_index().merge(
    right=meta_train,
    how='outer',
    on='object_id'
)

if 'target' in full_train:
    y = full_train['target']
    del full_train['target']
classes = sorted(y.unique())

class_weight = {
    c: 1 for c in classes
}
for c in [64, 15]:
    class_weight[c] = 2

print('Unique classes : ', classes)

if 'object_id' in full_train:
    oof_df = full_train[['object_id']]
    del full_train['object_id'], full_train['distmod'], full_train['hostgal_specz']
    del full_train['ra'], full_train['decl'], full_train['gal_l'],full_train['gal_b'],full_train['ddf']
    
train_mean = full_train.mean(axis=0)
full_train.fillna(0, inplace=True)

Unique classes :  [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]


In [6]:
def pre_process(df_, meta_, features, train_mean):
    # Group by object id    
    agg_ = featurize(df_)
    # Merge with meta data
    full_test = agg_.reset_index().merge(
        right=meta_,
        how='left',
        on='object_id'
    )

    full_test = full_test.fillna(0)
    return full_test

In [7]:

import time

start = time.time()
chunks = 5000000
remain_df = None

for i_c, df in enumerate(pd.read_csv(data_folder+'class_test_set.csv', chunksize=chunks, iterator=True)):
    unique_ids = np.unique(df['object_id'])
    new_remain_df = df.loc[df['object_id'] == unique_ids[-1]].copy()
    if remain_df is None:
        df = df.loc[df['object_id'].isin(unique_ids[:-1])]
    else:
        df = pd.concat([remain_df, df.loc[df['object_id'].isin(unique_ids[:-1])]], axis=0)
    # Create remaining samples df
    remain_df = new_remain_df
    preds_df = pre_process(df_=df,
                             meta_=meta_test,
                             features=full_train.columns,
                             train_mean=train_mean)

    if i_c == 0:
        preds_df.to_csv(data_folder+'test_data_clean.csv', header=True, mode='a', index=False)
    else:
        preds_df.to_csv(data_folder+'test_data_clean.csv', header=False, mode='a', index=False)

    del preds_df
    
    print('%15d done in %5.1f minutes' % (chunks * (i_c + 1), (time.time() - start) / 60), flush=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
Feature Extraction: 100%|██████████| 25/25 [00:02<00:00,  9.93it/s]
Feature Extraction: 100%|██████████| 25/25 [00:00<00:00, 155.40it/s]


        5000000 done in   0.1 minutes


In [13]:
-np.log(0.66)

0.4155154439616658