In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

import lightgbm as lgb
import xgboost as xgb
import sklearn.datasets
import gc

### Data Prep.

In [16]:
train_X = pd.read_csv('music_data/train_X.csv')
train_y = pd.read_csv('music_data/train_Y.csv')

val_X = pd.read_csv('music_data/valid_X.csv')
val_y = pd.read_csv('music_data/valid_Y.csv')

test_X = pd.read_csv('music_data/test_X.csv')
test_y = pd.read_csv('music_data/test_Y.csv')


train_tiny_X = pd.read_csv('music_data/train_tiny_X.csv')
train_tiny_y = pd.read_csv('music_data/train_tiny_Y.csv')

val_tiny_X = val_X[:1000]
val_tiny_y = val_y[:1000]

In [45]:
# Observing reasonable date for date time conversion
# 2004 seems to be the earliest date. '1970-01-01' is essentially null value (1 in train).
print('train x', train_X['registration_init_time'].min(), train_X['expiration_date'].min())
print('val x', val_X['registration_init_time'].min(), val_X['expiration_date'].min())
print('test x', test_X['registration_init_time'].min(), test_X['expiration_date'].min())


# Not many records smaller than '2000-01-01'. Reasonable to not treat them specially
print(train_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date == '1970-01-01').sum(), '1970-01-01 in val set')

print(train_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in train set')
print(val_X['expiration_date'].apply(lambda date: date < '2000-01-01').sum(), 'smaller than 2000-01-01 in val set')

train x 2004-03-26 1970-01-01
val x 2004-03-26 1970-01-01
test x 2004-03-26 2004-10-16
1 1970-01-01 in train set
2 1970-01-01 in val set
1 smaller than 2000-01-01 in train set
2 smaller than 2000-01-01 in val set


In [53]:
def date_to_int(column, base_date=datetime.strptime('2000-01-01', "%Y-%m-%d")):
    """
    Convert date to day counts since base_date for given columns.
    
    :param column: pandas column containing date representations as str value
    :param base_date: base date from which date is counted. i.e. 2000-01-02 will be day '1' comparing to '2000-01-01'
    """
    
    def date_diff(date):
        date = datetime.strptime(date, "%Y-%m-%d")
        return (date - base_date).days
    
    return column.apply(date_diff)

def convert_date_columns_to_int(dataframe, cols=['expiration_date', 'expiration_date']):
    for col in cols:
        dataframe[col+'_int'] = date_to_int(dataframe[col])
    dataframe.drop(columns=cols)


In [55]:
# for x in [train_X, val_X, test_X, train_tiny_X, val_tiny_X]:
convert_date_columns_to_int(val_tiny_X)
display(val_tiny_X['expiration_date_int'])


# TODO: Convert all X, save to music_date/temp directory with name train_X_date_converted etc.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0      6849
1      6475
2      6779
3      6460
4      6476
5      6460
6      6488
7      6472
8      6482
9      6528
10     6404
11     6461
12     6472
13     6245
14     6525
15     6766
16     6475
17     6826
18     6487
19     6483
20     6713
21     6464
22     6612
23     6442
24     6402
25     6403
26     6504
27     6460
28     6472
29     6430
       ... 
970    6192
971    6474
972    6469
973    6463
974    6204
975    6487
976    6463
977    6486
978    6398
979    6484
980    6479
981    6499
982    6482
983    6270
984    6482
985    6485
986    6485
987    6429
988    6465
989    6373
990    6460
991    6530
992    6476
993    6701
994    6483
995    6478
996    6493
997    6461
998    6507
999    6487
Name: expiration_date_int, Length: 1000, dtype: int64

## Gradient Boosting

In [5]:
def run_lgbc(train_X, train_y, val_X, val_y):
    params = {
        "objective" : "binary",
        "metric" : "binary_logloss", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 2018,
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return model, pred_val_y

In [11]:
print(train_tiny_X['registration_init_time'][1])
print(train_tiny_X['expiration_date'][2])

2013-09-09
2017-08-15


In [8]:
# TODO: Data type issue. Time is not permitted in lgbm.
# Specifically 'registration_init_time' and 'expiration_date' should be converted to int

modelC, pred_val_y = run_lgbc(train_tiny_X, train_tiny_y['target'], val_tiny_X, val_tiny_y)
# print(classification_report(val_tiny_y['target'], modelC.predict(val_tiny_X)>0.5))
# print(roc_auc_score(val_tiny_y, modelC.predict(val_tiny_X)))

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields registration_init_time, expiration_date

## Decision Tree

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
pipelines = {
    "dtclass": make_pipeline(DecisionTreeClassifier(random_state=0))
}
decisiontree_hyperparameters = {
    'decisiontreeclassifier__max_depth' : np.arange(3, 10),
    'decisiontreeclassifier__max_features' : np.arange(3, 8),
    'decisiontreeclassifier__min_samples_split' : np.arange(2, 15),
    "decisiontreeclassifier__min_samples_leaf" : np.arange(1,3)
}
dtclass_model = RandomizedSearchCV(pipelines['dtclass'],decisiontree_hyperparameters,n_iter = 100,cv=3, scoring = 'roc_auc')
dtclass_model.fit(train_tiny_X, train_tiny_y['target'])
print(dtclass_model.best_params_)