In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn.metrics import log_loss


  if 'order' in inspect.getargspec(np.copy)[0]:


In [2]:
random.seed(2016)

def run_xgb(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 3
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500
    early_stopping_rounds = 50
    test_size = 0.3

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


def create_submission(score, test, prediction):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\n')
    total = 0
    test_val = test['device_id'].values
    for i in range(len(test_val)):
        str1 = str(test_val[i])
        for j in range(12):
            str1 += ',' + str(prediction[i][j])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()


def map_column(table, f):
    labels = sorted(table[f].unique())
    mappings = dict()
    for i in range(len(labels)):
        mappings[labels[i]] = i
    table = table.replace({f: mappings})
    return table


def read_train_test():
    # Events
    print('Read events...')
    events = pd.read_csv("input/events.csv", dtype={'device_id': np.str})
    events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
    events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')

    # Phone brand
    print('Read brands...')
    pbd = pd.read_csv("input/phone_brand_device_model.csv", dtype={'device_id': np.str})
    pbd.drop_duplicates('device_id', keep='first', inplace=True)
    pbd = map_column(pbd, 'phone_brand')
    pbd = map_column(pbd, 'device_model')

    # Train
    print('Read train...')
    train = pd.read_csv("input/gender_age_train.csv", dtype={'device_id': np.str})
    train = map_column(train, 'group')
    train = train.drop(['age'], axis=1)
    train = train.drop(['gender'], axis=1)
    train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
    train = pd.merge(train, events_small, how='left', on='device_id', left_index=True)
    train.fillna(-1, inplace=True)

    # Test
    print('Read test...')
    test = pd.read_csv("input/gender_age_test.csv", dtype={'device_id': np.str})
    test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
    test = pd.merge(test, events_small, how='left', on='device_id', left_index=True)
    test.fillna(-1, inplace=True)

    # Features
    features = list(test.columns.values)
    features.remove('device_id')

    return train, test, features



In [3]:
#train, test, features = read_train_test()
#print('Length of train: ', len(train))
#print('Length of test: ', len(test))
#print('Features [{}]: {}'.format(len(features), sorted(features)))

In [4]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

In [5]:
events = pd.read_csv("input/events.csv", dtype={'device_id': np.str})
events = events.sort_values(['device_id','timestamp'])
events['timestamp'] = pd.to_datetime(events.timestamp)

In [6]:
events.ix[events.longitude==0, 'longitude']=np.nan
events.ix[events.latitude==0, 'latitude']=np.nan
events['longitude'] = events.groupby(['device_id'])['longitude'].fillna(method='ffill').fillna(method='backfill')
events['latitude'] = events.groupby(['device_id'])['latitude'].fillna(method='ffill').fillna(method='backfill')
events['longitude'] = events.groupby(['device_id'])['longitude'].transform(np.median)
# should use mode and find out the distance travels instead
events['latitude'] = events.groupby(['device_id'])['latitude'].transform(np.median) 

In [7]:
events["hour"] = pd.DatetimeIndex(events.timestamp).hour
events["day_or_night"] = 'day'
events.ix[events.hour >=23 | (events.hour <= 7) , "day_or_night"] = 'night'
events.drop('hour', axis=1, inplace=True)

tmp = pd.DataFrame(events.groupby('device_id').apply(lambda x: np.log(1+len(x))).rename('counts'))
events = events.join(tmp, on=['device_id'])

tmp = pd.DataFrame(events.ix[events.day_or_night=='day',:].groupby('device_id').apply(lambda x: np.log(1+len(x))).rename('day_counts'))
events = events.join(tmp, on=['device_id'])

tmp = pd.DataFrame(events.ix[events.day_or_night=='night',:].groupby('device_id').apply(lambda x: np.log(1+len(x))).rename('night_counts'))
events = events.join(tmp, on=['device_id'])
events.drop(['day_or_night', 'timestamp'], axis=1, inplace=True)
events = events.drop_duplicates()

In [8]:
events["day_counts"] = events["day_counts"].fillna(np.log(1))
events["night_counts"] = events["night_counts"].fillna(np.log(1))

In [9]:
events = events.sort_values('event_id')

In [16]:
app = pd.read_csv("input/app_events.csv", dtype={'app_id': np.str}).drop_duplicates()
app_labels = pd.read_csv('input/app_labels.csv').drop_duplicates()
labels_categories = pd.read_csv('input/label_categories.csv').drop_duplicates()
app_labels = pd.merge(app_labels, labels_categories, how='left', on='label_id')
app = pd.merge(app, app_labels, how='left', on='app_id')
app.drop(['is_installed', 'is_active', 'label_id'], axis=1, inplace=True)

In [11]:
events = pd.merge(events, app, how='left')

In [12]:
events['app_count'] = events.groupby('device_id')['app_id'].transform('count')
events['app_category_count'] = events.groupby('device_id')['category'].transform('count')

In [14]:
events

Unnamed: 0,event_id,device_id,longitude,latitude,counts,day_counts,night_counts,app_id,category,app_count,app_category_count
0,1,29182687948017175,121.38,31.24,5.549076,3.044522,5.468060,,,18440,18440
1,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,5.927333e+18,Property Industry 1.0,2669,2669
2,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,5.927333e+18,Relatives 1,2669,2669
3,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,5.927333e+18,Property Industry 2.0,2669,2669
4,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,5.927333e+18,Industry tag,2669,2669
5,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,5.927333e+18,IM,2669,2669
6,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,-5.720079e+18,Personal Effectiveness 1,2669,2669
7,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,-5.720079e+18,Property Industry 2.0,2669,2669
8,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,-5.720079e+18,Industry tag,2669,2669
9,2,-6401643145415154744,103.64,30.97,4.304065,2.772589,4.077537,-5.720079e+18,unknown,2669,2669


In [None]:
events.drop('event_id', axis=1, inplace=True)
events.drop('app_id', axis=1, inplace=True)
events.drop_duplicates(inplace=True)

In [None]:
events_small  = events.drop('category', axis=1).drop_duplicates()

In [None]:
# Phone brand
print('Read brands...')
pbd = pd.read_csv("input/phone_brand_device_model.csv", dtype={'device_id': np.str})
pbd.drop_duplicates('device_id', keep='first', inplace=True)
pbd = map_column(pbd, 'phone_brand')
pbd = map_column(pbd, 'device_model')

In [None]:
# Train

print('Read train...')
train = pd.read_csv("input/gender_age_train.csv", dtype={'device_id': np.str})
train = map_column(train, 'group')
train = train.drop(['age'], axis=1)
train = train.drop(['gender'], axis=1)
train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
train = pd.merge(train, events_small, how='left', on='device_id', left_index=True)


In [None]:
# Test
print('Read test...')
test = pd.read_csv("input/gender_age_test.csv", dtype={'device_id': np.str})
test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
test = pd.merge(test, events_small, how='left', on='device_id', left_index=True)

In [None]:
# Features
features = list(test.columns.values)
features.remove('device_id')

In [None]:
test_prediction, score = run_xgb(train, test, features, 'group')
print("LS: {}".format(round(score, 5)))

In [None]:
create_submission(score, test, test_prediction)