In [46]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,auc,log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
import matplotlib
import matplotlib.pyplot as plt

import csv
import itertools
import os
import random

from collections import Counter
from math import *

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelBinarizer

train_file = 'original data\\original_train.csv'
test_ids_file = 'original data\\original_test.csv'
output_file = 'predictions.csv'

In [47]:
# read in data
with open(train_file) as file:
    reader = csv.DictReader(file, delimiter=',')
    raw_data = [row for row in reader]
print(raw_data[0])
print(len(raw_data))

with open(test_ids_file) as file:
    reader = csv.DictReader(file, delimiter=',')
    test_ids_data = [row for row in reader]
print(test_ids_data[0])
print(len(test_ids_data))

{'timestamp': '1', 'id': '00005408fb82819bf6eef036180ce1d28ca64ac455378824a882a80ed8f99bb6', 'event': '36003'}
703331
{'id': '0001da55d168196bf25f06a497b5cf414126542f4d357d61b8203d3b2c9eb4e5'}
100870


In [48]:
# split test and train rows
test_id_set = set([i['id'] for i in test_ids_data])
raw_train_data = raw_data 
raw_test_data = [i for i in raw_data if i['id'] in test_id_set]


In [49]:
# sort by id, then timestamp
sorted_raw_train_data = sorted(raw_train_data, key = lambda x: (x['id'], int(x['timestamp'])))
for i in range(3):
    print sorted_raw_train_data[i]
    
sorted_raw_test_data = sorted(raw_test_data, key = lambda x: (x['id'], int(x['timestamp'])))
for i in range(3):
    print sorted_raw_test_data[i]

{'timestamp': '1', 'id': '00005408fb82819bf6eef036180ce1d28ca64ac455378824a882a80ed8f99bb6', 'event': '36003'}
{'timestamp': '2', 'id': '00005408fb82819bf6eef036180ce1d28ca64ac455378824a882a80ed8f99bb6', 'event': '36003'}
{'timestamp': '3', 'id': '00005408fb82819bf6eef036180ce1d28ca64ac455378824a882a80ed8f99bb6', 'event': '30018'}
{'timestamp': '26', 'id': '0001da55d168196bf25f06a497b5cf414126542f4d357d61b8203d3b2c9eb4e5', 'event': '30018'}
{'timestamp': '27', 'id': '0001da55d168196bf25f06a497b5cf414126542f4d357d61b8203d3b2c9eb4e5', 'event': '30021'}
{'timestamp': '39', 'id': '00024eca1053d4268df5c6d3308f0d008cf5e2678c443f46f83dd8a7714462c6', 'event': '30027'}


In [50]:
# count number of events by id
row_counts = [len(list(g)) for k,g in itertools.groupby(sorted_raw_train_data, key = lambda x: x['id'])]
print Counter(row_counts)
# all training ids have at least 3 records

row_counts = [len(list(g)) for k,g in itertools.groupby(sorted_raw_test_data, key = lambda x: x['id'])]
print Counter(row_counts)
# all test ids have at least 2 records

Counter({3: 107221, 4: 39512, 2: 39466, 5: 14718, 6: 5939, 7: 2409, 8: 994, 9: 499, 10: 234, 11: 153, 12: 73, 13: 37, 14: 22, 16: 9, 15: 6, 17: 3, 19: 3, 21: 3, 18: 1, 23: 1, 24: 1})
Counter({2: 39466, 3: 14718, 4: 5397, 5: 2219, 6: 879, 7: 394, 8: 165, 9: 76, 10: 47, 11: 33, 12: 14, 13: 8, 14: 4, 15: 2, 16: 2, 17: 2})


In [51]:
# group by id and construct the target variable and some predictors
# fields = 'id', 'num_events', first_event', 'last_event', last_event_timestamp', 'target_event'

def construct_features(rows, make_target = False):
    grouped_data = []
    for key, g in itertools.groupby(rows, key = lambda x: x['id']):
        group = list(g)
        if make_target == True:
            target_record = group.pop(-1)
        output_record = {'id':key, 'num_events':len(group),
                         'num_events_30018': sum([i['event']==30018 for i in group]),
                         'num_events_30021': sum([i['event']==30021 for i in group]),
                         'num_events_30024': sum([i['event']==30024 for i in group]),
                         'num_events_30027': sum([i['event']==30027 for i in group]),
                         'num_events_30039': sum([i['event']==30039 for i in group]),
                         'num_events_30042': sum([i['event']==30042 for i in group]),
                         'num_events_30045': sum([i['event']==30045 for i in group]),
                         'num_events_30048': sum([i['event']==30048 for i in group]),
                         'num_events_36003': sum([i['event']==36003 for i in group]),
                         'num_events_45003': sum([i['event']==45003 for i in group])}
        output_record['first_event'] = group[0]['event']
        output_record['last_event'] = group[-1]['event']
        output_record['last_event_timestamp'] = group[-1]['timestamp']

        if make_target == True:
            output_record['target_event'] = target_record['event']
        else:
            output_record['target_event'] = 'NA'
        grouped_data.append(output_record)
    return grouped_data

In [52]:
train_data = construct_features(sorted_raw_train_data, make_target=True)
print train_data[0]

test_data = construct_features(sorted_raw_test_data, make_target=False)
print test_data[0]

{'num_events_45003': 0, 'last_event_timestamp': '3', 'num_events': 3, 'first_event': '36003', 'num_events_36003': 0, 'id': '00005408fb82819bf6eef036180ce1d28ca64ac455378824a882a80ed8f99bb6', 'target_event': '36003', 'num_events_30048': 0, 'last_event': '30018', 'num_events_30045': 0, 'num_events_30042': 0, 'num_events_30024': 0, 'num_events_30027': 0, 'num_events_30018': 0, 'num_events_30039': 0, 'num_events_30021': 0}
{'num_events_45003': 0, 'last_event_timestamp': '27', 'num_events': 2, 'first_event': '30018', 'num_events_36003': 0, 'id': '0001da55d168196bf25f06a497b5cf414126542f4d357d61b8203d3b2c9eb4e5', 'target_event': 'NA', 'num_events_30048': 0, 'last_event': '30021', 'num_events_30045': 0, 'num_events_30042': 0, 'num_events_30024': 0, 'num_events_30027': 0, 'num_events_30018': 0, 'num_events_30039': 0, 'num_events_30021': 0}


In [53]:
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

In [54]:
train_df.head()

Unnamed: 0,first_event,id,last_event,last_event_timestamp,num_events,num_events_30018,num_events_30021,num_events_30024,num_events_30027,num_events_30039,num_events_30042,num_events_30045,num_events_30048,num_events_36003,num_events_45003,target_event
0,36003,00005408fb82819bf6eef036180ce1d28ca64ac4553788...,30018,3,3,0,0,0,0,0,0,0,0,0,0,36003
1,30021,000061e45fb216f4ad7fbc0cd86f620441a3091005eeb6...,30027,6,2,0,0,0,0,0,0,0,0,0,0,30042
2,30024,00008faca7acd5b2edf91b274eedc88e90b1de3b4003f9...,30039,9,2,0,0,0,0,0,0,0,0,0,0,30018
3,30027,0000b953a8b26886a7086673d8d1b7bd78efe139775728...,30018,16,6,0,0,0,0,0,0,0,0,0,0,30042
4,36003,00015a0e069313122a1e4043c63625839dcb634b7de275...,30027,20,3,0,0,0,0,0,0,0,0,0,0,30024


In [55]:
test_df = test_df.drop('target_event', 1)
test_df.head()

Unnamed: 0,first_event,id,last_event,last_event_timestamp,num_events,num_events_30018,num_events_30021,num_events_30024,num_events_30027,num_events_30039,num_events_30042,num_events_30045,num_events_30048,num_events_36003,num_events_45003
0,30018,0001da55d168196bf25f06a497b5cf414126542f4d357d...,30021,27,2,0,0,0,0,0,0,0,0,0,0
1,30027,00024eca1053d4268df5c6d3308f0d008cf5e2678c443f...,30018,40,2,0,0,0,0,0,0,0,0,0,0
2,45003,000273e55809afd4a9ac4fb9175effe5d0ea449ed37e7c...,45003,43,2,0,0,0,0,0,0,0,0,0,0
3,30018,0002bd1d73c326ad6e337a5687f6787b055f13079d5c52...,30027,47,3,0,0,0,0,0,0,0,0,0,0
4,30018,00030b6d5b8013bcb9bb23a9bccf394d7b361a01d0d6b7...,30027,55,3,0,0,0,0,0,0,0,0,0,0


# Model

In [45]:
predictors = ['num_events', 'first_event', 'last_event','num_events_30048', 'num_events_30045', 
              'num_events_30042', 'num_events_30024', 'num_events_30027', 'num_events_30018',
              'num_events_30039', 'num_events_30021']
target = 'target_event'

In [56]:
x_df = train_df[predictors]
y_df = train_df[target]

In [57]:
encode = preprocessing.LabelEncoder()
for column in x_df.columns:
    x_df.loc[:, column] = encode.fit_transform(x_df[column])

y_df = encode.fit_transform(y_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [61]:
x = x_df.values
y = y_df

In [62]:
def cv(x, y, model, n, model_name,mars=False):
    k_folds = KFold(x.shape[0], n_folds=n, shuffle=True)
    scores = []
    loss_list = []
    for train_indices, validation_indices in k_folds:
        # Generate training data
        x_train_cv = x[train_indices]
        y_train_cv = y[train_indices]
        # Generate validation data
        x_validate = x[validation_indices]
        y_validate = y[validation_indices]

        # Fit model on training data
        model.fit(x_train_cv, y_train_cv)
        # Score on validation data
        scores += [model.score(x_validate, y_validate)]
        # log_loss on validation data
        proba = model.predict_proba(x_validate)
        loss_list += [log_loss(y_validate,proba)]

    # Record and report accuracy
    average_score = np.mean(scores)
    average_log_loss = np.mean(loss_list)
    
    print "Score:", average_score
    print "Log_loss:", average_log_loss

    return average_score,average_log_loss

In [68]:
#Gradient Boosting Decision Tree
gbdt = GradientBoostingClassifier(n_estimators=200,max_depth=5)

In [None]:
gbdt_score, gbdt_log_loss = cv(x,y,gbdt,5,"Gradient Boosting Decision Tree")

In [None]:
model = gbdt.fit(x, y)

# 输出test结果

In [None]:
user_id = test['id'].values

In [None]:
x_test_df = test_df[predictors]
encode = preprocessing.LabelEncoder()
for column in x_test_df.columns:
    x_test_df.loc[:, column] = encode.fit_transform(x_test_df[column])
x_test = x_test_df.values

In [None]:
test_proba = model.predict_proba(x_test)

In [None]:
result = pd.DataFrame({'0id':user_id,'event_30018':test_proba[:,0],'event_30021':test_proba[:,1],'event_30024':test_proba[:,2],
                       'event_30027':test_proba[:,3],'event_30039':test_proba[:,4],'event_30042':test_proba[:,5],'event_30045':test_proba[:,6],
                       'event_30048':test_proba[:,7],'event_36003':test_proba[:,8],'event_45003':test_proba[:,9]})
result.head()