In [86]:
import os
import math

import time
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm as cm
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [87]:
# Load the labelled check-ins and order them by time, so the split below is a
# chronological hold-out: the earliest 70% of rows train, the latest 30% validate.
_train_=pd.read_csv('../input/train.csv')
places=_train_['place_id'].unique()

# DataFrame.sort() is deprecated; sort_values() is the supported spelling.
_train_.sort_values(['time'], inplace=True)
trainRatio=0.7
splitAt=int(len(_train_)*trainRatio)

# Only the training part is restricted to accuracy < 1000; the validation rows
# are kept as they are. The .copy() calls detach both frames from _train_, so
# the in-place edits done later by featureFactory() act on independent data and
# do not trigger chained-assignment warnings.
train=_train_[:splitAt]
train=train[train['accuracy']<1000].copy()

# Number of training rows per place_id (computed before any later augmentation).
placeCounts=train['place_id'].value_counts()

valid=_train_[splitAt:].copy()



In [88]:
def featureFactory(dataFrame):
    """Replace the raw ``time`` column with weighted calendar features, in place.

    ``time`` is interpreted as a count of minutes (assumed non-negative). The
    frame is modified in place and nothing is returned:

    * ``hour``     -- (hour of day + 1 + minute / 60) * 4, i.e. values in [4, 100)
    * ``weekday``  -- (day index % 7 + 1) * 3
    * ``month``    -- (30-day block index % 12 + 1) * 2
    * ``year``     -- (365-day block index + 1) * 10
    * ``accuracy`` -- 10 * log10(accuracy)
    * ``originalIndex`` -- row position (0..n-1) at the time of the call
    * ``time`` is dropped, so the function cannot be applied twice to one frame.

    The multipliers act as per-feature weights for distance-based models.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain ``time`` and ``accuracy`` columns.
    """
    print('Feature Augmentation')
    minutes = dataFrame['time']
    minuteOfHour = minutes % 60
    # Vectorised floor division instead of per-element .map(int); identical
    # results for non-negative times.
    hourIndex = minutes // 60
    dayIndex = hourIndex // 24

    dataFrame['hour'] = ((hourIndex % 24 + 1) + minuteOfHour / 60.0) * 4.0
    dataFrame.drop(['time'], axis=1, inplace=True)
    dataFrame['weekday'] = (dayIndex % 7 + 1) * 3.0
    dataFrame['month'] = ((dayIndex // 30) % 12 + 1) * 2.0
    dataFrame['year'] = (dayIndex // 365 + 1) * 10.0
    dataFrame['accuracy'] = np.log10(dataFrame['accuracy']) * 10.0
    dataFrame['originalIndex'] = np.arange(len(dataFrame))

    print ('processing is done')

# Feature columns fed to the classifiers.
f=['x','y', 'accuracy','hour', 'weekday', 'month', 'year']

In [89]:
def amplifier(dataFrame, lowerEdge=6, upperEdge=98, period=96):
    """Return wrapped copies of the rows whose ``hour`` lies next to the daily boundary.

    ``hour`` is periodic, so rows just after the start of a day are also
    neighbours of rows just before its end. Rows with ``hour < lowerEdge`` are
    returned with ``hour + period`` and rows with ``hour > upperEdge`` with
    ``hour - period``. The defaults match the scaling used by featureFactory()
    (hour in [4, 100), one day = 96 units).

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with an ``hour`` column; it is not modified.
    lowerEdge, upperEdge : number
        Rows strictly below / above these values are duplicated.
    period : number
        Shift applied to the duplicated rows.

    Returns
    -------
    pandas.DataFrame
        Only the shifted duplicates (original index labels kept); the caller is
        expected to append them to the original frame.
    """
    # .copy() so the shift is applied to independent frames rather than to
    # filtered views of the caller's data.
    wrapForward = dataFrame[dataFrame['hour'] < lowerEdge].copy()
    wrapForward['hour'] = wrapForward['hour'] + period

    wrapBackward = dataFrame[dataFrame['hour'] > upperEdge].copy()
    wrapBackward['hour'] = wrapBackward['hour'] - period

    # The original had a print after the return, which could never run; it is
    # dropped so the function stays silent exactly as before.
    return pd.concat([wrapForward, wrapBackward])

In [90]:
# Engineer the features of both splits in place. Only the training split is
# enlarged with the wrapped boundary rows from amplifier(), which makes 'hour'
# behave periodically for the models.
featureFactory(train)
train=pd.concat([train, amplifier(train)])
featureFactory(valid)
# pd.options.mode.chained_assignment = None


Feature Augmentation
processing is done
Feature Augmentation
processing is done


In [None]:
# test=pd.read_csv('../input/test.csv')
# featureFactory(test)

In [41]:
# Per-place centre and spread of the training check-ins.
g=train.groupby('place_id')
places=pd.DataFrame()
places['xMean']=g['x'].mean()
places['xStd']=g['x'].std()

places['yMean']=g['y'].mean()
places['yStd']=g['y'].std()

sTime=time.time()

# Fetch each row's place statistics once per statistic with Series.map instead
# of repeating a label-based .loc lookup inside the comparison.
placeIds=train['place_id']
xMean=placeIds.map(places['xMean']).values
xStd=placeIds.map(places['xStd']).values
yMean=placeIds.map(places['yMean']).values
yStd=placeIds.map(places['yStd']).values

# Keep rows lying within 5 standard deviations of their place's centre on both
# axes. Comparisons against a NaN or zero spread are False, so such rows are
# dropped -- same as before.
keep=((train['y']-yMean).abs()<5*yStd) & ((train['x']-xMean).abs()<5*xStd)
# Positional boolean mask: train may carry repeated index labels after the
# amplified rows were appended.
train=train[keep.values]

print(time.time()-sTime)

180.948864937


In [95]:
# Grid resolution, given as number of cells per 10 units of map extent.
nXRegions=20
nYRegions=40
# Window of the map that train_test() iterates over.
xMin=0
xMax=2
yMin=0
yMax=2
# Scale the cell counts to the window size. Floor division keeps them integers
# (they are used as loop bounds) even where "/" means true division.
nXRegions=nXRegions*int(xMax-xMin)//10
nYRegions=nYRegions*int(yMax-yMin)//10
# train=train[(train['x']<xMax) & (train['x']>=xMin) & (train['y']<yMax) & (train['y']>=yMin)]
# valid=valid[(valid['x']<xMax) & (valid['x']>=xMin) & (valid['y']<yMax) & (valid['y']>=yMin)]

print('{} {} {} {} {} {}'.format(xMin, xMax, yMin, yMax, nXRegions, nYRegions))

0 2 0 2 4 8


In [43]:
# def regionalForestPred(pred, classes, regionalPoints, data):

    
#     prob=[sorted(zip(p, classes))[-3:][::-1] for p in pred]
    
    
#     regionalPred=[]
#     for i in xrange(len(data)):
#         sampleY=data['y'].iloc[i]
#         sampleX=data['x'].iloc[i]
#         sampleDay=data['day'].iloc[i]

#         filteredPlaces=filter(lambda x: \
#                               sampleY<=yMaxMap[x[1]]+yAllowance and sampleY>=yMinMap[x[1]]-yAllowance and \
#                               sampleX<=xMaxMap[x[1]] and sampleX>=xMinMap[x[1]] and \
#                               sampleDay<=dayMaxMap[x[1]] and sampleDay>=dayMinMap[x[1]], prob[i])
#         if filteredPlaces:
#             filteredPlaces=map(lambda x: [x[0]*regionalPoints, x[1]], filteredPlaces)
#         regionalPred.append(filteredPlaces)
#     return regionalPred

In [99]:
def _sliceBounds(index, lower, upper, nSlices, digits):
    """Return the rounded (start, end) of slice `index` when [lower, upper] is cut into `nSlices` equal parts."""
    step=round(1.0*(upper-lower)/nSlices, digits)
    start=round(lower+index*step, digits)
    return start, round(start+step, digits)


def _fitRegionalClassifier(model, regionalTrain, f):
    """Fit the classifier selected by `model` on feature columns `f`, with the encoded 'le' column as target."""
    if model=='rf':
        # The time-decay weights need a 'day' column, which featureFactory()
        # does not create; without it the forest is fit with uniform weights
        # instead of failing with a KeyError.
        # NOTE(review): confirm whether a 'day' feature should be restored.
        if 'day' in regionalTrain.columns:
            sampleWeight=3.0**(-regionalTrain['day'].values)
        else:
            sampleWeight=None
        forest=RandomForestClassifier(n_jobs=-1, n_estimators=150, random_state=0)
        return forest.fit(regionalTrain[f], regionalTrain['le'], sample_weight=sampleWeight)
    if model=='xgb':
        booster=XGBClassifier(learning_rate=0.02, n_estimators=220, objective='multi:softprob', max_depth=3, seed=0)
        return booster.fit(regionalTrain[f], regionalTrain['le'])
    neighbours=KNeighborsClassifier(n_neighbors=25, weights='distance',metric='manhattan', n_jobs=-1)
    return neighbours.fit(regionalTrain[f], regionalTrain['le'])


def _topCandidates(prob, columnLabels, topN=3):
    """Return (labels, confidence), each of shape (rows, topN): the topN most probable labels per row, best first."""
    prob=np.asarray(prob)
    nRows, nClasses=prob.shape
    kept=min(topN, nClasses)
    # Stable ascending sort followed by a reversal: equal probabilities resolve
    # to the later column first, the order sorted(zip(p, classes))[-3:][::-1] gave.
    order=np.argsort(prob, axis=1, kind='mergesort')[:, ::-1][:, :kept]
    confidence=prob[np.arange(nRows)[:, None], order]
    labels=np.asarray(columnLabels)[order]
    if kept<topN:
        # Fewer classes than requested: pad with NaN so callers always get topN columns.
        padding=np.full((nRows, topN-kept), np.nan)
        labels=np.hstack([labels.astype(float), padding])
        confidence=np.hstack([confidence, padding])
    return labels, confidence


def train_test(modelSerialization=False, resultSerialization=False, model='rf', predictSet=None, th=0, xMargin=0.05, yMargin=0.025):
    """Train and score one classifier per grid cell of the map window.

    The window [xMin, xMax] x [yMin, yMax] is cut into nXRegions x nYRegions
    cells (all notebook-level settings). For every cell a model is fit on the
    rows of the notebook-level ``train`` frame lying inside the cell widened by
    ``xMargin`` / ``yMargin``; it then ranks the three most probable place_ids
    for the rows of ``predictSet`` lying inside the un-widened cell. Progress
    and scores are printed; nothing is returned.

    Parameters
    ----------
    modelSerialization : bool
        Pickle every fitted model into ``./<model>-<timestamp>/``.
    resultSerialization : bool
        Write every cell's predictions to a ``.rst`` CSV in the same folder.
        The x / y / accuracy columns written are the scaled values the model saw.
    model : {'rf', 'xgb', 'knn'}
        Classifier to use.
    predictSet : pandas.DataFrame or None
        Rows to predict. ``None`` means the notebook-level ``valid`` frame,
        looked up when the function is called (a ``predictSet=valid`` default
        would stay bound to whatever ``valid`` was when this cell was run).
        Scores are only computed when it has a ``place_id`` column.
    th : int
        Places with ``th`` or fewer training rows inside the widened cell are
        removed from that cell's training data.
    xMargin, yMargin : float
        Amount by which the training cell is widened on each side.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    if model not in ('rf', 'xgb', 'knn'):
        raise ValueError("model must be 'rf', 'xgb' or 'knn', got {!r}".format(model))
    if predictSet is None:
        predictSet=valid

    folderPath=None
    if modelSerialization or resultSerialization:
        base='./'
        folderName=model+'-'+time.strftime('%c')
        folderPath=base+folderName+'/'
        if not os.path.exists(folderPath):
            os.makedirs(folderPath)

    f=['x','y', 'accuracy','hour', 'weekday', 'month', 'year']
    hasLabels='place_id' in predictSet.columns
    roundNum=5

    count=0
    accuracies=[]
    FBscores=[]
    testNum=0

    # Timing statistics are reset after every progress report (each 10 cells).
    startTime=time.time()
    trainingTimes=[]
    predictionTimes=[]

    for xNum in range(nXRegions):
        xStart, xEnd=_sliceBounds(xNum, xMin, xMax, nXRegions, roundNum)
        for yNum in range(nYRegions):
            yStart, yEnd=_sliceBounds(yNum, yMin, yMax, nYRegions, roundNum)

            regionalTrain=train[(train['x']<xEnd+xMargin) & (train['x']>=xStart-xMargin)
                                & (train['y']<yEnd+yMargin) & (train['y']>=yStart-yMargin)]

            # Drop rarely visited places using the counts of THIS cell. The
            # original computed these regional counts but then indexed the
            # notebook-level placeCounts (whole training set), leaving them
            # unused; with th=0 both variants keep every row.
            regionalCounts=regionalTrain['place_id'].value_counts()
            frequentEnough=(regionalTrain['place_id'].map(regionalCounts)>th).values
            # .copy(): the frame is modified below and must not alias `train`.
            regionalTrain=regionalTrain[frequentEnough].copy()

            if len(regionalTrain):
                regionalPredictSet=predictSet[(predictSet['x']<xEnd) & (predictSet['x']>=xStart)
                                              & (predictSet['y']<yEnd) & (predictSet['y']>=yStart)].copy()
                nPred=len(regionalPredictSet)
                testNum+=nPred

                if model=='knn':
                    # Distance weights for the coordinates; the remaining
                    # features were already weighted by featureFactory().
                    for frame in (regionalTrain, regionalPredictSet):
                        frame['x']=frame['x']*500.0
                        frame['y']=frame['y']*1000.0
                else:
                    # Standardise with the training statistics of the cell.
                    fMean=regionalTrain[f].mean()
                    fStd=regionalTrain[f].std()
                    # A constant (or single-row) feature has zero / NaN spread;
                    # divide by 1 instead so no NaN or inf values are produced.
                    fStd=fStd.where(fStd>0, 1.0)
                    for ft in f:
                        regionalTrain[ft]=(regionalTrain[ft].values-fMean[ft])/fStd[ft]
                        regionalPredictSet[ft]=(regionalPredictSet[ft].values-fMean[ft])/fStd[ft]

                le=LabelEncoder()
                regionalTrain['le']=le.fit_transform(regionalTrain['place_id'].values)

                trainingStartTime=time.time()
                clf=_fitRegionalClassifier(model, regionalTrain, f)
                trainingTimes.append(time.time()-trainingStartTime)

                if modelSerialization:
                    modelFileName='{:04d}-{}-{}-{}-{}.clf'.format(count, xStart, xEnd, yStart, yEnd)
                    # 'wb': one pickle per file, never appended to existing content.
                    with open(folderPath+modelFileName, 'wb') as fo:
                        pickle.dump(clf, fo, pickle.HIGHEST_PROTOCOL)

                # A cell without rows to predict is trained (and optionally
                # saved) but not scored, avoiding a division by zero.
                if nPred:
                    predictionStartTime=time.time()
                    prob=clf.predict_proba(regionalPredictSet[f])
                    predictionTimes.append(time.time()-predictionStartTime)

                    # place_id belonging to each probability column.
                    columnPlaces=le.inverse_transform(clf.classes_)
                    prediction, confidence=_topCandidates(prob, columnPlaces)

                    fbscoreForSerialization=-1
                    if hasLabels:
                        truth=regionalPredictSet['place_id'].values
                        hits=prediction==truth[:, None]

                        # Share of rows whose true place is among the three candidates.
                        regionalAccuracy=float(1.0*hits.any(axis=1).sum()/nPred)
                        regionalConfidence3=float((np.nansum(confidence, axis=0)/nPred).sum())

                        # Rank-weighted score: a hit at rank r counts 1/r.
                        fbAccuracy=0
                        for rank in range(3):
                            fbAccuracy+=1.0/(rank+1)*hits[:, rank].sum()/nPred
                        fbAccuracy=float(fbAccuracy)

                        FBscores.append(fbAccuracy)
                        print('region {}: {},{} accuracy: {},  fbAccu: {}, confidence: {}:'.format(count, xNum, yNum, regionalAccuracy, fbAccuracy, regionalConfidence3))
                        accuracies.append(regionalAccuracy)
                        if resultSerialization:
                            fbscoreForSerialization=fbAccuracy

                    if resultSerialization:
                        resultFileName='{:04d}-{}-{}-{}-{}.rst'.format(count, xStart, xEnd, yStart, yEnd)

                        if len(columnPlaces)<3:
                            print('missing values {} {}'.format(count, nPred))

                        resultData={'originalIndex': regionalPredictSet['originalIndex'].tolist(),
                                    'x': regionalPredictSet['x'].tolist(),
                                    'y': regionalPredictSet['y'].tolist(),
                                    'accuracy': regionalPredictSet['accuracy'].tolist(),
                                    'pred0': prediction[:, 0], 'pred1': prediction[:, 1], 'pred2': prediction[:, 2],
                                    'conf0': confidence[:, 0], 'conf1': confidence[:, 1], 'conf2': confidence[:, 2],
                                    'regionalFBScore': [fbscoreForSerialization]*nPred}
                        # Explicit alphabetical column order, independent of dict ordering.
                        results=pd.DataFrame(resultData, columns=sorted(resultData))
                        results.to_csv(folderPath+resultFileName)

            count+=1
            if count%10==0:
                print('{} : total time {} s.'.format(count, time.time()-startTime))
                # Each average is skipped when nothing was recorded in this batch.
                if trainingTimes:
                    print('Average training Time:  {}'.format(sum(trainingTimes)/len(trainingTimes)))
                if predictionTimes:
                    print('Average prediction Time:  {}'.format(sum(predictionTimes)/len(predictionTimes)))
                if FBscores:
                    print('Average FB Score {}'.format(np.mean(FBscores)))
                print('')

                startTime=time.time()
                trainingTimes=[]
                predictionTimes=[]

    print('')
    print(np.mean(accuracies))
    print(np.var(accuracies))
    print('')
    print(np.mean(FBscores))
    print(testNum)
    print('done')
    
    
    
    

In [100]:
# Validation run of the kNN variant with slightly wider training margins than
# the defaults; no models or result files are written.
train_test(modelSerialization=False, model='knn', resultSerialization=False, predictSet=valid, th=0, xMargin=0.06, yMargin=0.03)

region 0: 0,0 accuracy: 0.680778032037,  fbAccu: 0.579915507833, confidence: 0.808461477913:
region 1: 0,1 accuracy: 0.639189407395,  fbAccu: 0.537031056569, confidence: 0.789619850518:
region 2: 0,2 accuracy: 0.631753859348,  fbAccu: 0.535752572899, confidence: 0.760886439019:
region 3: 0,3 accuracy: 0.636774193548,  fbAccu: 0.539646697389, confidence: 0.798352838264:
region 4: 0,4 accuracy: 0.65238480194,  fbAccu: 0.553609389503, confidence: 0.799855048745:
region 5: 0,5 accuracy: 0.68078994614,  fbAccu: 0.573050069818, confidence: 0.788960360875:
region 6: 0,6 accuracy: 0.647601780122,  fbAccu: 0.547250151091, confidence: 0.78165261081:
region 7: 0,7 accuracy: 0.694501617171,  fbAccu: 0.590055212519, confidence: 0.80920183616:
region 8: 1,0 accuracy: 0.635231788079,  fbAccu: 0.532744665195, confidence: 0.787102429596:
region 9: 1,1 accuracy: 0.608250251532,  fbAccu: 0.503628159395, confidence: 0.751236527141:
10 : total time 24.9809980392 s.
Average training Time:  0.0160507440567
A

KeyboardInterrupt: 

In [14]:
def makeSubmission(resultFolder):
    """Combine the per-region ``.rst`` result files of one run into a single frame.

    Parameters
    ----------
    resultFolder : str
        Folder holding the ``.rst`` CSV files written by train_test(); other
        files in the folder are ignored.

    Returns
    -------
    pandas.DataFrame
        All rows of all result files, ordered by their ``originalIndex`` column.
        The index labels are those of the individual files and may repeat.

    Raises
    ------
    ValueError
        If the folder contains no ``.rst`` file.
    """
    # sorted() only makes the read order deterministic; the final row order is
    # decided by the sort on originalIndex below.
    frames=[pd.read_csv(os.path.join(resultFolder, name))
            for name in sorted(os.listdir(resultFolder))
            if name.endswith('.rst')]
    if not frames:
        raise ValueError('no .rst result files found in {!r}'.format(resultFolder))

    results=pd.concat(frames)
    # DataFrame.sort() is deprecated; sort_values() is the supported spelling.
    results.sort_values(['originalIndex'], inplace=True)
    return results

In [None]:
# Gather the per-region predictions of the xgb run, ordered by originalIndex.
results=makeSubmission('../submissions/xgb-0.03-50Trees/')
predictions=results[['pred0','pred1','pred2']]

# One space-separated string holding the three ranked place_ids of each row.
placeStrings=predictions.apply(lambda x: ' '.join([str(nx) for nx in x]), axis=1)

# row_id is taken from originalIndex rather than from a fresh 0..n-1 counter:
# a counter silently shifts every following id as soon as one row has no
# prediction (train_test() only predicts rows with x < xEnd and y < yEnd).
# NOTE(review): assumes originalIndex equals the row_id of the predicted file,
# i.e. featureFactory() was applied to it in its original row order -- confirm.
submit=pd.DataFrame({'place_id': placeStrings.values,
                     'row_id': results['originalIndex'].values},
                    index=predictions.index, columns=['place_id', 'row_id'])


In [None]:
# Write the submission file with row_id as the index column and place_id as the
# only data column. submit is rebound to the re-indexed frame, so this cell
# cannot be run twice in a row.
submit=submit[['row_id','place_id' ]].set_index('row_id')
submit.to_csv('../submissions/xgb0.03_50Trees.csv')


In [None]:
# Eyeball the first rows of two earlier random-forest submissions side by side.
# Both bare expressions are displayed because ast_node_interactivity is "all".
p300=pd.read_csv('../submissions/rf300Trees10_250.csv').head()
p50=pd.read_csv('../submissions/rf50Trees10_250.csv').head()
p300
p50

In [None]:
# Load another previously written submission file; it is not used further in the cells shown here.
others=pd.read_csv('../submissions/rf_submission_2016-06-03-19-57.csv')

In [None]:
# Re-score the serialized predictions of one random-forest run against the
# true place_ids of the validation part of _train_.
results=makeSubmission('rf-Tue Jun 14 18:18:14 2016')
predictions=results[['pred0','pred1','pred2']].values

# Window of the map covered by that run; must match the rows that were predicted.
realXMin=0
realXMax=10.1
realYMin=0
realYMax=10.1

real=_train_[int(len(_train_)*trainRatio):]

real=real[(real['x']<realXMax) & (real['x']>=realXMin) & (real['y']<realYMax) & (real['y']>=realYMin)]
real=real[['place_id']]

print(len(real))
print(len(predictions))

# print len(predictions), len(real), results['originalIndex'].value_counts()

# Truth and predictions are matched by position, so a length mismatch would
# make every comparison below meaningless -- stop instead of scoring it.
assert len(real)==len(predictions), 'real and predictions differ in length; check the real* window'

# hits[i, r] is True when the true place of row i is the candidate at rank r.
truth=real['place_id'].values
hits=predictions==truth[:, None]

totalHit=hits.any(axis=1).sum()
# Rank-weighted score: a hit at rank 1 / 2 / 3 counts 1, 1/2 and 1/3.
score=0
score+=1.0*hits[:, 0].sum()
score+=1.0/2*hits[:, 1].sum()
score+=1.0/3*hits[:, 2].sum()

print(1.0*totalHit/len(predictions))
print(score/len(predictions))



In [None]:
# Spot check: first true labels, and the prediction row(s) stored for one
# particular originalIndex value.
real.head()

results[results['originalIndex']==934546]

In [None]:
# Load a single per-region result file of an earlier random-forest run for inspection.
r=pd.read_csv('rf-Sun Jun 12 12:17:04 2016/0000-0.0-1.0-0.0-0.04.rst')

In [None]:
# Show the first rows of the result file loaded into r.
r.head()