In [256]:
import os

import time
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm as cm
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.covariance import EllipticEnvelope
import xgboost as xgb

import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [257]:
# Load the training check-ins and keep only rows whose `accuracy` is above 2.
_train_=pd.read_csv('../input/train.csv')
_train_=_train_[_train_['accuracy']>2]
places=_train_['place_id'].unique()

# Order by `time` so the split below is chronological: the earliest
# `trainRatio` share of rows trains the models, the rest is held out.
# DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is its replacement. Re-binding the name (instead of
# inplace=True) also avoids mutating the filtered frame in place.
_train_=_train_.sort_values('time')
trainRatio=0.8
splitAt=int(len(_train_)*trainRatio)
train=_train_[:splitAt]
valid=_train_[splitAt:]



In [258]:
def featureFactory(dataFrame):
    """Add time-derived feature columns to ``dataFrame`` in place.

    ``time`` is treated as a minute count (1440 per day, 60 per hour).
    The following columns are written (overwritten if already present):

    * ``day``           -- ``time / 1440`` truncated to an integer
    * ``weekday``       -- ``day % 7``
    * ``year``          -- ``day / 365`` truncated to an integer
    * ``month``         -- ``(day % 365) / 30`` truncated to an integer
    * ``hour``          -- ``time / 60`` truncated to an integer
    * ``hourInDay``     -- ``hour % 24``
    * ``originalIndex`` -- the row's position, ``0 .. len(dataFrame) - 1``

    Nothing is returned; the frame passed in is mutated.
    """
    minutes=dataFrame.loc[:,'time']

    # astype('int64') truncates toward zero exactly like the element-wise
    # int() it replaces, but runs vectorised instead of once per row.
    day=minutes.div(1440).astype('int64')
    hour=minutes.div(60).astype('int64')

    dataFrame.loc[:,'day']=day
    dataFrame.loc[:,'weekday']=day%7
    dataFrame.loc[:,'year']=day.div(365).astype('int64')
    dataFrame.loc[:,'month']=(day%365).div(30).astype('int64')
    dataFrame.loc[:,'hour']=hour
    dataFrame.loc[:,'hourInDay']=hour%24

    # Positional id, used later to restore row order after the per-region
    # result files are concatenated. np.arange works on Python 2 and 3
    # (xrange exists only on Python 2).
    dataFrame.loc[:,'originalIndex']=np.arange(len(dataFrame))

    
# Add the time-derived columns and `originalIndex` to both splits.
# featureFactory mutates the frame it is given.
# NOTE(review): `train` and `valid` are slices of `_train_`, so pandas may emit
# SettingWithCopyWarning here -- confirm the new columns land where expected.
featureFactory(train)
featureFactory(valid)


In [259]:
# Load the test rows and give them the same derived columns as train/valid;
# `originalIndex` records each test row's position in the file.
test=pd.read_csv('../input/test.csv')
featureFactory(test)

In [260]:
# NOTE(review): this repeats the featureFactory(test) call from the previous
# cell; it recomputes the same columns from `time`, so it looks redundant.
featureFactory(test)

In [268]:
# Grid resolution: cells per axis for every 10 units of map extent.
nXRegions=50
nYRegions=50

# Bounding box of the area that is modelled.
xMin=0
xMax=10.1
yMin=0
yMax=10.1
# Scale the cell counts to the actual extent. `//` keeps them integers under
# both Python 2 and Python 3; plain `/` yields a float once true division is
# in effect, which range()/xrange() then reject.
nXRegions=nXRegions*int(xMax-xMin)//10
nYRegions=nYRegions*int(yMax-yMin)//10
# Keep only rows inside the bounding box (lower bound inclusive, upper exclusive).
train=train[(train['x']<xMax) & (train['x']>=xMin) & (train['y']<yMax) & (train['y']>=yMin)]
valid=valid[(valid['x']<xMax) & (valid['x']>=xMin) & (valid['y']<yMax) & (valid['y']>=yMin)]


# Feature columns handed to the classifier.
f=['x','y','accuracy', 'day', 'weekday', 'hourInDay']
print(len(valid))

5730846


In [196]:
# def regionalForestPred(pred, classes, regionalPoints, data):

    
#     prob=[sorted(zip(p, classes))[-3:][::-1] for p in pred]
    
    
#     regionalPred=[]
#     for i in xrange(len(data)):
#         sampleY=data['y'].iloc[i]
#         sampleX=data['x'].iloc[i]
#         sampleDay=data['day'].iloc[i]

#         filteredPlaces=filter(lambda x: \
#                               sampleY<=yMaxMap[x[1]]+yAllowance and sampleY>=yMinMap[x[1]]-yAllowance and \
#                               sampleX<=xMaxMap[x[1]] and sampleX>=xMinMap[x[1]] and \
#                               sampleDay<=dayMaxMap[x[1]] and sampleDay>=dayMinMap[x[1]], prob[i])
#         if filteredPlaces:
#             filteredPlaces=map(lambda x: [x[0]*regionalPoints, x[1]], filteredPlaces)
#         regionalPred.append(filteredPlaces)
#     return regionalPred

In [271]:
def train_test(modelSerialization=False, resultSerialization=False, model='rf', predictSet=valid):
    """Fit one random forest per grid cell and predict the top three places.

    The rectangle [xMin, xMax) x [yMin, yMax) is cut into
    nXRegions x nYRegions cells. For every cell that contains training rows a
    RandomForestClassifier is fitted on ``train[f]`` / ``train['place_id']``
    restricted to that cell, and used to rank places for the rows of
    ``predictSet`` falling in the same cell.

    Reads the notebook globals ``train``, ``f``, ``xMin``, ``xMax``, ``yMin``,
    ``yMax``, ``nXRegions`` and ``nYRegions``.

    Parameters
    ----------
    modelSerialization : bool
        Pickle every fitted classifier to ``<folder>/<region>.clf``.
    resultSerialization : bool
        Write every cell's predictions to ``<folder>/<region>.rst`` (CSV).
    model : str
        Only used as the prefix of the output folder name; the classifier is
        always a RandomForestClassifier.
    predictSet : DataFrame
        Rows to predict. The default is the ``valid`` frame that existed when
        this function was defined. When it has a ``place_id`` column, hit rate
        and the rank-weighted score (1, 1/2, 1/3 for ranks 1-3) are reported.

    Returns
    -------
    None. Progress and summary figures are printed.
    """
    if modelSerialization or resultSerialization:
        base='./'
        folderName=model+'-'+time.strftime('%c')
        folderPath=base+folderName+'/'
        if not os.path.exists(folderPath):
            os.makedirs(folderPath)

    count=0
    accuracies=[]
    FBscores=[]

    startTime=time.time()
    trainingTimes=[]
    predictionTimes=[]

    testNum=0
    hasLabels='place_id' in predictSet.columns

    # Cell size is loop-invariant, so compute it once.
    roundNum=5
    xStep=round(1.0*(xMax-xMin)/nXRegions,roundNum)
    yStep=round(1.0*(yMax-yMin)/nYRegions,roundNum)

    for xNum in range(nXRegions):
        for yNum in range(nYRegions):

            # Cell bounds are offset by xMin/yMin so the grid still covers the
            # bounding box when it does not start at the origin.
            xStart=round(xMin+xNum*xStep,roundNum)
            xEnd=round(xStart+xStep,roundNum)
            yStart=round(yMin+yNum*yStep,roundNum)
            yEnd=round(yStart+yStep,roundNum)

            regionalTrain=train[(train['x']<xEnd) & (train['x']>=xStart) & (train['y']<yEnd) & (train['y']>=yStart)]

            if len(regionalTrain):
                regionalPredictSet=predictSet[(predictSet['x']<xEnd) & (predictSet['x']>=xStart) & (predictSet['y']<yEnd) & (predictSet['y']>=yStart)]
                testNum+=len(regionalPredictSet)

                trainingStartTime=time.time()
                clf=RandomForestClassifier(n_jobs=8, n_estimators=300, max_features=None).fit(regionalTrain[f], regionalTrain['place_id'])
                trainingTimes.append(time.time()-trainingStartTime)

                if modelSerialization:
                    modelFileName='{:04d}-{}-{}-{}-{}.clf'.format(count, xStart, xEnd, yStart, yEnd)
                    modelFilePath=folderPath+modelFileName
                    # 'wb', not append: a file must hold exactly one pickle,
                    # otherwise a reader would load a stale first object.
                    with open(modelFilePath, 'wb') as fo:
                        pickle.dump(clf, fo, pickle.HIGHEST_PROTOCOL)

                # A cell can have training rows but nothing to predict; there
                # is then nothing to score or write for it.
                if len(regionalPredictSet):
                    predictionStartTime=time.time()
                    prob=clf.predict_proba(regionalPredictSet[f])
                    predictionTimes.append(time.time()-predictionStartTime)

                    if len(clf.classes_)<3:
                        print('region {}: only {} classes, padding predictions'.format(count, len(clf.classes_)))
                    prediction, confidence=_topThree(prob, clf.classes_)

                    if resultSerialization:
                        resultFileName='{:04d}-{}-{}-{}-{}.rst'.format(count, xStart, xEnd, yStart, yEnd)
                        resultFilePath=folderPath+resultFileName

                        # Explicit column order (alphabetical) keeps the file
                        # layout independent of the pandas version.
                        results=pd.DataFrame({'originalIndex': regionalPredictSet['originalIndex'].values,
                                              'x': regionalPredictSet['x'].values,
                                              'y': regionalPredictSet['y'].values,
                                              'accuracy': regionalPredictSet['accuracy'].values,
                                              'pred0': prediction[:,0], 'pred1': prediction[:,1], 'pred2': prediction[:,2],
                                              'conf0': confidence[:,0], 'conf1': confidence[:,1], 'conf2': confidence[:,2]},
                                             columns=['accuracy', 'conf0', 'conf1', 'conf2', 'originalIndex',
                                                      'pred0', 'pred1', 'pred2', 'x', 'y'])

                        results.to_csv(resultFilePath)

                    if hasLabels:
                        # hits[i, k] is True when the true place of row i is
                        # the prediction at rank k.
                        truth=regionalPredictSet['place_id'].values
                        hits=prediction==truth[:, None]

                        regionalAccuracy=hits.any(axis=1).mean()
                        regionalConfidence3=confidence.mean(axis=0).sum()

                        fbAccuracy=hits[:,0].mean()+hits[:,1].mean()/2.0+hits[:,2].mean()/3.0
                        FBscores.append(fbAccuracy)

                        print('region {}: {},{} accuracy: {},  fbAccu: {}, confidence: {}:'.format(count, xNum, yNum, regionalAccuracy, fbAccuracy, regionalConfidence3))
                        accuracies.append(regionalAccuracy)

            count+=1
            if count%10==0:
                print('{} : total time {} s.'.format(count, time.time()-startTime))
                # The timing lists are reset every 10 cells and may be empty
                # if none of those cells had data; skip instead of dividing by 0.
                if trainingTimes:
                    print('Average training Time:  {}'.format(sum(trainingTimes)/len(trainingTimes)))
                if predictionTimes:
                    print('Average prediction Time:  {}'.format(sum(predictionTimes)/len(predictionTimes)))
                if FBscores:
                    print('Average FB Score {}'.format(np.mean(FBscores)))
                print('')

                startTime=time.time()
                trainingTimes=[]
                predictionTimes=[]

    print('')
    # Score summaries exist only when predictSet carried labels.
    if accuracies:
        print(np.mean(accuracies))
        print(np.var(accuracies))
        print('')
        print(np.mean(FBscores))
    print(testNum)
    print('done')


def _topThree(prob, classes):
    """Return (labels, probabilities) of the three best classes per row.

    ``prob`` is an (nRows, nClasses) array and ``classes`` the matching
    labels. Classes are ranked by descending probability; equal probabilities
    are ordered by descending label, which matches sorting (prob, label)
    pairs and reading them from the end. Both results have shape (nRows, 3).
    With fewer than three classes the missing ranks are filled with label -1
    and probability 0.0 (assumes integer labels).
    """
    prob=np.asarray(prob, dtype=float)
    classes=np.asarray(classes)
    nRows, nClasses=prob.shape

    # lexsort treats the LAST key as primary: probability first, label as the
    # tie-breaker, both ascending; reversing gives the descending ranking.
    order=np.lexsort((np.tile(classes, (nRows, 1)), prob), axis=1)[:, ::-1][:, :3]
    prediction=classes[order]
    confidence=prob[np.arange(nRows)[:, None], order]

    missing=3-nClasses
    if missing>0:
        prediction=np.hstack([prediction, np.full((nRows, missing), -1, dtype=prediction.dtype)])
        confidence=np.hstack([confidence, np.zeros((nRows, missing))])
    return prediction, confidence
    
    
    
    

In [None]:
# Predict the test rows region by region and write each region's top-3
# predictions to .rst files; fitted models are not pickled.
train_test(modelSerialization=False, resultSerialization=True, predictSet=test)

In [None]:
def makeSubmission(resultFolder):
    """Load every ``.rst`` result file in ``resultFolder`` into one frame.

    Each ``.rst`` file is read as CSV; the frames are concatenated and the
    rows ordered by their ``originalIndex`` column. Files with any other
    extension are ignored.

    Parameters
    ----------
    resultFolder : str
        Directory holding the per-region ``.rst`` files.

    Returns
    -------
    DataFrame
        All result rows sorted by ``originalIndex``. The index is the
        concatenation of the per-file indexes and is therefore not unique.

    Raises
    ------
    ValueError
        If the folder contains no ``.rst`` file.
    """
    frames=[]
    # Sorted listing + stable sort below make the row order reproducible even
    # when an originalIndex occurs in more than one file.
    for fileName in sorted(os.listdir(resultFolder)):
        if fileName.endswith('.rst'):
            frames.append(pd.read_csv(os.path.join(resultFolder, fileName)))

    if not frames:
        raise ValueError('no .rst result files found in {!r}'.format(resultFolder))

    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    return pd.concat(frames).sort_values('originalIndex', kind='mergesort')

In [264]:
results=makeSubmission('rf300TreesTest')
predictions=results[['pred0','pred1','pred2']]

# Join the three ranked place ids of every row into one space-separated
# string. Column-wise string concatenation replaces the row-by-row apply().
# The index is reset first because the concatenated result frame carries a
# non-unique index, which must not take part in alignment.
predStrings=predictions.reset_index(drop=True).astype(str)
placeIds=predStrings['pred0']+' '+predStrings['pred1']+' '+predStrings['pred2']

# row_id is the row's position after sorting by originalIndex.
submit=pd.DataFrame({'place_id': placeIds.values,
                     'row_id': np.arange(len(predictions))},
                    columns=['place_id', 'row_id'])




In [265]:
# Put row_id first, make it the index, and write the submission file.
submit=submit[['row_id','place_id' ]].set_index('row_id')
submit.to_csv('../submissions/rf300Trees10_250.csv')


In [267]:
# Show the first five rows of two saved submissions one after the other.
# nrows=5 stops parsing after those rows instead of loading each complete
# file only to keep its head.
p300=pd.read_csv('../submissions/rf300Trees10_250.csv', nrows=5)
p50=pd.read_csv('../submissions/rf50Trees10_250.csv', nrows=5)
p300
p50

Unnamed: 0,row_id,place_id
0,0,8017323210 1985125281 4393146716
1,1,2465239230 5437803702 2921898487
2,2,2516481553 6692804575 7862615088
3,3,7995458948 3243409743 8393706174
4,4,8711861736 2766376680 9619154293


Unnamed: 0,row_id,place_id
0,0,8017323210 1985125281 6131996960
1,1,2465239230 5801740503 4223683383
2,2,2516481553 6692804575 7862615088
3,3,7995458948 3243409743 5345410711
4,4,8711861736 2766376680 9619154293


In [238]:
# Load another saved submission CSV. `others` is not referenced again in the
# cells visible here.
others=pd.read_csv('../submissions/rf_submission_2016-06-03-19-57.csv')

In [189]:
results=makeSubmission('rf-Tue Jun 14 18:18:14 2016')
predictions=results[['pred0','pred1','pred2']].values

realXMin=0
realXMax=10.1
realYMin=0
realYMax=10.1

# Rebuild the held-out slice the same way the train/valid split was made and
# keep only its true place ids.
real=_train_[int(len(_train_)*trainRatio):]

real=real[(real['x']<realXMax) & (real['x']>=realXMin) & (real['y']<realYMax) & (real['y']>=realYMin)]
real=real[['place_id']]

print(len(real))
print(len(predictions))

# Rows are matched purely by position, so every prediction row needs a truth row.
if len(real)<len(predictions):
    raise ValueError('{} predictions but only {} ground-truth rows'.format(len(predictions), len(real)))
truth=real['place_id'].values[:len(predictions)]

# hits[i, k] is True when the true place of row i is the prediction at rank k;
# one vectorised comparison replaces four Python-level loops over all rows.
hits=predictions==truth[:, None]

totalHit=hits.any(axis=1).sum()
score=0
score+=1.0*hits[:,0].sum()
score+=1.0/2*hits[:,1].sum()
score+=1.0/3*hits[:,2].sum()

# Share of rows with the true place anywhere in the top three, then the
# rank-weighted score (1, 1/2, 1/3 for ranks 1-3) averaged over rows.
print(1.0*totalHit/len(predictions))
print(score/len(predictions))





5730846
5730846
0.570895466394
0.494175961688


In [23]:
# Spot check: first ground-truth rows, then every result row recorded for
# originalIndex 934546.
# NOTE(review): the saved output lists two rows for this single index, so the
# result folder appears to hold duplicate predictions -- confirm before scoring.
real.head()

results[results['originalIndex']==934546]

23294733    9851590985
23294779    1983039994
23294813    3915691710
23295061    4751915737
23295121    7372147589
Name: place_id, dtype: int64

Unnamed: 0.1,Unnamed: 0,accuracy,conf0,conf1,conf2,originalIndex,pred0,pred1,pred2,x,y
306,306,65,0.44,0.26,0.2,934546,3137026172,5758936842,8767363147,0.3569,0.84
393,393,65,0.44,0.22,0.06,934546,3137026172,5758936842,8675906672,0.3569,0.84


In [29]:
# Read back a single region's .rst result file from an earlier run folder.
r=pd.read_csv('rf-Sun Jun 12 12:17:04 2016/0000-0.0-1.0-0.0-0.04.rst')

In [30]:
# Display the first rows of that region's result file.
r.head()

Unnamed: 0.1,Unnamed: 0,accuracy,confidence,prediction,x,y
0,23295372,3,"(0.76000000000000001, 0.14000000000000001, 0.0...","(9516247724, 7965058889, 8813608705)",0.4866,0.0119
1,23295972,169,"(0.28000000000000003, 0.20000000000000001, 0.12)","(4941765890, 1342336464, 2797440100)",0.8878,0.0279
2,23296020,8,"(1.0, 0.0, 0.0)","(1425204074, 9973067176, 9922884570)",0.964,0.0213
3,23297600,77,"(1.0, 0.0, 0.0)","(9019790086, 9973067176, 9922884570)",0.9911,0.0206
4,23299123,16,"(0.32000000000000001, 0.29999999999999999, 0.1...","(2718702529, 9727638738, 7937617850)",0.1835,0.0201
