In [1]:
import os
import math

import time
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm as cm
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [2]:
train=pd.read_csv('../input/train.csv')
test=pd.read_csv('../input/test.csv')

places=train['place_id'].unique()
train=train[train['accuracy']<1000]
placeCounts=train['place_id'].value_counts()

In [3]:
def featureFactory(dataFrame):
    dataFrame.loc[:,'xTimeY']=dataFrame.loc[:, 'x']*dataFrame.loc[:, 'y']
    dataFrame.loc[:,'xDivY']=dataFrame.loc[:, 'x'].div(dataFrame.loc[:, 'y'])


    
    dataFrame.loc[:,'day']=dataFrame.loc[:, 'time'].div(1440).map(int)
    dataFrame.loc[:,'weekday']=dataFrame.loc[:,'day']%7
    dataFrame.loc[:,'weekdayShifted']=(dataFrame.loc[:,'weekday']+3)%7
    dataFrame.loc[:,'dayInMonth']=dataFrame.loc[:, 'day']%(30)
    dataFrame.loc[:,'year']=dataFrame.loc[:,'day'].div(365).map(int)
    
    dataFrame.loc[:,'month']=(dataFrame.loc[:,'day']%365).div(30).map(int)
    dataFrame.loc[:,'hour']=dataFrame.loc[:,'time'].div(60).map(int)
    dataFrame.loc[:, 'hourInDay']=dataFrame.loc[:,'hour']%24
    dataFrame.loc[:, 'hourInDayShifted1']=(dataFrame.loc[:,'hourInDay']+6)%24
    dataFrame.loc[:, 'hourInDayShifted2']=(dataFrame.loc[:,'hourInDay']+12)%24
    dataFrame.loc[:, 'hourInDayShifted3']=(dataFrame.loc[:,'hourInDay']+18)%24

 
    dataFrame.loc[:, 'originalIndex']=xrange(len(dataFrame))

In [4]:
featureFactory(train)
featureFactory(test)

In [5]:
g=train.groupby('place_id')
places=pd.DataFrame()
places['xMean']=g['x'].mean()
places['xStd']=g['x'].std()

places['yMean']=g['y'].mean()
places['yStd']=g['y'].std()

places['hourInDayMean']=g['hourInDay'].mean()
places['hourInDayStd']=g['hourInDay'].std()

places['monthMean']=g['month'].mean()
places['monthStd']=g['month'].std()

places['dayMean']=g['day'].mean()
places['dayStd']=g['day'].std()

places.head()

sTime=time.time()

train=train[(abs(train['y']-places.loc[train['place_id'], 'yMean'].values)<5*places.loc[train['place_id'], 'yStd'].values)\
         & (abs(train['x']-places.loc[train['place_id'], 'xMean'].values)<5*places.loc[train['place_id'], 'xStd'].values)\
           ]


Unnamed: 0_level_0,xMean,xStd,yMean,yStd,hourInDayMean,hourInDayStd,monthMean,monthStd,dayMean,dayStd
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000015801,2.671644,0.208356,5.54916,0.015983,16.371795,5.451341,3.25641,2.902913,154.333333,134.180501
1000017288,7.336637,0.338709,4.346527,0.010262,12.694737,4.141317,4.673684,3.269596,222.210526,139.521591
1000025138,0.991042,0.076025,5.570373,0.010221,12.523979,3.688114,5.062167,3.519853,289.238011,144.162461
1000052096,2.856832,0.474205,5.833412,0.01132,9.726042,3.457918,4.223958,3.140248,265.909375,168.317613
1000063498,4.054855,1.523148,7.566078,0.073799,15.1,4.049273,5.066667,2.489753,427.7,124.677659


In [6]:
nXRegions=20
nYRegions=40
xMin=0
xMax=10.1
yMin=0
yMax=10.1
# nXRegions=nXRegions*int(xMax-xMin)/10
# nYRegions=nYRegions*int(yMax-yMin)/10
# train=train[(train['x']<xMax) & (train['x']>=xMin) & (train['y']<yMax) & (train['y']>=yMin)]
# valid=valid[(valid['x']<xMax) & (valid['x']>=xMin) & (valid['y']<yMax) & (valid['y']>=yMin)]

In [7]:
def train_test(modelSerialization=False, resultSerialization=False, model='rf', predictSet=test, th=0, xMargin=0.05, yMargin=0.025):
    
    if modelSerialization or resultSerialization:
        base='./'
        folderName=model+'-'+time.strftime('%c')
        folderPath=base+folderName+'/'
        if not os.path.exists(folderPath):
            os.makedirs(folderPath)
            
    
#     f=['x','y','xTimeY','xDivY', 'accuracy', 'day', 'weekday', 'hourInDay','hourInDayShifted2', 'month', 'year']
    f=['x','y', 'accuracy', 'day','weekday', 'hourInDay','hourInDayShifted2', 'month', 'year']
        
    count=0
    accuracies=[]
    FBscores=[]
    
    startTime=time.time()
    trainingTimes=[]
    predictionTimes=[]
    
    testNum=0

    roundNum=5

    for xNum, yNum in ((xn, yn) for xn in xrange(nXRegions) for yn in xrange(nYRegions)):

        xStep=round(1.0*(xMax-xMin)/nXRegions,roundNum)
        xStart=round(xMin+xNum*xStep,roundNum)
        xEnd=round(xStart+xStep,roundNum)

        yStep=round(1.0*(yMax-yMin)/nYRegions,roundNum)
        yStart=round(yMin+yNum*yStep,roundNum)
        yEnd=round(yStart+yStep,roundNum)
        
#         regionalTrain=train[(train['x']<xEnd) & (train['x']>=xStart) & (train['y']<yEnd) & (train['y']>=yStart)]

        regionalTrain=train[(train['x']<xEnd+xMargin) & (train['x']>=xStart-xMargin) & (train['y']<yEnd+yMargin) & (train['y']>=yStart-yMargin)]
#         print xStart, xEnd, yStart, yEnd, len(regionalTrain)

        
            
#         regionalTrain['sampleWeight']=


        placesCounts=regionalTrain['place_id'].value_counts()
        regionalTrain=regionalTrain[(placeCounts[regionalTrain['place_id'].values]>th).values]
        
        tail=len(regionalTrain)/6
        latest=regionalTrain.sort_values(by=['time'])[:tail]
        regionalTrain=regionalTrain.append(latest)
        
        
        addition=regionalTrain[regionalTrain['hourInDay']<=3]
        addition['hourInDay']=addition['hourInDay']+24
        regionalTrain=regionalTrain.append(addition)
        
        addition=regionalTrain[regionalTrain['hourInDay']>=21]
        addition['hourInDay']=addition['hourInDay']-24
        regionalTrain=regionalTrain.append(addition)


        
        if len(regionalTrain):
#             regionalPredictSet=predictSet[(predictSet['x']<xEnd-margin) & (predictSet['x']>=xStart+margin) & (predictSet['y']<yEnd-margin) & (predictSet['y']>=yStart+margin)]
            regionalPredictSet=predictSet[(predictSet['x']<xEnd) & (predictSet['x']>=xStart) & (predictSet['y']<yEnd) & (predictSet['y']>=yStart)]

            testNum+=len(regionalPredictSet)

            if model=='knn':
                fWeights={'x': 500, 'y': 1000, 'accuracy':0, 'day': 0,'dayInMonth': 0, 'weekday': 3,\
                          'hourInDay': 4, 'hourInDayShifted2': 0, 'month': 2, 'year': 10}
                for ft in f:
                    regionalTrain[ft]=regionalTrain[ft].values*fWeights[ft]
                    regionalPredictSet[ft]=regionalPredictSet[ft].values*fWeights[ft]
                
            else:
                fMean=regionalTrain[f].mean()
                fStd=regionalTrain[f].std()

                for ft in f:       
                    regionalTrain[ft]=(regionalTrain[ft].values-fMean[ft])/(fStd[ft])
                    regionalPredictSet[ft]=(regionalPredictSet[ft].values-fMean[ft])/(fStd[ft])
#                 print regionalTrain['x'].describe()

      
            le=LabelEncoder()
            regionalTrain['le']=le.fit_transform(regionalTrain['place_id'].values)

#             s_w=1**(15-regionalTrain['day'].values/30.0)
            
            trainingStartTime=time.time()
            if model=='rf':
                s_w_rf=3**(-regionalTrain['day'].values)
#                 s_w_rf=1.65-regionalTrain['day'].values
    #             clf=RandomForestClassifier(n_jobs=-1, n_estimators=300, max_features=None).fit(regionalTrain[f], regionalTrain['le'])   
            
                clf=RandomForestClassifier(n_jobs=-1, n_estimators=220, random_state=0).fit(regionalTrain[f], regionalTrain['le'], sample_weight=s_w_rf)  

            if model=='xgb':
                clf=XGBClassifier(learning_rate=0.02, n_estimators=220, objective='multi:softprob', max_depth=3, seed=0).fit(regionalTrain[f], regionalTrain['le'])
            
            
            if model=='knn':
                clf=KNeighborsClassifier(n_neighbors=25, weights='distance',metric='manhattan', n_jobs=-1).fit(regionalTrain[f], regionalTrain['le'])
         
            trainingTimes.append(time.time()-trainingStartTime)
            
            
            if modelSerialization:
                modelFileName='{:04d}-{}-{}-{}-{}.clf'.format(count, xStart, xEnd, yStart, yEnd)
                modelFilePath=folderPath+modelFileName
                with open(modelFilePath, 'ab+') as fo:
                    pickle.dump(clf, fo, pickle.HIGHEST_PROTOCOL)
            
            
            predictionStartTime=time.time()
            prob=clf.predict_proba(regionalPredictSet[f])
            predictionTimes.append(time.time()-predictionStartTime)
            
            pred=[sorted(zip(p, clf.classes_))[-3:][::-1] for p in prob]
            prediction=le.inverse_transform([zip(*p)[1] for p in pred])
            confidence=[zip(*p)[0] for p in pred]
            
            
                
            fbscoreForSerialization=-1
            if 'place_id' in predictSet.columns:
                regionalAccuracy=1.0*sum([regionalPredictSet['place_id'].iloc[i] in prediction[i] for i in xrange(len(regionalPredictSet))])/len(regionalPredictSet)
                regionalConfidence=[sum(z)/len(regionalPredictSet) for z in zip(*confidence)]
                regionalConfidence3=sum(regionalConfidence)

                fbAccuracy=0
                fbAccuracy+=1.0*sum([regionalPredictSet['place_id'].iloc[i] in prediction[i][:1] for i in xrange(len(regionalPredictSet))])/len(regionalPredictSet)
                fbAccuracy+=1.0/2*sum([regionalPredictSet['place_id'].iloc[i] in prediction[i][1:2] for i in xrange(len(regionalPredictSet))])/len(regionalPredictSet)
                fbAccuracy+=1.0/3*sum([regionalPredictSet['place_id'].iloc[i] in prediction[i][2:3] for i in xrange(len(regionalPredictSet))])/len(regionalPredictSet)
                FBscores.append(fbAccuracy)
                print 'region {}: {},{} accuracy: {},  fbAccu: {}, confidence: {}:'.format(count, xNum, yNum, regionalAccuracy, fbAccuracy, regionalConfidence3)
                accuracies.append(regionalAccuracy)
                if resultSerialization:
                    fbscoreForSerialization=fbAccuracy
                
            if resultSerialization:
                resultFileName='{:04d}-{}-{}-{}-{}.rst'.format(count, xStart, xEnd, yStart, yEnd)
                resultFilePath=folderPath+resultFileName
                predColumns=zip(*prediction)
                confColumns=zip(*confidence)

                
                if len(predColumns[2])!=len(prediction):
                    print 'missing values',count, len(predColumns[0])
                
                results=pd.DataFrame({'originalIndex': regionalPredictSet['originalIndex'].tolist(),\
                                      'x':regionalPredictSet['x'].tolist(), 'y':regionalPredictSet['y'].tolist(), \
                                      'accuracy':regionalPredictSet['accuracy'].tolist(), 'pred0':predColumns[0], \
                                      'pred1':predColumns[1], 'pred2':predColumns[2], 'conf0': confColumns[0], \
                                      'conf1': confColumns[1], 'conf2': confColumns[2], 'regionalFBScore': [fbscoreForSerialization]*len(regionalPredictSet)})

                results.to_csv(resultFilePath)



        count+=1
        if count%10==0:

            print '{} : total time {} s.'.format(count, time.time()-startTime)
            print 'Average training Time: ', sum(trainingTimes)/len(trainingTimes)
            print 'Average prediction Time: ', sum(predictionTimes)/len(predictionTimes)
            print 'Average FB Score', np.mean(FBscores)
            print 
            
            startTime=time.time()
            trainingTimes=[]
            predictionTimes=[]


    print 
    print np.mean(accuracies)
    print np.var(accuracies)
    print
    print np.mean(FBscores)
    print testNum
    print 'done'
    
    
    
    

In [8]:
train_test(modelSerialization=False, model='knn', resultSerialization=True, predictSet=test, th=0, xMargin=0.05, yMargin=0.025)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


10 : total time 85.3168530464 s.
Average training Time:  6.88978066444
Average prediction Time:  0.835693383217
Average FB Score nan





20 : total time 86.9544138908 s.
Average training Time:  6.97141299248
Average prediction Time:  0.910240840912
Average FB Score nan

30 : total time 77.4730520248 s.
Average training Time:  6.18258111477
Average prediction Time:  0.79344933033
Average FB Score nan

40 : total time 69.498677969 s.
Average training Time:  5.50412442684
Average prediction Time:  0.720944714546
Average FB Score nan

50 : total time 91.4971561432 s.
Average training Time:  7.43871691227
Average prediction Time:  0.925125074387
Average FB Score nan

60 : total time 253.260708809 s.
Average training Time:  16.4027125597
Average prediction Time:  1.64271223545
Average FB Score nan

70 : total time 261.091228962 s.
Average training Time:  13.5731440544
Average prediction Time:  1.23648784161
Average FB Score nan

80 : total time 135.711693048 s.
Average training Time:  10.474893713
Average prediction Time:  1.12666509151
Average FB Score nan

90 : total time 109.000499964 s.
Average training Time:  8.972041749

KeyboardInterrupt: 

In [12]:
def makeSubmission(resultFolder):
    results=[]
    for f in os.listdir(resultFolder):
        if f.endswith('.rst'):
            fi=resultFolder+'/'+f
            results.append(pd.read_csv(fi))
    results=pd.concat(results)
        
    results.sort(['originalIndex'], inplace=True)
    return results

In [13]:
results=makeSubmission('../submissions/xgb-0.03-50Trees/')
predictions=results[['pred0','pred1','pred2']]
submit=pd.DataFrame()
# submit.loc[:,'row_id']=np.arange(len(predictions))
submit.loc[:,'place_id']=predictions[['pred0', 'pred1', 'pred2']].apply(lambda x: ' '.join([str(nx) for nx in x]), axis=1)
submit.loc[:,'row_id']=np.arange(len(predictions))




In [14]:
submit=submit[['row_id','place_id' ]]
submit.set_index('row_id', inplace=True)
submit.to_csv('../submissions/xgb0.03_50Trees.csv')


In [267]:
p300=pd.read_csv('../submissions/rf300Trees10_250.csv').head()
p50=pd.read_csv('../submissions/rf50Trees10_250.csv').head()
p300
p50

Unnamed: 0,row_id,place_id
0,0,8017323210 1985125281 4393146716
1,1,2465239230 5437803702 2921898487
2,2,2516481553 6692804575 7862615088
3,3,7995458948 3243409743 8393706174
4,4,8711861736 2766376680 9619154293


Unnamed: 0,row_id,place_id
0,0,8017323210 1985125281 6131996960
1,1,2465239230 5801740503 4223683383
2,2,2516481553 6692804575 7862615088
3,3,7995458948 3243409743 5345410711
4,4,8711861736 2766376680 9619154293


In [238]:
others=pd.read_csv('../submissions/rf_submission_2016-06-03-19-57.csv')

In [189]:
results=makeSubmission('rf-Tue Jun 14 18:18:14 2016')
predictions=results[['pred0','pred1','pred2']].values

realXMin=0
realXMax=10.1
realYMin=0
realYMax=10.1

real=_train_[int(len(_train_)*trainRatio):]

real=real[(real['x']<realXMax) & (real['x']>=realXMin) & (real['y']<realYMax) & (real['y']>=realYMin)]
real=real[['place_id']]

print len(real)
print len(predictions)

# print len(predictions), len(real), results['originalIndex'].value_counts()

totalHit=sum([real['place_id'].iloc[i] in predictions[i] for i in xrange(len(predictions))])
score=0
score+=1.0*sum([real['place_id'].iloc[i] in predictions[i][:1] for i in xrange(len(predictions))])
score+=1.0/2*sum([real['place_id'].iloc[i] in predictions[i][1:2] for i in xrange(len(predictions))])
score+=1.0/3*sum([real['place_id'].iloc[i] in predictions[i][2:3] for i in xrange(len(predictions))])

print 1.0*totalHit/len(predictions)
print score/len(predictions)





5730846
5730846
0.570895466394
0.494175961688


In [23]:
real.head()

results[results['originalIndex']==934546]

23294733    9851590985
23294779    1983039994
23294813    3915691710
23295061    4751915737
23295121    7372147589
Name: place_id, dtype: int64

Unnamed: 0.1,Unnamed: 0,accuracy,conf0,conf1,conf2,originalIndex,pred0,pred1,pred2,x,y
306,306,65,0.44,0.26,0.2,934546,3137026172,5758936842,8767363147,0.3569,0.84
393,393,65,0.44,0.22,0.06,934546,3137026172,5758936842,8675906672,0.3569,0.84


In [29]:
r=pd.read_csv('rf-Sun Jun 12 12:17:04 2016/0000-0.0-1.0-0.0-0.04.rst')

In [30]:
r.head()

Unnamed: 0.1,Unnamed: 0,accuracy,confidence,prediction,x,y
0,23295372,3,"(0.76000000000000001, 0.14000000000000001, 0.0...","(9516247724, 7965058889, 8813608705)",0.4866,0.0119
1,23295972,169,"(0.28000000000000003, 0.20000000000000001, 0.12)","(4941765890, 1342336464, 2797440100)",0.8878,0.0279
2,23296020,8,"(1.0, 0.0, 0.0)","(1425204074, 9973067176, 9922884570)",0.964,0.0213
3,23297600,77,"(1.0, 0.0, 0.0)","(9019790086, 9973067176, 9922884570)",0.9911,0.0206
4,23299123,16,"(0.32000000000000001, 0.29999999999999999, 0.1...","(2718702529, 9727638738, 7937617850)",0.1835,0.0201
