In [1]:
import os

import time
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm as cm
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.covariance import EllipticEnvelope
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [2]:
# Load the raw training and test tables.
train=pd.read_csv('../input/train.csv')
test=pd.read_csv('../input/test.csv')

# Every distinct place id present in the unfiltered training data.
places=train['place_id'].unique()

# Drop rows whose accuracy is 2 or below.
train=train[train['accuracy'].gt(2)]

# Keep only rows whose place still has more than 10 rows after the accuracy
# filter; the counts are looked up per row and applied as a positional mask.
placeCounts=train['place_id'].value_counts()
train=train[train['place_id'].map(placeCounts).gt(10).values]

In [3]:
def featureFactory(dataFrame):
    """Add time-derived feature columns to ``dataFrame`` in place.

    The ``time`` column is treated as a count of minutes (1440 per day,
    60 per hour). The following columns are added, in this order:

    - ``day``: whole days elapsed (``time / 1440`` truncated).
    - ``weekday``: ``day % 7``.
    - ``year``: whole 365-day years elapsed.
    - ``month``: 30-day bucket within the 365-day year (0..12).
    - ``hour``: whole hours elapsed (``time / 60`` truncated).
    - ``hourInDay``: ``hour % 24``.
    - ``originalIndex``: 0-based row position at the time of the call, so
      rows can be put back in order after being split up.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with a numeric ``time`` column. It is modified in place.

    Returns
    -------
    None
    """
    minutes=dataFrame.loc[:,'time']

    # astype truncates toward zero exactly like int(), but is vectorised
    # instead of calling int() once per row.
    dataFrame.loc[:,'day']=minutes.div(1440).astype('int64')
    dataFrame.loc[:,'weekday']=dataFrame.loc[:,'day']%7
    dataFrame.loc[:,'year']=dataFrame.loc[:,'day'].div(365).astype('int64')
    dataFrame.loc[:,'month']=(dataFrame.loc[:,'day']%365).div(30).astype('int64')
    dataFrame.loc[:,'hour']=minutes.div(60).astype('int64')
    dataFrame.loc[:,'hourInDay']=dataFrame.loc[:,'hour']%24

    # np.arange replaces the Python-2-only xrange and works on both 2 and 3.
    dataFrame.loc[:,'originalIndex']=np.arange(len(dataFrame))

    
# Add the time-derived columns and originalIndex to both frames; featureFactory
# modifies its argument in place and returns nothing.
featureFactory(train)
featureFactory(test)

In [4]:
# Number of grid cells along each axis per 10 units of extent.
nXRegions=50
nYRegions=50

# Bounding box of the area that is split into grid cells.
xMin=0
xMax=10.1
yMin=0
yMax=10.1

# Scale the cell counts to the (integer) extent of the bounding box. Floor
# division keeps the counts integers on both Python 2 and Python 3; plain "/"
# yields a float under true division, which range()/xrange() reject.
nXRegions=nXRegions*int(xMax-xMin)//10
nYRegions=nYRegions*int(yMax-yMin)//10
# Restrict the training rows to the half-open box [xMin, xMax) x [yMin, yMax).
train=train[
    train['x'].ge(xMin) & train['x'].lt(xMax)
    & train['y'].ge(yMin) & train['y'].lt(yMax)
]

# Columns handed to the classifier as features.
f=[
    'x',
    'y',
    'accuracy',
    'day',
    'weekday',
    'hourInDay',
    'month',
    'year',
]

In [None]:
def _safeMean(values):
    """Return the mean of ``values``, or NaN (without a NumPy warning) when it is empty."""
    return np.mean(values) if len(values) else float('nan')


def _topPredictions(prob, classes, topN=3):
    """Return the ``topN`` most probable classes and their probabilities per row.

    Each row of ``prob`` is ranked by descending probability; ties are broken
    by the larger class label first, because ``(probability, class)`` pairs
    are sorted ascending and then reversed. Rows yield fewer than ``topN``
    entries when there are fewer than ``topN`` classes.

    Returns a pair ``(prediction, confidence)`` of lists of tuples.
    """
    ranked=[sorted(zip(p, classes))[-topN:][::-1] for p in prob]
    prediction=[tuple(label for _, label in row) for row in ranked]
    confidence=[tuple(score for score, _ in row) for row in ranked]
    return prediction, confidence


def _rankColumn(rows, rank, fill):
    """Return element ``rank`` of every tuple in ``rows``, using ``fill`` where a row is too short."""
    return [row[rank] if rank<len(row) else fill for row in rows]


def _scoreRegion(truth, prediction, confidence, topN=3):
    """Score one region's predictions against the true labels.

    Returns ``(accuracy, fbScore, confidence)`` where ``accuracy`` is the share
    of rows whose true label is among the predictions, ``fbScore`` weights a
    hit at rank k (0-based) by ``1/(k+1)``, and ``confidence`` is the sum over
    ranks of the mean predicted probability.
    """
    nRows=len(truth)
    accuracy=1.0*sum(t in p for t, p in zip(truth, prediction))/nRows
    meanConfidence=sum(sum(z)/nRows for z in zip(*confidence))

    fbScore=0
    for rank in range(topN):
        # Slicing (rather than indexing) tolerates rows with fewer than topN predictions.
        hits=sum(t in p[rank:rank+1] for t, p in zip(truth, prediction))
        fbScore+=1.0/(rank+1)*hits/nRows
    return accuracy, fbScore, meanConfidence


def train_test(modelSerialization=False, resultSerialization=False, model='rf', predictSet=test):
    """Train one random forest per grid cell and predict the top 3 places for ``predictSet``.

    The box [xMin, xMax) x [yMin, yMax) is split into ``nXRegions`` x
    ``nYRegions`` cells. For every cell that contains training rows, a
    ``RandomForestClassifier`` is fitted on the feature columns ``f`` of the
    global ``train`` frame and used to rank places for the rows of
    ``predictSet`` that fall in the same cell. Timing and, when labels are
    available, accuracy figures are printed every 10 cells and at the end.

    Compared with the earlier version this one
    - no longer raises ZeroDivisionError in the progress report when none of
      the last 10 cells had training data (NaN is printed instead),
    - skips prediction and scoring for cells with no rows to predict,
    - pads missing ranks (fewer than 3 classes in a cell) with NaN / 0.0 in
      the result files instead of raising IndexError,
    - writes model pickles with 'wb' instead of appending,
    - offsets cell bounds by ``xMin`` / ``yMin``,
    - runs on both Python 2 and Python 3.

    Parameters
    ----------
    modelSerialization : bool
        Pickle every fitted classifier to ``<folder>/<cell>.clf``.
    resultSerialization : bool
        Write every cell's predictions to ``<folder>/<cell>.rst`` (CSV).
    model : str
        Only used as the prefix of the output folder name; the classifier
        is always a random forest.
    predictSet : pandas.DataFrame
        Rows to predict; needs ``x``, ``y``, ``accuracy``, ``originalIndex``
        and the columns in ``f``. If it has a ``place_id`` column, accuracy
        scores are computed as well.

    Returns
    -------
    None
    """
    if modelSerialization or resultSerialization:
        base='./'
        folderName=model+'-'+time.strftime('%c')
        folderPath=base+folderName+'/'
        if not os.path.exists(folderPath):
            os.makedirs(folderPath)

    topN=3
    hasLabels='place_id' in predictSet.columns

    count=0
    accuracies=[]
    FBscores=[]

    startTime=time.time()
    trainingTimes=[]
    predictionTimes=[]

    testNum=0

    roundNum=5
    # int() keeps range() happy even if the region counts were computed with
    # true division and ended up as floats.
    nX=int(nXRegions)
    nY=int(nYRegions)
    xStep=round(1.0*(xMax-xMin)/nX,roundNum)
    yStep=round(1.0*(yMax-yMin)/nY,roundNum)

    for xNum, yNum in ((xn, yn) for xn in range(nX) for yn in range(nY)):

        xStart=round(xMin+xNum*xStep,roundNum)
        xEnd=round(xStart+xStep,roundNum)

        yStart=round(yMin+yNum*yStep,roundNum)
        yEnd=round(yStart+yStep,roundNum)

        regionalTrain=train[(train['x']<xEnd) & (train['x']>=xStart) & (train['y']<yEnd) & (train['y']>=yStart)]

        if len(regionalTrain):
            regionalPredictSet=predictSet[(predictSet['x']<xEnd) & (predictSet['x']>=xStart) & (predictSet['y']<yEnd) & (predictSet['y']>=yStart)]
            testNum+=len(regionalPredictSet)

            trainingStartTime=time.time()
            # Alternatives that were tried here:
            #   RandomForestClassifier(n_jobs=8, n_estimators=300, max_features=None)
            #   XGBClassifier(learning_rate=0.03, n_estimators=50, objective='multi:softprob')
            clf=RandomForestClassifier(n_jobs=8, n_estimators=300, random_state=0).fit(regionalTrain[f], regionalTrain['place_id'])
            trainingTimes.append(time.time()-trainingStartTime)

            regionTag='{:04d}-{}-{}-{}-{}'.format(count, xStart, xEnd, yStart, yEnd)

            if modelSerialization:
                # 'wb', not append: a file must hold exactly one pickled model.
                with open(folderPath+regionTag+'.clf', 'wb') as fo:
                    pickle.dump(clf, fo, pickle.HIGHEST_PROTOCOL)

            # Nothing to predict or score in this cell.
            if len(regionalPredictSet):
                predictionStartTime=time.time()
                prob=clf.predict_proba(regionalPredictSet[f])
                predictionTimes.append(time.time()-predictionStartTime)

                prediction, confidence=_topPredictions(prob, clf.classes_, topN)

                if resultSerialization:
                    if len(clf.classes_)<topN:
                        print('missing values {} {}'.format(count, len(prediction)))

                    columns={'originalIndex': regionalPredictSet['originalIndex'].tolist(),
                             'x': regionalPredictSet['x'].tolist(),
                             'y': regionalPredictSet['y'].tolist(),
                             'accuracy': regionalPredictSet['accuracy'].tolist()}
                    # Ranks the model could not supply are padded so every
                    # result file has the same pred0..pred2 / conf0..conf2 columns.
                    for rank in range(topN):
                        columns['pred{}'.format(rank)]=_rankColumn(prediction, rank, np.nan)
                    for rank in range(topN):
                        columns['conf{}'.format(rank)]=_rankColumn(confidence, rank, 0.0)

                    pd.DataFrame(columns).to_csv(folderPath+regionTag+'.rst')

                if hasLabels:
                    truth=regionalPredictSet['place_id'].tolist()
                    regionalAccuracy, fbAccuracy, regionalConfidence3=_scoreRegion(truth, prediction, confidence, topN)
                    FBscores.append(fbAccuracy)

                    print('region {}: {},{} accuracy: {},  fbAccu: {}, confidence: {}:'.format(count, xNum, yNum, regionalAccuracy, fbAccuracy, regionalConfidence3))
                    accuracies.append(regionalAccuracy)

        count+=1
        if count%10==0:
            # Timings cover only the last 10 cells; the FB score is cumulative.
            print('{} : total time {} s.'.format(count, time.time()-startTime))
            print('Average training Time:  {}'.format(_safeMean(trainingTimes)))
            print('Average prediction Time:  {}'.format(_safeMean(predictionTimes)))
            print('Average FB Score {}'.format(_safeMean(FBscores)))
            print('')

            startTime=time.time()
            trainingTimes=[]
            predictionTimes=[]

    print('')
    print(_safeMean(accuracies))
    print(np.var(accuracies) if accuracies else float('nan'))
    print('')
    print(_safeMean(FBscores))
    print(testNum)
    print('done')
    
    
    
    

In [None]:
# Fit one model per grid cell on `train` and predict `test`; per-cell result
# files are written (resultSerialization=True) but the fitted models are not
# pickled (modelSerialization=False).
train_test(modelSerialization=False, resultSerialization=True, predictSet=test)

10 : total time 51.8459470272 s.
Average training Time:  3.53897089958
Average prediction Time:  1.06334178448
Average FB Score nan





In [18]:
def makeSubmission(resultFolder):
    """Load every ``.rst`` result file in ``resultFolder`` into one frame.

    The files are read as CSV, concatenated and ordered by their
    ``originalIndex`` column.

    Parameters
    ----------
    resultFolder : str
        Directory containing the ``.rst`` files; other files are ignored.

    Returns
    -------
    pandas.DataFrame
        All result rows sorted by ``originalIndex``. The index labels are the
        per-file ones and therefore not unique.

    Raises
    ------
    ValueError
        If the folder contains no ``.rst`` file.
    """
    # Sorted so the outcome does not depend on the order os.listdir happens to use.
    resultFiles=sorted(name for name in os.listdir(resultFolder) if name.endswith('.rst'))
    if not resultFiles:
        raise ValueError('no .rst result files found in {!r}'.format(resultFolder))

    results=pd.concat([pd.read_csv(os.path.join(resultFolder, name)) for name in resultFiles])

    # DataFrame.sort() was removed from pandas; sort_values is its replacement.
    return results.sort_values('originalIndex')

In [21]:
results=makeSubmission('../submissions/rf100Trees50-50Fulltraining/')
predictions=results[['pred0','pred1','pred2']]

# One space-separated string of place ids per row. Missing predictions are
# skipped and ids are cast to int so a float-typed column cannot leak "123.0"
# or "nan" into the submission.
submit=pd.DataFrame({
    'place_id': [' '.join(str(int(p)) for p in row if pd.notnull(p)) for row in predictions.values]
})

# Take row_id from originalIndex rather than np.arange: the two agree when every
# test row was predicted, but only originalIndex stays aligned if some rows were
# skipped. NOTE(review): assumes test.csv row_id equals the row position, as the
# previous np.arange version did - confirm.
submit['row_id']=results['originalIndex'].values




In [22]:
# Keep the two submission columns, index the frame by row_id and write it out.
submit=submit[['row_id','place_id' ]].set_index('row_id')
submit.to_csv('../submissions/rf100Trees50-50Fulltraining.csv')
