In [32]:
import pandas as pd
import numpy as np
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [14]:
def produceProbSummary(resultFolder):
    '''
    produce a summary of prediction probilities from a folder of srt files (csv format)
    input:
    
    resultFolder: string (folder in ../submissions/)
    
    output:
    
    results: pandas DataFrame containing:
        row_id
        pred0
        pred1
        pred2
        conf0
        conf1
        conf2
    
    
    '''
    resultFolder='../submissions/'+resultFolder
    
    results=[]
    for f in os.listdir(resultFolder):
        if f.endswith('.rst'):
            fi=resultFolder+'/'+f
            results.append(pd.read_csv(fi))
    results=pd.concat(results)
    results.drop(['accuracy','regionalFBScore'], inplace=True)
    results.sort_values(by=['originalIndex'], inplace=True)
    return results

In [15]:
produceProbSummary('knn-01').head()

Unnamed: 0.1,Unnamed: 0,accuracy,conf0,conf1,conf2,originalIndex,pred0,pred1,pred2,regionalFBScore,x,y
0,0,20.293838,0.316522,0.169611,0.159732,0,4393146716,8017323210,6131996960,-1,82.075,1333.584
0,0,15.44068,0.289022,0.089012,0.082195,1,2465239230,5801740503,9801651394,-1,3621.541,2479.498
0,0,17.923917,0.468894,0.288894,0.115907,2,2516481553,7862615088,7295979691,-1,3967.922,2300.354
0,0,17.923917,0.746153,0.110147,0.07901,3,7995458948,3243409743,8393706174,-1,489.51,1037.918
0,0,16.0206,0.368029,0.270289,0.148313,4,4764406629,8277155346,8711861736,-1,326.83,9530.892


In [27]:
def combineRecords(folders):
    print 'reading folders'
    results=[produceProbSummary(folder)[:1000] for folder in folders]
    print 'reading is done.  Analyzing files'
    if len(list(set([len(r) for r in results])))!=1:
        print "unequal number of records in folders, aborting..."
        return
    
    places=['pred0', 'pred1', 'pred2']
    confs=['conf0', 'conf1', 'conf2']
    weights=[100, 10, 1]
    place_confs=zip(places, confs, weights)
    
    ranks=[]
    print 'making ranks'
    for i in xrange(len(results[0])):
        scores=dict()
        for r in results:
            rRec=r.iloc[i]
            for p, c, w in place_confs:
                if rRec[p] in scores:
                    scores[rRec[p]]+=rRec[c]*w
                else:
                    scores[rRec[p]]=rRec[c]*w
        scores=sorted([[s,p] for p, s in scores.items()])[-3:][::-1]
        ranks.append(' '.join([str(s[1]) for s in scores]))
        if i%1000000==0:
            print i, 'lines are done'
    
    res=pd.DataFrame({'row_id': xrange(len(results[0])), 'place_id': ranks})
    print 'all done'
    return res

In [28]:
res=combineRecords(['knn-01', 'knn-02'])

reading folders
reading is done.  Analyzing files
making ranks
0 lines are done
all done


In [29]:
res.head()

Unnamed: 0,place_id,row_id
0,4393146716.0 6131996960.0 8017323210.0,0
1,2465239230.0 5801740503.0 4634106612.0,1
2,2516481553.0 7862615088.0 7295979691.0,2
3,7995458948.0 3243409743.0 8393706174.0,3
4,4764406629.0 8277155346.0 8711861736.0,4


In [31]:
f1=pd.read_csv('../submissions/knn-01.csv')
f2=pd.read_csv('../submissions/knn-02.csv')

In [33]:
f1.head()
f2.head()

Unnamed: 0,row_id,place_id
0,0,4393146716 8017323210 6131996960
1,1,2465239230 5801740503 9801651394
2,2,2516481553 7862615088 7295979691
3,3,7995458948 3243409743 8393706174
4,4,4764406629 8277155346 8711861736


Unnamed: 0,row_id,place_id
0,0,4393146716 6131996960 8017323210
1,1,2465239230 4634106612 5801740503
2,2,2516481553 7862615088 7295979691
3,3,7995458948 3243409743 8643187406
4,4,4764406629 8277155346 8711861736
