In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import sklearn.preprocessing as preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import roc_auc_score

In [2]:
# Ensemble of identically configured random forests that differ only in their
# random seed (0 .. nmodels-1).
nmodels = 5
models = [
    RandomForestClassifier(n_estimators=800, max_depth=None, min_samples_leaf=1,
                           random_state=seed, criterion='entropy')
    for seed in range(nmodels)
]


In [14]:
# Load the per-bidder training features and keep only fully populated rows
# (any row containing a NaN is discarded).
train_data = pd.read_csv('train_bidders_info_final.csv').dropna()
train_data.head()

Unnamed: 0,bidder_id,number of bids,avg arrival interval,min interval,number of auctions,number of urls,merchandise,number of devices,number of countries,min reaction time,median reaction time,class
0,060ac64ff78ae62e9bc697192cebea333760p,194,63665394055,1187134502,24,5,home goods,6,4,0,16894736842,1
1,7505bfc1ca67a5ae14e329b46a88ab869ms9g,798,17059763587,2011695906,85,446,jewelry,192,23,0,842105263,1
2,b1863365ae232a316eb946ba56a29415m0gdz,5329,2551307886,70175438,114,845,home goods,240,29,0,894736842,1
3,458c8c4e10e5c1b29e3fbe5ee56d46e8y3fb3,1165,11717444384,2783625731,103,845,sporting goods,166,37,0,52631579,1
4,6533b670e02ccf583288ba7c8f5fb1carmirt,425,21576961271,222222222,39,282,sporting goods,4,3,0,1105263158,1


In [48]:
# Training feature frame: every column except the label ('class') and the
# bidder identifier.
X_train = train_data.drop(['class', 'bidder_id'], axis=1)
X_train.head()

Unnamed: 0,number of bids,avg arrival interval,min interval,number of auctions,number of urls,merchandise,number of devices,number of countries,min reaction time,median reaction time
0,194,63665394055,1187134502,24,5,home goods,6,4,0,16894736842
1,798,17059763587,2011695906,85,446,jewelry,192,23,0,842105263
2,5329,2551307886,70175438,114,845,home goods,240,29,0,894736842
3,1165,11717444384,2783625731,103,845,sporting goods,166,37,0,52631579
4,425,21576961271,222222222,39,282,sporting goods,4,3,0,1105263158


In [53]:
# One-hot encode the categorical 'merchandise' column as 'merch_<category>'
# indicator columns and append them after the remaining feature columns.
merch_dummies_train = pd.get_dummies(X_train['merchandise'], prefix='merch')
X_train_final = pd.concat(
    [X_train.drop('merchandise', axis=1), merch_dummies_train],
    axis=1)
X_train_final.head()

Unnamed: 0,number of bids,avg arrival interval,min interval,number of auctions,number of urls,number of devices,number of countries,min reaction time,median reaction time,merch_auto parts,merch_books and music,merch_clothing,merch_computers,merch_furniture,merch_home goods,merch_jewelry,merch_mobile,merch_office equipment,merch_sporting goods
0,194,63665394055,1187134502,24,5,6,4,0,16894736842,0,0,0,0,0,1,0,0,0,0
1,798,17059763587,2011695906,85,446,192,23,0,842105263,0,0,0,0,0,0,1,0,0,0
2,5329,2551307886,70175438,114,845,240,29,0,894736842,0,0,0,0,0,1,0,0,0,0
3,1165,11717444384,2783625731,103,845,166,37,0,52631579,0,0,0,0,0,0,0,0,0,1
4,425,21576961271,222222222,39,282,4,3,0,1105263158,0,0,0,0,0,0,0,0,0,1


In [18]:
# Label vector: the 'class' column of the training frame as a plain array.
class_column = train_data.get('class')
y_train = class_column.values
y_train.shape

(1162,)

In [19]:
# Display (rows, columns) of the training feature frame; the row count should
# equal the length of y_train.
X_train.shape

(1162, 10)

In [20]:
# Load the per-bidder test features. Unlike the training frame, rows with
# missing values are kept here.
test_features_path = 'test_bidders_info_final.csv'
test_data = pd.read_csv(test_features_path)
test_data.head()

Unnamed: 0,bidder_id,number of bids,avg arrival interval,min interval,number of auctions,number of urls,merchandise,number of devices,number of countries,min reaction time,median reaction time
0,7d804e4c5a1da2ad391ab5f67a9db4f5jdlj6,46,1700062000000.0,110865500000.0,15,29,sporting goods,35,4,0,4578947000.0
1,c686cbe1d937f81c2b50ec8bd67b9b843hglg,3,36873000000000.0,,3,2,jewelry,3,3,105263158,9526316000.0
2,560eeb674015e1284864fbdf0d0b10bbchfvp,2,91315790000.0,,1,2,mobile,2,1,52631579,78947370.0
3,08078a2526666deea62ca7c065bf22c4h7i8s,1,,,1,1,mobile,1,1,0,0.0
4,8d4cc3988c9cb3b902c07cdd1a68fd4f36cit,1,,,1,1,office equipment,1,1,105263158,105263200.0


In [44]:
# Test feature frame: the test data without the bidder identifier column.
X_test = test_data.drop(['bidder_id'], axis=1)
X_test.head(n=10)

Unnamed: 0,number of bids,avg arrival interval,min interval,number of auctions,number of urls,merchandise,number of devices,number of countries,min reaction time,median reaction time
0,46,1700062000000.0,110865500000.0,15,29,sporting goods,35,4,0,4578947000.0
1,3,36873000000000.0,,3,2,jewelry,3,3,105263158,9526316000.0
2,2,91315790000.0,,1,2,mobile,2,1,52631579,78947370.0
3,1,,,1,1,mobile,1,1,0,0.0
4,1,,,1,1,office equipment,1,1,105263158,105263200.0
5,172,450928000000.0,34087720000.0,50,105,office equipment,65,26,0,631578900.0
6,3,2864079000000.0,,1,1,sporting goods,1,1,52631578,421052600.0
7,859,15887440000.0,2538012000.0,140,366,mobile,202,44,0,947368400.0
8,34,400594900000.0,231140400000.0,32,12,jewelry,23,7,0,10763160000.0
9,179,76383500000.0,17953220000.0,59,83,sporting goods,61,7,0,368421100.0


In [54]:
# Fill missing test features with per-column medians computed over the rows
# that have no missing value at all.
# numeric_only=True keeps the text column 'merchandise' out of the median:
# older pandas skipped it silently, newer pandas raises on it.
complete_row_medians = X_test.dropna().median(numeric_only=True)
X_test = X_test.fillna(complete_row_medians)
X_test.head(5)

Unnamed: 0,number of bids,avg arrival interval,min interval,number of auctions,number of urls,merchandise,number of devices,number of countries,min reaction time,median reaction time
0,46,1700062000000.0,110865500000.0,15,29,sporting goods,35,4,0,4578947000.0
1,3,36873000000000.0,15216370000.0,3,2,jewelry,3,3,105263158,9526316000.0
2,2,91315790000.0,15216370000.0,1,2,mobile,2,1,52631579,78947370.0
3,1,206877400000.0,15216370000.0,1,1,mobile,1,1,0,0.0
4,1,206877400000.0,15216370000.0,1,1,office equipment,1,1,105263158,105263200.0


In [55]:
# One-hot encode 'merchandise' for the test frame the same way as for the
# training frame: 'merch_<category>' indicator columns appended after the
# remaining feature columns.
merch_dummies_test = pd.get_dummies(X_test['merchandise'], prefix='merch')
X_test_final = pd.concat(
    [X_test.drop('merchandise', axis=1), merch_dummies_test],
    axis=1)
X_test_final.head()

Unnamed: 0,number of bids,avg arrival interval,min interval,number of auctions,number of urls,number of devices,number of countries,min reaction time,median reaction time,merch_books and music,merch_clothing,merch_computers,merch_furniture,merch_home goods,merch_jewelry,merch_mobile,merch_office equipment,merch_sporting goods
0,46,1700062000000.0,110865500000.0,15,29,35,4,0,4578947000.0,0,0,0,0,0,0,0,0,1
1,3,36873000000000.0,15216370000.0,3,2,3,3,105263158,9526316000.0,0,0,0,0,0,1,0,0,0
2,2,91315790000.0,15216370000.0,1,2,2,1,52631579,78947370.0,0,0,0,0,0,0,1,0,0
3,1,206877400000.0,15216370000.0,1,1,1,1,0,0.0,0,0,0,0,0,0,1,0,0
4,1,206877400000.0,15216370000.0,1,1,1,1,105263158,105263200.0,0,0,0,0,0,0,0,1,0


In [67]:
# Train and test were one-hot encoded independently, so a merchandise category
# that occurs in only one of them (e.g. 'merch_auto parts', seen only in the
# training frame) leaves the two matrices with different columns.
print(X_train_final.columns)
print(X_test_final.columns)

# Align the training matrix to the test matrix's columns and column order.
# Columns absent from the test set are dropped; columns absent from the
# training set are added as all-zero indicators. Unlike a hard-coded drop,
# this cell can be re-run without raising.
X_train_final = X_train_final.reindex(columns=X_test_final.columns, fill_value=0)
X_train_final.shape

Index([u'number of bids', u'avg arrival interval', u'min interval', u'number of auctions', u'number of urls', u'number of devices', u'number of countries', u'min reaction time', u'median reaction time', u'merch_books and music', u'merch_clothing', u'merch_computers', u'merch_furniture', u'merch_home goods', u'merch_jewelry', u'merch_mobile', u'merch_office equipment', u'merch_sporting goods'], dtype='object')
Index([u'number of bids', u'avg arrival interval', u'min interval', u'number of auctions', u'number of urls', u'number of devices', u'number of countries', u'min reaction time', u'median reaction time', u'merch_books and music', u'merch_clothing', u'merch_computers', u'merch_furniture', u'merch_home goods', u'merch_jewelry', u'merch_mobile', u'merch_office equipment', u'merch_sporting goods'], dtype='object')


(1162, 18)

In [77]:
# Fit each forest on the training matrix and average, across forests, the
# probability in column 1 of predict_proba (the second entry of classes_;
# presumably label 1 -- TODO confirm).

predictions = np.zeros(X_test_final.shape[0])
for k in range(nmodels):
    forest = models[k].fit(X_train_final, y_train)
    predictions += forest.predict_proba(X_test_final)[:, 1]

predictions = predictions / float(nmodels)

predictions.shape

(4630,)

In [80]:
# Bidder ids in test_data row order. The test feature matrix is derived from
# the same rows without dropping any, so ids and predictions pair up by
# position.
ids = test_data.get('bidder_id')
ids.shape

(4630,)

In [86]:
# Write one "<bidder_id>,<probability>" line per test bidder.
# The with-block closes the file even if a write fails part-way. Binary mode
# is kept so '\n' is written untranslated (this notebook is Python 2).
# ids and predictions are paired by position, which does not depend on the
# Series index labels being 0..n-1.
with open('prediction.csv', 'wb') as f:
    f.write('id,prediction\n')
    for bidder_id, prob in zip(ids, predictions):
        f.write(str(bidder_id) + ',' + str(prob) + '\n')

In [87]:
# Read prediction.csv back in and show its first rows as a quick check of the
# file's contents.
pred_data = pd.read_csv('prediction.csv')
pred_data.head()

Unnamed: 0,id,prediction
0,7d804e4c5a1da2ad391ab5f67a9db4f5jdlj6,0.0
1,c686cbe1d937f81c2b50ec8bd67b9b843hglg,0.028
2,560eeb674015e1284864fbdf0d0b10bbchfvp,0.22375
3,08078a2526666deea62ca7c065bf22c4h7i8s,0.22175
4,8d4cc3988c9cb3b902c07cdd1a68fd4f36cit,0.15025


In [93]:
# Lookup table: bidder id (as str) -> averaged predicted probability.
idToPred = {str(ids[k]): predictions[k] for k in range(ids.shape[0])}

len(idToPred)

4630

In [103]:
#final output

import csv

# Emit one "<bidder_id>,<prediction>" row for every bidder listed in test.csv,
# in that file's order. Bidders that have no entry in idToPred get 0.0.
# Both files are opened in a with-block so they are closed even if a row
# fails to process. The header is written even when test.csv has no rows.
cnt = 0
with open('test.csv', 'rU') as fin, open('final_prediction.csv', 'wb') as fout:
    rdr = csv.reader(fin, delimiter=',')
    next(rdr, None)  # skip the header row of test.csv
    fout.write('bidder_id,prediction\n')
    for row in rdr:
        if not row:
            # Blank line: there is no bidder id in column 0 to report.
            continue
        cnt += 1
        bidder_id = row[0]
        # dict.get with a default replaces has_key() plus a second lookup.
        pred = idToPred.get(bidder_id, 0.0)
        fout.write(bidder_id + ',' + str(pred) + '\n')

print(cnt)

4700
