In [1]:
import pandas as pd
from numpy import log

import plotly.express as px

from skfeature.function.similarity_based import fisher_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
import statistics

In [2]:
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
bids_df = pd.read_csv('bids.csv')

In [3]:
bids_df.isnull().sum()

bid_id            0
bidder_id         0
auction           0
merchandise       0
device            0
time              0
country        8859
ip                0
url               0
dtype: int64

In [4]:
tmp_df = train_df.merge(bids_df, how='left')
bot_ip = tmp_df[tmp_df['outcome'] == 1.0]['ip'].unique()
lst = bot_ip.tolist()
ip_dct = {ip:1 for ip in lst}

In [5]:
def check_ip(ip_list):
    counter = 0
    l = len(ip_list)
    for ip in ip_list:
        if ip in ip_dct:
            counter += 1
    return counter/l

all_bidder_ip = bids_df.groupby('bidder_id')['ip'].unique().reset_index()
all_bidder_ip['prop_bot_ip'] = all_bidder_ip['ip'].apply(check_ip)
all_bidder_ip.head()

Unnamed: 0,bidder_id,ip,prop_bot_ip
0,001068c415025a009fee375a12cff4fcnht8y,[139.226.147.115],1.0
1,002d229ffb247009810828f648afc2ef593rb,[37.40.254.131],0.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,[21.67.17.162],1.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,"[44.241.8.179, 190.88.89.83, 115.47.140.180]",1.0
4,00486a11dff552c4bd7696265724ff81yeo9v,"[143.118.40.162, 54.212.177.220, 236.63.15.129...",0.6


In [6]:
bids_uniq_df = bids_df.groupby('bidder_id').nunique()
bids_uniq_df.reset_index(inplace=True)
bids_uniq_df['bids_per_url'] = bids_uniq_df['bid_id']/bids_uniq_df['url']
bids_uniq_df['bids_per_auction'] = bids_uniq_df['bid_id']/bids_uniq_df['auction']
bids_uniq_df.head()

Unnamed: 0,bidder_id,bid_id,auction,merchandise,device,time,country,ip,url,bids_per_url,bids_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1,1,1,1,1,1,1,1,1.0,1.0
1,002d229ffb247009810828f648afc2ef593rb,2,1,1,2,2,1,1,1,2.0,2.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1,1,1,1,1,1,1,1.0,1.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,3,1,3,3,1,3,2,1.5,1.0
4,00486a11dff552c4bd7696265724ff81yeo9v,20,13,1,8,20,1,10,7,2.857143,1.538462


In [1]:
tmp = train_df.merge(bids_uniq_df, how='left')
merged_df = tmp.merge(all_bidder_ip, left_on='bidder_id', right_on='bidder_id', how='left')
y = merged_df['outcome']
cols = merged_df[['prop_bot_ip', 'bid_id', 'auction', 'device', 'ip_y', 'url', 'bids_per_url', 'bids_per_auction']]
cols = cols.rename(columns={
    'bid_id':'total_bids',
    'auction':'total_auction',
    'device':'total_device',
    'ip_y':'total_ip',
    'url':'total_url',
})
X = cols
X

NameError: name 'train_df' is not defined

In [9]:
new_test_df = test_df.merge(bids_uniq_df, how='left')
new_test_df = new_test_df.merge(all_bidder_ip, left_on='bidder_id', right_on='bidder_id', how='left')
X_test = new_test_df[['prop_bot_ip', 'bid_id', 'auction', 'device', 'ip_x', 'url', 'bids_per_url', 'bids_per_auction']]
X_test = X_test.rename(columns={
    'bid_id':'total_bids',
    'auction':'total_auction',
    'device':'total_device',
    'ip_x':'total_ip',
    'url':'total_url',
})
X_test.fillna(0, inplace=True)
print(X_test.shape)

(4700, 13)
(4700, 15)
(4700, 8)
(4700, 8)
(4700, 8)


In [10]:
print(X_test.shape)
print(X.shape)
print(y.shape)

(4700, 8)
(1984, 8)
(1984,)


In [11]:
rfc_model = RandomForestClassifier(
    random_state=5,
    n_estimators=800,
    max_depth=15,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features='sqrt',
    max_leaf_nodes=31,
)
rfc_final = rfc_model.fit(X, y)

In [12]:
cbc_model = CatBoostClassifier(
    random_seed=5,
    learning_rate=0.01,
    n_estimators=1000
)
cbc_final = cbc_model.fit(X, y, verbose=False)

In [13]:
gbc_model = GradientBoostingClassifier(
    learning_rate=0.01,
    random_state=5,
    n_estimators=1000,
    min_samples_split=2,
    max_features='sqrt',
    max_leaf_nodes=13
)
gbc_final = gbc_model.fit(X, y)

In [15]:
final_estimators = [('rfc', rfc_final),('cbc', cbc_final),('gbc', gbc_final)]
final_ensemble = VotingClassifier(final_estimators, voting='soft')
final_ensemble_model = final_ensemble.fit(X, y)

ensemble_answer = final_ensemble_model.predict_proba(X_test)[:,1]
output = pd.DataFrame({
    'bidder_id':test_df.bidder_id,
    'prediction':ensemble_answer
})
output.to_csv('test_submission.csv',index=False)

0:	learn: 0.6676917	total: 5.06ms	remaining: 5.06s
1:	learn: 0.6418072	total: 8.91ms	remaining: 4.44s
2:	learn: 0.6173496	total: 12.6ms	remaining: 4.19s
3:	learn: 0.5922983	total: 15.9ms	remaining: 3.95s
4:	learn: 0.5690949	total: 19.3ms	remaining: 3.83s
5:	learn: 0.5457354	total: 22.5ms	remaining: 3.73s
6:	learn: 0.5244417	total: 25.9ms	remaining: 3.68s
7:	learn: 0.5051577	total: 29.3ms	remaining: 3.63s
8:	learn: 0.4848228	total: 32.3ms	remaining: 3.56s
9:	learn: 0.4659876	total: 36.1ms	remaining: 3.57s
10:	learn: 0.4484847	total: 39.7ms	remaining: 3.57s
11:	learn: 0.4308875	total: 43.4ms	remaining: 3.57s
12:	learn: 0.4130641	total: 46ms	remaining: 3.49s
13:	learn: 0.3975475	total: 48.8ms	remaining: 3.44s
14:	learn: 0.3822375	total: 51.5ms	remaining: 3.38s
15:	learn: 0.3669899	total: 54.3ms	remaining: 3.34s
16:	learn: 0.3530489	total: 57.6ms	remaining: 3.33s
17:	learn: 0.3393202	total: 61.4ms	remaining: 3.35s
18:	learn: 0.3257151	total: 64ms	remaining: 3.3s
19:	learn: 0.3139740	total:

190:	learn: 0.0255484	total: 573ms	remaining: 2.42s
191:	learn: 0.0254405	total: 577ms	remaining: 2.43s
192:	learn: 0.0254154	total: 580ms	remaining: 2.42s
193:	learn: 0.0253808	total: 583ms	remaining: 2.42s
194:	learn: 0.0252592	total: 586ms	remaining: 2.42s
195:	learn: 0.0251941	total: 589ms	remaining: 2.41s
196:	learn: 0.0251248	total: 591ms	remaining: 2.41s
197:	learn: 0.0251053	total: 594ms	remaining: 2.41s
198:	learn: 0.0250082	total: 597ms	remaining: 2.4s
199:	learn: 0.0249323	total: 601ms	remaining: 2.4s
200:	learn: 0.0248815	total: 605ms	remaining: 2.4s
201:	learn: 0.0247863	total: 608ms	remaining: 2.4s
202:	learn: 0.0247244	total: 612ms	remaining: 2.4s
203:	learn: 0.0246791	total: 616ms	remaining: 2.4s
204:	learn: 0.0246247	total: 619ms	remaining: 2.4s
205:	learn: 0.0245605	total: 623ms	remaining: 2.4s
206:	learn: 0.0245309	total: 626ms	remaining: 2.4s
207:	learn: 0.0244422	total: 629ms	remaining: 2.39s
208:	learn: 0.0244208	total: 631ms	remaining: 2.39s
209:	learn: 0.0243683

356:	learn: 0.0199632	total: 1.09s	remaining: 1.97s
357:	learn: 0.0199523	total: 1.1s	remaining: 1.97s
358:	learn: 0.0199437	total: 1.1s	remaining: 1.96s
359:	learn: 0.0199265	total: 1.1s	remaining: 1.96s
360:	learn: 0.0199124	total: 1.1s	remaining: 1.95s
361:	learn: 0.0198972	total: 1.1s	remaining: 1.95s
362:	learn: 0.0198949	total: 1.11s	remaining: 1.94s
363:	learn: 0.0198877	total: 1.11s	remaining: 1.94s
364:	learn: 0.0198818	total: 1.12s	remaining: 1.94s
365:	learn: 0.0198687	total: 1.12s	remaining: 1.94s
366:	learn: 0.0198458	total: 1.12s	remaining: 1.94s
367:	learn: 0.0198327	total: 1.13s	remaining: 1.93s
368:	learn: 0.0198191	total: 1.13s	remaining: 1.93s
369:	learn: 0.0198044	total: 1.13s	remaining: 1.93s
370:	learn: 0.0197273	total: 1.14s	remaining: 1.93s
371:	learn: 0.0197246	total: 1.14s	remaining: 1.92s
372:	learn: 0.0197226	total: 1.14s	remaining: 1.92s
373:	learn: 0.0196712	total: 1.14s	remaining: 1.92s
374:	learn: 0.0196645	total: 1.15s	remaining: 1.91s
375:	learn: 0.019

550:	learn: 0.0168341	total: 1.66s	remaining: 1.35s
551:	learn: 0.0168243	total: 1.66s	remaining: 1.35s
552:	learn: 0.0168215	total: 1.67s	remaining: 1.35s
553:	learn: 0.0168014	total: 1.67s	remaining: 1.34s
554:	learn: 0.0167954	total: 1.67s	remaining: 1.34s
555:	learn: 0.0167860	total: 1.67s	remaining: 1.34s
556:	learn: 0.0167851	total: 1.68s	remaining: 1.33s
557:	learn: 0.0167818	total: 1.68s	remaining: 1.33s
558:	learn: 0.0167617	total: 1.68s	remaining: 1.33s
559:	learn: 0.0167506	total: 1.69s	remaining: 1.32s
560:	learn: 0.0167360	total: 1.69s	remaining: 1.32s
561:	learn: 0.0167309	total: 1.69s	remaining: 1.32s
562:	learn: 0.0167217	total: 1.7s	remaining: 1.32s
563:	learn: 0.0167072	total: 1.7s	remaining: 1.31s
564:	learn: 0.0166915	total: 1.7s	remaining: 1.31s
565:	learn: 0.0166905	total: 1.71s	remaining: 1.31s
566:	learn: 0.0166760	total: 1.71s	remaining: 1.3s
567:	learn: 0.0166722	total: 1.71s	remaining: 1.3s
568:	learn: 0.0166645	total: 1.71s	remaining: 1.29s
569:	learn: 0.016

754:	learn: 0.0149614	total: 2.23s	remaining: 723ms
755:	learn: 0.0149587	total: 2.23s	remaining: 720ms
756:	learn: 0.0149479	total: 2.23s	remaining: 717ms
757:	learn: 0.0149417	total: 2.23s	remaining: 714ms
758:	learn: 0.0149357	total: 2.24s	remaining: 711ms
759:	learn: 0.0149248	total: 2.24s	remaining: 708ms
760:	learn: 0.0149217	total: 2.25s	remaining: 706ms
761:	learn: 0.0149122	total: 2.25s	remaining: 703ms
762:	learn: 0.0149066	total: 2.25s	remaining: 700ms
763:	learn: 0.0149024	total: 2.26s	remaining: 697ms
764:	learn: 0.0148847	total: 2.26s	remaining: 694ms
765:	learn: 0.0148794	total: 2.26s	remaining: 692ms
766:	learn: 0.0148734	total: 2.27s	remaining: 689ms
767:	learn: 0.0148664	total: 2.27s	remaining: 685ms
768:	learn: 0.0148654	total: 2.27s	remaining: 682ms
769:	learn: 0.0148648	total: 2.27s	remaining: 679ms
770:	learn: 0.0148518	total: 2.27s	remaining: 676ms
771:	learn: 0.0148484	total: 2.28s	remaining: 673ms
772:	learn: 0.0148464	total: 2.28s	remaining: 670ms
773:	learn: 

968:	learn: 0.0137765	total: 2.8s	remaining: 89.5ms
969:	learn: 0.0137728	total: 2.8s	remaining: 86.6ms
970:	learn: 0.0137693	total: 2.8s	remaining: 83.7ms
971:	learn: 0.0137665	total: 2.81s	remaining: 80.8ms
972:	learn: 0.0137662	total: 2.81s	remaining: 78ms
973:	learn: 0.0137621	total: 2.81s	remaining: 75.1ms
974:	learn: 0.0137583	total: 2.81s	remaining: 72.2ms
975:	learn: 0.0137569	total: 2.82s	remaining: 69.3ms
976:	learn: 0.0137566	total: 2.82s	remaining: 66.4ms
977:	learn: 0.0137563	total: 2.82s	remaining: 63.5ms
978:	learn: 0.0137554	total: 2.83s	remaining: 60.7ms
979:	learn: 0.0137547	total: 2.83s	remaining: 57.8ms
980:	learn: 0.0137543	total: 2.83s	remaining: 54.9ms
981:	learn: 0.0137540	total: 2.84s	remaining: 52ms
982:	learn: 0.0137509	total: 2.84s	remaining: 49.1ms
983:	learn: 0.0137507	total: 2.84s	remaining: 46.2ms
984:	learn: 0.0137454	total: 2.84s	remaining: 43.3ms
985:	learn: 0.0137329	total: 2.85s	remaining: 40.4ms
986:	learn: 0.0137325	total: 2.85s	remaining: 37.5ms
