Import Dataset
===
***
Human or Robot Dataset (Facebook Recruitment IV) from Kaggle: <br>
https://www.kaggle.com/c/facebook-recruiting-iv-human-or-bot

In [1]:
import pandas as pd

bids_df = pd.read_csv('data/bids.csv')
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

## Visualise data from datasets

Using <code>.shape</code> and <code>.head</code>

In [2]:
print(train_df.shape)
print(test_df.shape)
print(bids_df.shape)

(2013, 4)
(4700, 3)
(7656334, 9)


In [3]:
train_df.head()

Unnamed: 0,bidder_id,payment_account,address,outcome
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0
1,624f258b49e77713fc34034560f93fb3hu3jo,a3d2de7675556553a5f08e4c88d2c228v1sga,ae87054e5a97a8f840a3991d12611fdcrfbq3,0.0
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,a3d2de7675556553a5f08e4c88d2c2280cybl,92520288b50f03907041887884ba49c0cl0pd,0.0
3,4bee9aba2abda51bf43d639013d6efe12iycd,51d80e233f7b6a7dfdee484a3c120f3b2ita8,4cb9717c8ad7e88a9a284989dd79b98dbevyi,0.0
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,a3d2de7675556553a5f08e4c88d2c22857ddh,2a96c3ce94b3be921e0296097b88b56a7x1ji,0.0


In [4]:
test_df.head()

Unnamed: 0,bidder_id,payment_account,address
0,49bb5a3c944b8fc337981cc7a9ccae41u31d7,a3d2de7675556553a5f08e4c88d2c228htx90,5d9fa1b71f992e7c7a106ce4b07a0a754le7c
1,a921612b85a1494456e74c09393ccb65ylp4y,a3d2de7675556553a5f08e4c88d2c228rs17i,a3d2de7675556553a5f08e4c88d2c228klidn
2,6b601e72a4d264dab9ace9d7b229b47479v6i,925381cce086b8cc9594eee1c77edf665zjpl,a3d2de7675556553a5f08e4c88d2c228aght0
3,eaf0ed0afc9689779417274b4791726cn5udi,a3d2de7675556553a5f08e4c88d2c228nclv5,b5714de1fd69d4a0d2e39d59e53fe9e15vwat
4,cdecd8d02ed8c6037e38042c7745f688mx5sf,a3d2de7675556553a5f08e4c88d2c228dtdkd,c3b363a3c3b838d58c85acf0fc9964cb4pnfa


In [5]:
bids_df.head()

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,home goods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3
3,3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,jefix,jewelry,phone4,9759243157894736,in,18.99.175.133,vasstdc27m7nks3
4,4,8393c48eaf4b8fa96886edc7cf27b372dsibi,jefix,jewelry,phone5,9759243157894736,in,145.138.5.37,vasstdc27m7nks3


2.Feature Engineering
===
***

In [6]:
import numpy as np

bidder_counts = bids_df.groupby("bidder_id")['time'].count().reset_index()
bidder_counts = bidder_counts.rename(columns={'time':'num_bids'})
bidder_counts['log_num_bids'] = (bidder_counts['num_bids']+1).transform(np.log)
bidder_counts

Unnamed: 0,bidder_id,num_bids,log_num_bids
0,001068c415025a009fee375a12cff4fcnht8y,1,0.693147
1,002d229ffb247009810828f648afc2ef593rb,2,1.098612
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,0.693147
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1.386294
4,00486a11dff552c4bd7696265724ff81yeo9v,20,3.044522
...,...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,25075,10.129666
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,22,3.135494
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1,0.693147
6612,ffd62646d600b759a985d45918bd6f0431vmz,664,6.499787


In [7]:
from statistics import mean , median 
from scipy.stats import entropy

entropy_ips_per_auc = bids_df.groupby("bidder_id")

def ips_entropy_auc(group):

    auc_dict = {}
    ip_list = []
    for row_index, row in group.iterrows():
        ip = row['ip']
        auction = row['auction']
        
        if ip not in ip_list:
            ip_list.append(ip)
        
        if auction not in auc_dict:
            auc_dict[auction] = [ip]
        elif ip not in auc_dict[auction]:
            auc_dict[auction] += [ip]
    
    ip_count_per_auc = []
    for ips in auc_dict.values():
        ip_count_per_auc.append(len(ips))
    
    ip_percent_list = []
    
    for ip_count in ip_count_per_auc:
        
        percent = ip_count/len(ip_list)   
        ip_percent_list.append(percent)
    
#     print(ip_percent_list)
    bid_entropy = entropy(ip_percent_list)

    return bid_entropy

# def time_diff_func(group):
    
#     for index in group.time:
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

ips_entropy_per_auc = entropy_ips_per_auc.apply(ips_entropy_auc)

In [8]:
ips_entropy_per_auc = pd.DataFrame(ips_entropy_per_auc)
ips_entropy_per_auc = ips_entropy_per_auc.rename(columns={0:'ips_entropy_per_auc'})
ips_entropy_per_auc = ips_entropy_per_auc.reset_index()
ips_entropy_per_auc['log_ips_entropy_per_auc'] = (ips_entropy_per_auc['ips_entropy_per_auc']+1).transform(np.log)
ips_entropy_per_auc

Unnamed: 0,bidder_id,ips_entropy_per_auc,log_ips_entropy_per_auc
0,001068c415025a009fee375a12cff4fcnht8y,0.000000,0.000000
1,002d229ffb247009810828f648afc2ef593rb,0.000000,0.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,0.000000,0.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.098612,0.741276
4,00486a11dff552c4bd7696265724ff81yeo9v,2.484367,1.248286
...,...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,4.310895,1.669760
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,2.622996,1.287301
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,0.000000,0.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,3.556497,1.516554


In [9]:
from statistics import mean , median 
from scipy.stats import entropy

entropy_auc = bids_df.groupby("bidder_id")

def bid_entropy_per_auc(group):

    auc_dict = {}
    total_bids = 0
    for row_index, row in group.iterrows():
        total_bids += 1
        auction = row['auction']
        
        if auction not in auc_dict:
            auc_dict[auction] = 1
        else:
            auc_dict[auction] += 1
            
    auction_percent_list = []
    
    for auc_bids in auc_dict.values():
        
        percent = auc_bids/total_bids   
        auction_percent_list.append(percent)
    
#     print(ip_percent_list)
    bid_entropy = entropy(auction_percent_list)

    return bid_entropy

# def time_diff_func(group):
    
#     for index in group.time:
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

auc_entropy = entropy_auc.apply(bid_entropy_per_auc)

In [10]:
bids_entropy_auc = pd.DataFrame(auc_entropy)
bids_entropy_auc = bids_entropy_auc.rename(columns={0:'bids_entropy_auc'})
bids_entropy_auc = bids_entropy_auc.reset_index()
bids_entropy_auc['log_bids_entropy_auc'] = (bids_entropy_auc['bids_entropy_auc']+1).transform(np.log)


In [11]:
from statistics import mean , median 
from scipy.stats import entropy

entropy_country = bids_df.groupby("bidder_id")

def bid_entropy_per_country(group):

    coun_dict = {}
    total_bids = 0
    for row_index, row in group.iterrows():
        total_bids += 1
        country = row['country']
        
        if country not in coun_dict:
            coun_dict[country] = 1
        else:
            coun_dict[country] += 1
            
    country_percent_list = []
    
    for url_bids in coun_dict.values():
        
        percent = url_bids/total_bids   
        country_percent_list.append(percent)
    
#     print(ip_percent_list)
    bid_entropy = entropy(country_percent_list)

    return bid_entropy

# def time_diff_func(group):
    
#     for index in group.time:
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

grace = entropy_country.apply(bid_entropy_per_country)

In [12]:
bids_entropy_coun = pd.DataFrame(grace)
bids_entropy_coun = bids_entropy_coun.rename(columns={0:'bids_entropy_country'})
bids_entropy_coun = bids_entropy_coun.reset_index()
bids_entropy_coun_diff = train_df[['bidder_id','outcome']].merge(bids_entropy_coun,on = 'bidder_id',how='left' )
bids_entropy_coun_diff

Unnamed: 0,bidder_id,outcome,bids_entropy_country
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,1.337882
1,624f258b49e77713fc34034560f93fb3hu3jo,0.0,0.000000
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0,0.000000
3,4bee9aba2abda51bf43d639013d6efe12iycd,0.0,0.000000
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0,0.038969
...,...,...,...
2008,369515b3af4f8ca582f90271d30b14b6r52aw,0.0,0.774507
2009,f939c17ffc7c39ac9b35b69e5e75179fv9pe2,0.0,0.000000
2010,c806dbb2decba0ed3c4ff5e2e60a74c2wjvbl,0.0,0.000000
2011,0381a69b7a061e9ace2798fd48f1f537mgq57,0.0,0.000000


In [13]:
bids_entropy_coun['log_bids_entropy_country'] = (bids_entropy_coun['bids_entropy_country']+1).transform(np.log)


In [14]:
bids_entropy_coun_diff[same_response_diff['outcome'] == 1].mean()

NameError: name 'same_response_diff' is not defined

In [None]:
from statistics import mean , median 
from scipy.stats import entropy

entropy_url = bids_df.groupby("bidder_id")

def bid_entropy_per_ip(group):

    url_dict = {}
    total_bids = 0
    for row_index, row in group.iterrows():
        total_bids += 1
        url = row['url']
        
        if url not in url_dict:
            url_dict[url] = 1
        else:
            url_dict[url] += 1
            
            
    url_percent_list = []
    
    for url_bids in url_dict.values():
        
        percent = url_bids/total_bids   
        url_percent_list.append(percent)
    
#     print(ip_percent_list)
    bid_entropy = entropy(url_percent_list)

    return bid_entropy

# def time_diff_func(group):
    
#     for index in group.time:
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

hope = entropy_url.apply(bid_entropy_per_ip)
hope

In [None]:
bids_entropy_url = pd.DataFrame(hope)
bids_entropy_url = bids_entropy_url.rename(columns={0:'bids_entropy_per_url'})
bids_entropy_url = bids_entropy_url.reset_index()
bids_entropy_url_diff = train_df[['bidder_id','outcome']].merge(bids_entropy_url,on = 'bidder_id',how='left' )
bids_entropy_url_diff

In [None]:
bids_entropy_url['log_bids_entropy_url'] = (bids_entropy_url['bids_entropy_per_url']+1).transform(np.log)


In [None]:
from statistics import mean , median 
from scipy.stats import entropy

entropy_ip = bids_df.groupby("bidder_id")

def bid_entropy_per_url(group):

    ip_dict = {}
    total_bids = 0
    for row_index, row in group.iterrows():
        total_bids += 1
        ip = row['ip']
        
        if ip not in ip_dict:
            ip_dict[ip] = 1
        else:
            ip_dict[ip] += 1
            
            
    ip_percent_list = []
    
    for ip_bids in ip_dict.values():
        
        percent = ip_bids/total_bids   
        ip_percent_list.append(percent)
    
#     print(ip_percent_list)
    bid_entropy = entropy(ip_percent_list)

    return bid_entropy

# def time_diff_func(group):
    
#     for index in group.time:
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

bless = entropy_ip.apply(bid_entropy_per_url)
bless

In [None]:
bids_entropy_ip = pd.DataFrame(bless)
bids_entropy_ip = bids_entropy_ip.rename(columns={0:'bids_entropy_per_ip'})
bids_entropy_ip = bids_entropy_ip.reset_index()
bids_entropy_ip

In [None]:
bids_entropy_ip['log_bids_entropy_ip'] = (bids_entropy_ip['bids_entropy_per_ip']+1).transform(np.log)


In [None]:
bids_entropy_ip_diff = train_df[['bidder_id','outcome']].merge(bids_entropy_ip_diff,on = 'bidder_id',how='left' )
bids_entropy_ip_diff

In [None]:
instant_response_counts = bids_df.groupby("bidder_id")

def func(group):
    mode = (group.time).mode
    time_dict = {}
    for time in group.time:
        if time not in time_dict:
            time_dict[time] = 1
        else:
            time_dict[time] += 1
    return max(time_dict.values())
    

In [None]:
instant_response = instant_response_counts.apply(func)

In [None]:
same_response_counts = pd.DataFrame(instant_response)
same_response_counts = same_response_counts.rename(columns={0:'num_instant_counts'})
same_response_counts = same_response_counts.reset_index()
same_response_counts

In [None]:
same_response_counts['log_counts'] = (same_response_counts['num_instant_counts']+1).transform(np.log)
same_response_counts

In [None]:
same_response_diff = train_df[['bidder_id','outcome']].merge(same_response_counts,on = 'bidder_id',how='left' )
same_response_diff[same_response_diff['outcome'] == 0].mean()

In [None]:
same_response_diff[same_response_diff['outcome'] == 1].mean()

In [None]:
from statistics import mean , median 

response_difference_per_auc = bids_df.groupby("bidder_id")



def response_difference(group):

    time_dict = {}
    for row_index, row in group.iterrows():
        auction = row['auction']
        time = row['time']
        
        if auction not in time_dict:
            time_dict[auction] = [time]
        elif time not in time_dict[auction]:
            time_dict[auction] += [time]
            
            
    difference_list = []
    
    for times in time_dict.values():
        
        diffs = []
        
        if len(times) == 1:
            diffs.append(0)
        
        for index in range(0,len(times)-1):
            diff = times[index + 1] - times[index]
            diffs.append(diff)
         
        difference_list.append(mean(diffs))

    return median(difference_list)

# def time_diff_func(group):
    
#     for index in group.time:
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

res = response_difference_per_auc.apply(response_difference)
res

In [None]:
response_difference = pd.DataFrame(res)
response_difference = response_difference.rename(columns={0:'response_difference'})
response_difference = response_difference.reset_index()
response_difference

In [None]:
def to_ones(value):
    if value == 0:
        return 1
    else:
        return value

In [None]:
response_difference['changed_Os_to_1s'] = response_difference['response_difference'].apply(to_ones)
response_difference

In [None]:
response_difference['log_difference'] = (response_difference['changed_Os_to_1s']+1).transform(np.log)
response_difference

In [None]:
bids_response_diff = train_df[['bidder_id','outcome']].merge(response_difference,on = 'bidder_id',how='left' )
bids_response_diff

In [None]:
bots_response_diff = bids_response_diff[bids_response_diff['outcome'] == 1]
bots_response_diff['response_difference'].mean()

In [None]:
humans_response_diff = bids_response_diff[bids_response_diff['outcome'] == 0]
humans_response_diff['response_difference'].mean()

In [None]:
response_difference['log_difference'] = (response_difference['response_difference']+1).transform(np.log)
response_difference['log_difference'].min()

In [None]:
change_url_per_auc = bids_df.groupby("bidder_id")

def change_url_auc(group):
    
    changes_dict = {}
    for row_index, row in group.iterrows():
        auction = row['auction']
        url = row['url']
        
        if auction not in changes_dict:
            changes_dict[auction] = [url]
        elif url not in changes_dict[auction]:
            changes_dict[auction] += [url]

    change_list = []
    for changes in changes_dict.values():
        change_list.append(len(changes))
        

    return mean(change_list)
                
        
        
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

url_change = change_url_per_auc.apply(change_url_auc)
url_change

In [None]:
change_url_freq = pd.DataFrame(url_change)
change_url_freq = change_url_freq.rename(columns={0:'change_url_freq'})
change_url_freq = change_url_freq.reset_index()
change_url_freq['change_url_freq'].max()

In [None]:
change_url_freq['log_change_url_freq'] = (change_url_freq['change_url_freq']+1).transform(np.log)
change_url_freq

In [None]:
change_ip_per_auc = bids_df.groupby("bidder_id")

def change_url_auc(group):
    
    changes_dict = {}
    for row_index, row in group.iterrows():
        auction = row['auction']
        ip = row['ip']
        
        if auction not in changes_dict:
            changes_dict[auction] = [ip]
        elif ip not in changes_dict[auction]:
            changes_dict[auction] += [ip]

    change_list = []
    for changes in changes_dict.values():
        change_list.append(len(changes))
        

    return mean(change_list)
                
        
        
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

ip_change = change_ip_per_auc.apply(change_url_auc)
ip_change

In [None]:
change_ip_freq = pd.DataFrame(ip_change)
change_ip_freq = change_ip_freq.rename(columns={0:'change_ip_freq'})
change_ip_freq = change_ip_freq.reset_index()
change_ip_freq['log_change_ip_freq'] = (change_ip_freq['change_ip_freq']+1).transform(np.log)
change_ip_freq

In [None]:
countries_per_auc = bids_df.groupby("bidder_id")

def change_country_per_auc(group):
    
    changes_dict = {}
    for row_index, row in group.iterrows():
        auction = row['auction']
        country = row['country']
        
        if auction not in changes_dict:
            changes_dict[auction] = [country]
        elif country not in changes_dict[auction]:
            changes_dict[auction] += [country]
    
    change_list = []
    for changes in changes_dict.values():
        change_list.append(len(changes))
        

    return mean(change_list)
                
        
        
#     first_time = []
#     diffs = []
#     for time in group.time:
    
#         if len(first_time) == 0:
#             first_time.append(time)
#             print(first_time)
#         else:
#             diff = time - first_time[0]
#             diffs.append(diff)
#         first_time.clear()
#         first_time.append(time)
    
#     if len(diffs) == 0:
#         return 0
#     return median(diffs)

no = countries_per_auc.apply(change_country_per_auc)
no

In [None]:
countries_per_auc = pd.DataFrame(no)
countries_per_auc = countries_per_auc.rename(columns={0:'countries_per_auc'})
countries_per_auc = countries_per_auc.reset_index()
countries_per_auc['countries_per_auc'].max()

In [None]:
countries_per_auc['log_countries_per_auc'] = (countries_per_auc['countries_per_auc']+1).transform(np.log)
countries_per_auc

We combine the new data we generated with <code>train_df</code> using <code>merge()</code>. Thereafter, we replace the NAs with 0 using <code>.fillna()</code>

In [None]:
feature_set = train_df.merge(bidder_counts, on='bidder_id', how='left')  # Make sure NOT to use the default inner join (how='inner')
# print(feature_set.isnull().sum())

feature_set['num_bids'] = feature_set['num_bids'].fillna(0)  # fill nas with zeros
feature_set

In [None]:
from statistics import mean

bid_merch = bids_df.groupby("bidder_id")

def common_merch(group):
    merch_dict = {}
    for merch in group.merchandise:
        if merch not in merch_dict:
            merch_dict[merch] = 1
        else:
            merch_dict[merch] += 1
    return max(merch_dict, key = merch_dict.get)

common_merch = bid_merch.apply(common_merch)
common_merch

In [None]:
# common_merch = pd.DataFrame(common_merch)
# common_merch = common_merch.rename(columns={0:'common_merch'})
# common_merch = common_merch.reset_index()
# common_merch_OH_encoding = pd.get_dummies(common_merch['common_merch']).drop(columns='home goods')
# common_merch_OH_encoding_concat_df = pd.concat([common_merch, common_merch_OH_encoding], axis=1)
# common_merch_OH_encoding_concat_df

In [None]:
# common_merch_OH_encoding_concat_df = common_merch_OH_encoding_concat_df.drop(columns = 'common_merch')

In [None]:
# from statistics import mean

# bid_country = bids_df.groupby("bidder_id")

# def common_country(group):
#     country_dict = {}
#     for country in group.country:
#         if country not in country_dict:
#             country_dict[country] = 1
#         else:
#             country_dict[country] += 1
#     return max(country_dict, key = country_dict.get)

# common_country = bid_country.apply(common_country)
# common_country

In [None]:
# common_country = pd.DataFrame(common_country)
# common_country = common_country.rename(columns={0:'common_country'})
# common_country = common_country.reset_index()
# common_country_OH_encoding = pd.get_dummies(common_country['common_country']).drop(columns='in')
# common_country_OH_encoding_concat_df = pd.concat([common_country, common_country_OH_encoding], axis=1)
# common_country_OH_encoding_concat_df.drop(columns = 'common_country')
# common_country_OH_encoding_concat_df

In [None]:
# common_country_OH_encoding_concat_df

## 2b. Adding more feature columns with `.nunique()`

Using this method, we can further generate other potentially useful features for each bidder id. 

To demonstrate this, let's try something other than count, for e.g. <code>.nunique()</code>.

**Note: Depending on your computer RAM, the next cell of code may take up to few minutes to run.**

In [None]:
bidder_unique = bids_df.groupby("bidder_id").nunique()
# bidder_unique

In [None]:
bidder_unique = bidder_unique.drop(columns='bidder_id', errors='ignore').reset_index()
# bidder_unique

In [None]:
bidder_unique['url_per_auc'] = bidder_unique.url /  bidder_unique.auction
bidder_unique['log_url_per_auc'] = (bidder_unique['url_per_auc']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['ips_per_auc'] = bidder_unique.ip /  bidder_unique.auction
bidder_unique['log_ips_per_auc'] = (bidder_unique['ips_per_auc']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['dev_per_auc'] = bidder_unique.device /  bidder_unique.auction
bidder_unique['log_dev_per_auc'] = (bidder_unique['dev_per_auc']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['time_per_auc'] = bidder_unique.time /  bidder_unique.auction
bidder_unique['log_time_per_auc'] = (bidder_unique['time_per_auc']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['time_per_dev'] = bidder_unique.time /  bidder_unique.device
bidder_unique['log_bids_per_dev'] = (bidder_unique['time_per_dev']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['time_per_country'] = bidder_unique.time /  bidder_unique.country
bidder_unique['log_bids_per_country'] = (bidder_unique['time_per_country']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['ip_per_country'] = bidder_unique.ip /  bidder_unique.country
bidder_unique['log_bids_per_country'] = (bidder_unique['ip_per_country']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['url_per_country'] = bidder_unique.url /  bidder_unique.country
bidder_unique['log_bids_per_country'] = (bidder_unique['url_per_country']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['dev_per_country'] = bidder_unique.device /  bidder_unique.country
bidder_unique['log_bids_per_country'] = (bidder_unique['dev_per_country']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['time_per_ip'] = bidder_unique.time /  bidder_unique.ip
bidder_unique['log_bids_per_ip'] = (bidder_unique['time_per_ip']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['time_per_url'] = bidder_unique.time /  bidder_unique.url
bidder_unique['log_bids_per_url'] = (bidder_unique['time_per_url']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique = bidder_unique.merge(bidder_counts, on='bidder_id', how='left')
bidder_unique

In [None]:
bidder_unique['bid_per_auc'] = bidder_unique.num_bids /  bidder_unique.auction
bidder_unique['log_bid_per_auc'] = (bidder_unique['bid_per_auc']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['bids_per_country'] = bidder_unique.num_bids /  bidder_unique.country
bidder_unique['log_bids_per_country'] = (bidder_unique['bids_per_country']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['bids_per_ip'] = bidder_unique.num_bids /  bidder_unique.ip
bidder_unique['log_bids_per_ip'] = (bidder_unique['bids_per_ip']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['bids_per_url'] = bidder_unique.num_bids /  bidder_unique.url
bidder_unique['log_bids_per_ip'] = (bidder_unique['bids_per_url']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique['bids_per_dev'] = bidder_unique.num_bids /  bidder_unique.device
bidder_unique['log_bids_per_ip'] = (bidder_unique['bids_per_dev']+1).transform(np.log)
# bidder_unique

In [None]:
bidder_unique = bidder_unique.merge(same_response_counts, on='bidder_id', how='left')
# bidder_unique

In [None]:
bidder_unique = bidder_unique.merge(response_difference, on='bidder_id', how='left')
# bidder_unique

In [None]:
bidder_unique = bidder_unique.merge(change_url_freq, on='bidder_id', how='left')

In [None]:
bidder_unique = bidder_unique.merge(change_ip_freq, on='bidder_id', how='left')
# bidder_unique

In [None]:
bidder_unique = bidder_unique.merge(countries_per_auc, on='bidder_id', how='left')
# bidder_unique

In [None]:
# bidder_unique = bidder_unique.merge(common_merch_OH_encoding_concat_df, on='bidder_id', how='left')
# # bidder_unique

In [None]:
# feature_set = bidder_unique.merge(common_country_OH_encoding_concat_df, on='bidder_id', how='left')
# feature_set

In [None]:
bidder_unique = bidder_unique.merge(bids_entropy_ip, on='bidder_id', how='left')

In [None]:
bidder_unique = bidder_unique.merge(bids_entropy_url, on='bidder_id', how='left')

In [None]:
bidder_unique = bidder_unique.merge(bids_entropy_coun, on='bidder_id', how='left')

In [None]:
bidder_unique = bidder_unique.merge(bids_entropy_auc, on='bidder_id', how='left')

In [None]:
bidder_unique = bidder_unique.merge(ips_entropy_per_auc, on='bidder_id', how='left')

In [None]:
bidder_unique['log_url'] = (bidder_unique['url']+1).transform(np.log)
bidder_unique['log_auction'] = (bidder_unique['auction']+1).transform(np.log)
bidder_unique['log_device'] = (bidder_unique['device']+1).transform(np.log)
bidder_unique['log_time'] = (bidder_unique['time']+1).transform(np.log)
bidder_unique['log_country'] = (bidder_unique['country']+1).transform(np.log)
bidder_unique['log_ip'] = (bidder_unique['ip']+1).transform(np.log)

In [None]:
bidder_unique

Again, we combine the new data we generated with <code>train_df</code> using <code>merge()</code>.

In [None]:
train_set = train_df.merge(bidder_unique, on='bidder_id', how='left')
test_set = test_df.merge(bidder_unique, on='bidder_id', how='left')
train_set

In [None]:
train_set.replace([np.inf, -np.inf], np.nan, inplace=True)
test_set.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
checking_see_df = train_set[train_set['outcome'] == 1.0]
checking_see_df

#### test_set

In [None]:
train_set.columns

In [None]:
# feature_cols = train_set.drop(columns = ['outcome','bidder_id','payment_account','address','bid_id','changed_Os_to_1s'] )
# feature_cols =  list(feature_cols.columns)
# X = train_set[feature_cols]
# y = train_set['outcome']

# import pandas as pd
# import numpy as np
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# from scipy.stats import chi2_contingency

# bestfeatures = SelectKBest(score_func=chi2, k=10)
# fit = bestfeatures.fit(X,y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)
# #concat two dataframes for better visualization 
# featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# print(featureScores.nlargest(30,'Score'))  #print 10 best feat

# ['response_difference','ip','time','num_bids','bids_per_dev','bids_per_url','bids_per_country','bids_per_ip','url','auction']

In [None]:
train_set.isnull().sum()

In [None]:
train_set = train_set.fillna(train_set.mean())
test_set = test_set.fillna(test_set.mean())

In [None]:
train_set = train_set.interpolate(method="cubic")
test_set = test_set.interpolate(method="cubic")

In [None]:
train_set

In [None]:
test_set

In [None]:
train_set.isnull().sum()

In [None]:
import plotly.express as px

X_corr = X.corr()
fig = px.imshow(X_corr, color_continuous_scale='solar')
fig.show()

3.Model Training
===
***

## 3a. Selecting the feature columns

We select the features we want to use in predicting our outcome

In [None]:
# feature_cols = train_set.drop(columns = ['merchandise','outcome','bidder_id', 'payment_account', 'address','bid_id','log_bids_per_auc',
#  'log_url_per_auc',
#  'log_ips_per_auc',
#  'log_bids_per_dev',
#  'log_bids_per_ip','url_per_auc',
#  'log_bids_per_url','bids_per_url',
#  'log_num_bids','num_instant_counts','log_bids_per_country',
#  'log_difference','response_difference','changed_Os_to_1s','auction',
#  'device','log_counts',
#  'time',
#  'country',
#  'ip',
#  'url'] )

#BEST SCORE
# feature_cols = ['auction','device','time','country','ip','url', 
#                 'url_per_auc', 'bids_per_dev', 'bids_per_country', 
#                 'bids_per_ip', 'bids_per_url', 'num_bids', 
#                 'change_ip_freq', 'countries_per_auc',
# 'bids_entropy_per_ip',
#  'log_bids_entropy_ip',
#  'bids_entropy_per_url',
#  'log_bids_entropy_url',
#  'bids_entropy_country',
#  'log_bids_entropy_country']

feature_cols = importances_abv_avg

# feature_cols =  train_set.drop(columns = ['bid_id','merchandise','outcome','bidder_id', 'payment_account', 
#  'address','changed_Os_to_1s'
# ])


# # ['bidder_id', 'payment_account', 'address', 'outcome', 'bid_id',
# #        'auction', 'merchandise', 'device', 'time', 'country', 'ip', 'url',
# #        'bids_per_auc', 'url_per_auc', 'ips_per_auc', 'bids_per_dev',
# #        'bids_per_country', 'bids_per_ip', 'bids_per_url', 'common_merch',
# #        'auto parts', 'books and music', 'clothing', 'computers', 'furniture',
# #        'jewelry', 'mobile', 'office equipment', 'sporting goods',]

# feature_cols =  list(feature_cols.columns)

X = train_set[feature_cols]
y = train_set['outcome']

X_kaggle = test_set[feature_cols] # Answers to this are hidden, so you can't use these 4700 rows for model evaluation!

In [None]:
feature_cols

In [None]:
X_kaggle

In [None]:
rf_model = rf_model.fit(X,y)

In [None]:
import time
import numpy as np

start_time = time.time()
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

forest_importances = pd.Series(importances, index=feature_cols)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
importances_abv_avg = forest_importances[forest_importances > forest_importances.mean()]

In [None]:
importances_abv_avg = importances_abv_avg.reset_index()

In [None]:
importances_abv_avg = list(importances_abv_avg['index'])

In [None]:
importances_abv_avg

## 3b. Implementing the Machine Learning Model

Here we use the <code>RandomForestClassifier()</code> from Sklearn

In [None]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

cat_model = CatBoostClassifier(learning_rate= 0.1, random_seed = 5)
rf_model = RandomForestClassifier(random_state = 0)
clf_model = ExtraTreesClassifier(n_estimators=100, random_state=0)
ada_model = AdaBoostClassifier(n_estimators=100, random_state=0)

## 3c. Train-Test Split

We split the data to facilitate the evaluation of the model

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
import statistics
from sklearn import metrics


skf = StratifiedKFold(n_splits=5, shuffle= True)
skf.get_n_splits(X, y)
# print(skf)

k_fold_Acc = []
k_fold_Auc = []

for train_index, test_index in skf.split(X, y):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_pred)
    k_fold_Acc.append(accuracy)
    k_fold_Auc.append(auc)
    
    
print('Average 10-fold acc:', round(statistics.mean(k_fold_Acc),4))
print('Average 10-fold auc:', round(statistics.mean(k_fold_Auc),4))

## 3d. Evaluate your model (using only train.csv)

We generate a randomforest model by fitting the training data using <code>.fit()</code>, and thereafter generate predictions using <code>.predict()</code>

We evalute the model's accuracy using <code>metrics.accuracy_score()</code>

In [None]:
# from sklearn import metrics

# accuracy = metrics.accuracy_score(y_test, y_pred)
# auc = metrics.roc_auc_score(y_test, y_pred)

# print('Accuracy:', round(accuracy, 4) * 100, '%')
# print('AUC:', round(auc, 4)) 

# # AUC definitely has room for improvement (It's your turn!)
# # But first, let's look at how we can generate the predictions.csv in the next section.

## 4a. Refit final model on entire train.csv

In [None]:
final_rf_model = RandomForestClassifier(random_state = 0)

final_rf_model = final_rf_model.fit(X, y)

In [None]:
final_cat_model = CatBoostClassifier(learning_rate= 0.1, random_seed = 5)
final_cat_model.fit(X, y)

In [None]:
ada_model.fit(X, y)
ada_pred = ada_model.predict(X_kaggle)

## 4b. Generate predictions on 4700 rows of test.csv features into the model

In [None]:
X_kaggle.shape  # This is equivalent to your test_set[feature_cols]

Here we use <code>.predict_proba</code> instead of <code>.predict()</code> because this is the format which the Kaggle platform requires

In [None]:
probabilities = final_rf_model.predict_proba(X_kaggle)
probabilities

In [None]:
final_cat_pred = final_cat_model.predict_proba(X_kaggle)
final_cat_pred

In [None]:
kaggle_preds = probabilities[:,1]  # Extract values from the rightmost column
kaggle_preds

Make use of the <code>.to_csv()</code> function to output your predictions in the form of a csv, which will be the format you will be required to submit to Kaggle.

In [None]:
output_dataframe = pd.DataFrame({
    'bidder_id': test_set['bidder_id'],
    'prediction': kaggle_preds
})
output_dataframe.to_csv('my_predictions.csv', index=False)  

# Check for the .csv in the same folder as your Jupyter Notebook
# Try uploading this .csv to the Kaggle competition!

<div class="alert alert-block alert-warning">
<b>All the best to every team - may the best model win!</b>
</div>