# Feature engineering part 2 (Dealing with time)

In [3]:
import pandas as pd
from fep import FEP, load_features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [58]:
# Load the raw bid log and the labelled bidder table, then make a stratified
# train/test split at the bidder level (one row per bidder in `bidder`).
bids = pd.read_csv('../Data/facebook/bids.csv')
bidder = pd.read_csv('../Data/facebook/train.csv')
y_bidder = bidder['outcome']
X_bidder = bidder.drop(columns=['outcome'])
X_train, X_test, y_train, y_test = train_test_split(
    X_bidder, y_bidder, random_state=42, stratify=y_bidder
)

# Attach every bidder's attributes and label to its bids. The inner join keeps
# only bids whose bidder_id appears in the respective split.
df_train = pd.merge(bids, pd.concat([X_train, y_train], axis=1), on='bidder_id', how='inner')
df_test = pd.merge(bids, pd.concat([X_test, y_test], axis=1), on='bidder_id', how='inner')

# Integer-encode bidder_id. The mapping is built from train and test together,
# numbering ids in order of first appearance, so both frames share one code space.
unique_bidders = pd.concat([df_train['bidder_id'], df_test['bidder_id']]).unique()
bidder_mapping = dict(zip(unique_bidders, range(len(unique_bidders))))
for frame in (df_train, df_test):
    frame['encoded_bidder_id'] = frame['bidder_id'].map(bidder_mapping)

# Integer-encode auction the same way.
auctions = pd.concat([df_train['auction'], df_test['auction']]).unique()
auction_mapping = dict(zip(auctions, range(len(auctions))))
for frame in (df_train, df_test):
    frame['encoded_auction'] = frame['auction'].map(auction_mapping)

# Drop the raw (un-encoded) columns, along with the auction encoding computed above.
dropped_columns = [
    'bidder_id', 'auction', 'payment_account', 'url', 'ip',
    'country', 'device', 'merchandise', 'address', 'encoded_auction',
]
df_train = df_train.drop(columns=dropped_columns)
df_test = df_test.drop(columns=dropped_columns)


### Dealing with time

We know that the time values are obfuscated, but the obfuscation preserves their order and scale, so we can still work with them.

##### Search for concurrent bids

In [59]:
# Preview the merged training bids; the notebook display truncates a long frame
# to its first and last rows.
df_train

Unnamed: 0,bid_id,time,outcome,encoded_bidder_id
0,1,9759243157894736,0.0,0
1,25,9759243315789473,0.0,0
2,39,9759243368421052,0.0,0
3,42,9759243368421052,0.0,0
4,68,9759243526315789,0.0,0
...,...,...,...,...
2356670,7614200,9709028052631578,0.0,1483
2356671,7617228,9709041947368421,0.0,1483
2356672,7639758,9709146315789473,0.0,1484
2356673,7651414,9709200631578947,0.0,1485


In [61]:
# Flag "concurrent" bids: a bid whose timestamp equals that of the same bidder's
# previous bid once that bidder's bids are ordered by time.
time_df = df_train.sort_values(['encoded_bidder_id', 'time'])
temp_df = time_df.drop(columns=['bid_id', 'outcome'])
# Per-bidder difference between consecutive timestamps. Each bidder's first bid
# has no predecessor and receives the -1 sentinel.
temp_df = temp_df.groupby('encoded_bidder_id').diff().fillna(-1)

# Vectorised replacement for the row-by-row loop: 1 where the gap is exactly 0,
# otherwise 0 (so the -1 sentinel maps to 0 as well).
concurrent_bid = temp_df['time'].eq(0).astype('int64')

# `concurrent_bid` carries the row labels of the sorted frame, so assigning the
# Series aligns each flag with its own bid. Assigning a plain list instead would
# place the flags positionally, in sorted order, onto the unsorted `df_train`.
df_train['concurrent_flag'] = concurrent_bid