## Imports

In [1]:
import numpy as np
import pandas as pd

import datetime as dt

import os #scanning folders

## Load Bidding history

#### Define data load function

In [2]:
def load_bidding_history(directory):

    bidding_history = pd.DataFrame(columns = ['Itemnumber','Title','Ending Time', 'Timestamp', 'Bidder', 'feedback_score', 'Bid Amount'])    

    for file in os.scandir(directory):
        bidding_history = bidding_history.append(pd.read_csv(file, usecols=['Itemnumber','Title','Ending Time', 'Timestamp', 'Bidder', 'feedback_score', 'Bid Amount'], parse_dates=['Ending Time', 'Timestamp']), ignore_index=True)

    return bidding_history

#### Load data

In [3]:
df_bids_antiques = load_bidding_history('biddingdata/antiques')

##### Authors note: eBay only grants public access to auctions within the last several weeks. To increase the number of auctions in the computer category to a suitable scope, I conducted to runs which have to be merged together at this point.

In [4]:
df_bids_computers = load_bidding_history('biddingdata/computers')
df_bids_computers = df_bids_computers.append(load_bidding_history('biddingdata/computers2'))

## Collect Metadata

#### Number of auctions

In [13]:
df_bids_antiques['Itemnumber'].nunique()

106794

In [14]:
df_bids_computers['Itemnumber'].nunique()

56466

#### Number of Biddings

In [15]:
len(df_bids_antiques)

513736

In [16]:
len(df_bids_computers)

519710

## Optimize the data structure

#### Exclude items with less than 2 bidders

In [17]:
relevant_itemnumbers = df_bids_antiques.loc[:,['Itemnumber', 'Bidder']].groupby(by=["Itemnumber"]).nunique()
relevant_itemnumbers = relevant_itemnumbers.loc[relevant_itemnumbers['Bidder'] > 1]
relevant_itemnumbers = relevant_itemnumbers.index.tolist()
relevant_itemnumbers

df_bids_antiques = df_bids_antiques[df_bids_antiques['Itemnumber'].isin(relevant_itemnumbers)]

In [18]:
relevant_itemnumbers = df_bids_computers.loc[:,['Itemnumber', 'Bidder']].groupby(by=["Itemnumber"]).nunique()
relevant_itemnumbers = relevant_itemnumbers.loc[relevant_itemnumbers['Bidder'] > 1]
relevant_itemnumbers = relevant_itemnumbers.index.tolist()
relevant_itemnumbers

df_bids_computers = df_bids_computers[df_bids_computers['Itemnumber'].isin(relevant_itemnumbers)]

#### Update metadata after removing auctions with only 1 Bidding

In [20]:
df_bids_antiques['Itemnumber'].nunique()

50132

In [21]:
df_bids_computers['Itemnumber'].nunique()

36937

#### Remove timezones from data

In [19]:
df_bids_computers['Ending Time'] = df_bids_computers['Ending Time'].apply(lambda x: x.replace(tzinfo=None))
df_bids_computers['Timestamp'] = df_bids_computers['Timestamp'].apply(lambda x: x.replace(tzinfo=None))

df_bids_computers2['Ending Time'] = df_bids_computers2['Ending Time'].apply(lambda x: x.replace(tzinfo=None))
df_bids_computers2['Timestamp'] = df_bids_computers2['Timestamp'].apply(lambda x: x.replace(tzinfo=None))

NameError: name 'df_bids_computers2' is not defined

#### Create new column containing the remaining time when the bid was submitted

In [107]:
df_bids_antiques['Time Left'] = df_bids_antiques['Ending Time'] - df_bids_antiques['Timestamp']

In [108]:
df_bids_computers['Time Left'] = df_bids_computers['Ending Time'] - df_bids_computers['Timestamp']

#### Remove columns not needed anymore

In [109]:
df_bids_antiques = df_bids_antiques.drop(columns=['Ending Time', 'Timestamp'])

In [110]:
df_bids_computers = df_bids_computers.drop(columns=['Ending Time', 'Timestamp'])

#### Pickle dataframes for further use

In [111]:
df_bids_antiques.to_pickle("processeddata/bids_antiques.pkl")

In [112]:
df_bids_computers.to_pickle("processeddata/bids_computers.pkl")