# eBay Auction Research - Data Generation

## Config

In [1]:
# Turn on IPython's "greedy" tab-completion option for this kernel session.
%config IPCompleter.greedy=True

## Imports

In [2]:
import pandas as pd

from bs4 import BeautifulSoup #for HTML parsing
from time import sleep #To prevent overwhelming the server between connections
from lxml import html

import requests

import re #regular expressions

import csv #to save as csv

import datetime #to convert timestamps
from pytz import timezone #to convert timestamps

import math # to round up

## Define lists with the links to the dedicated categories to use

In [3]:
# Antiques category path fragments; get_category_page appends each one to
# 'https://www.ebay.com/b/' to build the category listing URL.
categories_antiques = [
    'Antiquities/37903/bn_1865503',
    'Architectural-Garden-Antiques/4707/bn_1865433',
    'Asian-Antiques/20082/bn_1865025',
    'Antique-Decorative-Arts/20086/bn_1849288',
    'Ethnographic-Antiques/2207/bn_1865604',
    'Antique-Furniture/20091/bn_1865102',
    'Home-Hearth-Antiques/163008/bn_1856765',
    'Incunabula/22422/bn_16563046',
    'Antique-Linens-Textiles/181677/bn_1850579',
    'Antique-Manuscripts/23048/bn_16561733',
    'Antique-Maps-Atlases-Globes/37958/bn_1856736',
    'Maritime-Antiques/37965/bn_1865553',
    'Mercantile-Trade-Factory-Antiques/163091/bn_1865217',
    'Antique-Musical-Instruments/181726/bn_1857439',
    'Other-Antiques/12/bn_1849364',
    'Period-Style-Antiques/100927/bn_1861165',
    'Antique-Primitives/1217/bn_1852967',
    'Antique-Rugs-Carpets/37978/bn_1860491',
    'Science-Medicine-Antiques/20094/bn_1865445',
    'Sewing-Antiques/156323/bn_1852475',
    'Silver-Antiques/20096/bn_1865522',
]

In [4]:
# Computer/electronics category path fragments; get_category_page appends each
# one to 'https://www.ebay.com/b/' to build the category listing URL.
categories_computers = [
    'Apple-Desktops-All-In-One-Computers/111418/bn_661869',
    'Apple-Laptops/111422/bn_320025',
    'Apple-Tablets-eReaders/171485/bn_319675',
    'Cameras-Photo/625/bn_1865546',
    'Cell-Phones-Smartphones/9355/bn_320094',
    'Computer-Drives-Storage-Blank-Media/165/bn_738891',
    'Computer-Monitors/80053/bn_317528',
    'Computer-Projectors/25321/bn_320054',
    'Computer-Printers/1245/bn_320031',
    'Computer-Scanners/11205/bn_320028',
    'PC-Desktops-All-In-One-Computers/179/bn_661752',
    'PC-Laptops-Netbooks/177/bn_317584',
    'Smart-Watches/178893/bn_152365',
]

## Scrape the category pages to collect links to auctions

### Definitions

In [10]:
def get_category_page(category, pagenumber, retry_delay=60):
    """Download one listing page of sold, completed auctions for a category.

    The request is repeated until it succeeds; there is no retry limit.

    Parameters
    ----------
    category : str
        Category path fragment appended to ``https://www.ebay.com/b/``.
    pagenumber : int
        Value sent as the ``_pgn`` query parameter.
    retry_delay : int or float, optional
        Seconds to wait after a failed request before trying again
        (default 60, the previously hard-coded value).

    Returns
    -------
    requests.Response
        Response of the first request that did not raise. The HTTP status
        code is not inspected here.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
    url = ('https://www.ebay.com/b/' + str(category)
           + '?LH_Auction=1&LH_Complete=1&LH_Sold=1&_pgn=' + str(pagenumber) + '&rt=nc')

    # Retry in a loop instead of by recursion: every recursive retry kept one
    # more stack frame alive, so a long outage ended in a RecursionError.
    while True:
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            # Any requests-level failure is reported with the "Timeout" label.
            print('get_category_page - Timeout - Page: ' + str(pagenumber))
            sleep(retry_delay)
            print('get_category_page - Retry - Page: ' + str(pagenumber))

In [11]:
def scrape_categories(categories, startpage, endpage, checkpoint_prefix='urls_antiques_'):
    """Collect the auction item URLs linked from a range of category pages.

    After each category is finished, every URL collected so far (across all
    categories processed up to that point) is written to a checkpoint file
    named ``<checkpoint_prefix><category position>.csv``.

    Parameters
    ----------
    categories : list of str
        Category path fragments understood by ``get_category_page``.
    startpage : int
        First page number to request.
    endpage : int
        Page number at which to stop; it is NOT requested itself because the
        pages are produced with ``range(startpage, endpage)``.
    checkpoint_prefix : str, optional
        File name prefix of the per-category checkpoint files. The default
        keeps the historical name ``urls_antiques_``; pass a different prefix
        when scraping other category lists so their checkpoints do not
        overwrite the antiques ones.

    Returns
    -------
    list of str
        De-duplicated item URLs with the query string removed, in arbitrary
        (set) order.
    """
    # Escaped dots so that only the literal host name matches.
    item_url = re.compile(r'https://www\.ebay\.com/itm/.*')
    urlset = set()

    for catnumber, category in enumerate(categories):
        for pagenumber in range(startpage, endpage):
            page = get_category_page(category, pagenumber)
            soup = BeautifulSoup(page.content, 'lxml')

            for anchor in soup.find_all('a', href=True):
                href = anchor['href']
                if item_url.fullmatch(href) is not None:
                    # Drop the query string so the same item is stored once.
                    urlset.add(href.split('?')[0])

        # Checkpoint after every category so a crash does not lose everything.
        urllist_to_csv(list(urlset), checkpoint_prefix + str(catnumber) + '.csv')
        print ('Category finished: ' + str(category))

    return list(urlset)

In [12]:
def urllist_to_csv(urllist, filename):
    """Save ``urllist`` to ``filename`` as a CSV in which every field is quoted.

    The file starts with the header row ``ID, URL``; each following row holds
    the zero-based position of a URL in ``urllist`` and the URL itself. An
    existing file with the same name is replaced.
    """
    header = ['ID', 'URL']
    numbered_rows = ([position, url] for position, url in enumerate(urllist))

    with open(filename, 'w+', newline='') as handle:
        writer = csv.writer(handle, quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        writer.writerows(numbered_rows)

### First run

In [64]:
# Scrape listing pages 1-200 of every antiques category (the end value 201 is
# exclusive inside scrape_categories), then save the de-duplicated auction
# URLs to urls_antiques.csv.
urllist_antiques = scrape_categories(categories_antiques, 1, 201)
urllist_to_csv(urllist_antiques, 'urls_antiques.csv')

Category finished: Antiquities/37903/bn_1865503
Category finished: Architectural-Garden-Antiques/4707/bn_1865433
Category finished: Asian-Antiques/20082/bn_1865025
Category finished: Antique-Decorative-Arts/20086/bn_1849288
Category finished: Ethnographic-Antiques/2207/bn_1865604
Category finished: Antique-Furniture/20091/bn_1865102
Category finished: Home-Hearth-Antiques/163008/bn_1856765
Category finished: Incunabula/22422/bn_16563046
Category finished: Antique-Linens-Textiles/181677/bn_1850579
Category finished: Antique-Manuscripts/23048/bn_16561733
Category finished: Antique-Maps-Atlases-Globes/37958/bn_1856736
Category finished: Maritime-Antiques/37965/bn_1865553
Category finished: Mercantile-Trade-Factory-Antiques/163091/bn_1865217
Category finished: Antique-Musical-Instruments/181726/bn_1857439
Category finished: Other-Antiques/12/bn_1849364
Category finished: Period-Style-Antiques/100927/bn_1861165
Category finished: Antique-Primitives/1217/bn_1852967
Category finished: Antique

In [181]:
# Scrape listing pages 1-200 of every computer category and save the URLs to
# urls_computers.csv.
# NOTE: as called here, scrape_categories writes its per-category checkpoint
# files as 'urls_antiques_<n>.csv', so this run overwrites the checkpoints of
# the antiques run.
urllist_computers = scrape_categories(categories_computers, 1, 201)
urllist_to_csv(urllist_computers, 'urls_computers.csv')

Category finished: Apple-Desktops-All-In-One-Computers/111418/bn_661869
Category finished: Apple-Laptops/111422/bn_320025
Category finished: Apple-Tablets-eReaders/171485/bn_319675
Category finished: Cameras-Photo/625/bn_1865546
Category finished: Cell-Phones-Smartphones/9355/bn_320094
Category finished: Computer-Drives-Storage-Blank-Media/165/bn_738891
Category finished: Computer-Monitors/80053/bn_317528
Category finished: Computer-Projectors/25321/bn_320054
Category finished: Computer-Printers/1245/bn_320031
Category finished: Computer-Scanners/11205/bn_320028
Category finished: PC-Desktops-All-In-One-Computers/179/bn_661752
Category finished: PC-Laptops-Netbooks/177/bn_317584
Category finished: Smart-Watches/178893/bn_152365


### Second run

In [None]:
# Repeat the computer-category scrape and store the result separately in
# urls_computers2.csv; items already present in the first run are filtered out
# later, when itemlist_computers2 is prepared.
urllist_computers2 = scrape_categories(categories_computers, 1, 201)
urllist_to_csv(urllist_computers2, 'urls_computers2.csv')

## Use the collected links to scrape the bidding history of each auction

### Functions definition

In [27]:
def get_bidding_history(itemnumber, retry_delay=60):
    """Download the bid-history page of one auction item.

    Timeouts and connection errors are treated as transient: the request is
    repeated (without a retry limit) after waiting ``retry_delay`` seconds.
    Other ``requests`` exceptions propagate to the caller.

    Parameters
    ----------
    itemnumber : str or int
        Item number appended to ``https://www.ebay.com/bfl/viewbids/``.
    retry_delay : int or float, optional
        Seconds to wait after a failed request before trying again
        (default 60, the previously hard-coded value).

    Returns
    -------
    requests.Response
        Response of the first request that did not raise. The HTTP status
        code is not inspected here.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
    url = 'https://www.ebay.com/bfl/viewbids/' + str(itemnumber)

    # Connection errors are retried as well as timeouts; previously a single
    # dropped connection aborted the whole scraping run.
    transient_errors = (requests.exceptions.Timeout,
                        requests.exceptions.ConnectionError)

    # Retry in a loop instead of by recursion so that a long outage cannot
    # exhaust the recursion limit.
    while True:
        try:
            return requests.get(url, headers=headers, timeout=10)
        except transient_errors:
            print('get_bidding_history - Timeout - Item: ' + str(itemnumber))
            sleep(retry_delay)
            print('get_bidding_history - Retry - Item: ' + str(itemnumber))

In [28]:
_BID_COLUMNS = ['Itemnumber', 'Title', 'Ending Time', 'Timestamp', 'Bidder', 'feedback_score', 'Bid Amount']
_EBAY_TIME_FORMAT = '%d %b %Y at %I:%M:%S%p'


def _empty_bid_frame():
    """Return a zero-row DataFrame that has the bid-history columns."""
    return pd.DataFrame(columns=_BID_COLUMNS)


def _parse_pacific_time(text):
    """Parse a time string from the bid page and attach the US/Pacific zone.

    The last four characters of ``text`` are cut off before parsing
    (presumably a space plus a zone abbreviation such as "PST" - confirm
    against a live page).
    """
    naive = datetime.datetime.strptime(text[:-4], _EBAY_TIME_FORMAT)
    return timezone('US/Pacific').localize(naive)


def scrape_biddings(itemnumber):
    """Scrape the bid history of one auction into a DataFrame.

    Parameters
    ----------
    itemnumber : str or int
        Item number passed to ``get_bidding_history``.

    Returns
    -------
    pandas.DataFrame
        One row per bid with the columns ``Itemnumber``, ``Title``,
        ``Ending Time``, ``Timestamp``, ``Bidder``, ``feedback_score`` and
        ``Bid Amount``. ``Bid Amount`` holds only the digits of the displayed
        amount (every non-digit is removed, so "12.50" becomes "1250").
        An empty frame with the same columns is returned when the page has no
        title block or no bid table, when a row has no bidder link, or when a
        bidder's feedback score is shown as "private".
    """
    page = get_bidding_history(itemnumber)
    soup = BeautifulSoup(page.content, 'lxml')

    # No title block: the auction is not available any more.
    title_div = soup.find('div', {'class' : 'offer-title-top_panel_main'})
    if title_div is None:
        return _empty_bid_frame()
    title = title_div.get_text()

    # The ending time is read from the third <li> of the bid-info list.
    ending_text = (soup.find('div', {'class' : 'app-bid-info_wrapper'})
                       .find('ul')
                       .find_all('li')[2]
                       .find('div')
                       .get_text())
    ending_time = _parse_pacific_time(ending_text)

    table = soup.find('table', {'aria-label' : 'Bid history'})
    if table is None:
        # Previously an AttributeError; handled like the other "no data" cases.
        return _empty_bid_frame()
    rows = table.find_all('tr', {'class' : 'ui-component-table_tr_detailinfo'})

    records = []
    # The last matching row is skipped (presumably the starting-price entry
    # rather than a real bid - confirm against a live page).
    for row in rows[:-1]:
        cells = row.find_all('td')

        bidder_link = cells[0].find('a')
        if bidder_link is None:  # no bidder information available
            return _empty_bid_frame()
        bidder = bidder_link.get_text()

        feedback = row.find('span', {'class' : 'app-feedback-star_count'})
        if feedback.get_text() == 'private':  # bidder hides the feedback score
            return _empty_bid_frame()
        # Empty the nested 'clipped' span, when present, so that only the
        # remaining text is kept as the score.
        clipped = feedback.find('span', {'class' : 'clipped'})
        if clipped is not None:
            clipped.clear()
        feedback_score = feedback.get_text()

        # Raw string: '\D' in a normal string is an invalid escape sequence.
        bid_amount = re.sub(r'\D', '', cells[1].get_text())
        timestamp = _parse_pacific_time(cells[2].get_text())

        records.append(
            {'Itemnumber': itemnumber,
             'Title': title,
             'Ending Time': ending_time,
             'Timestamp': timestamp,
             'Bidder': bidder,
             'feedback_score': feedback_score,
             'Bid Amount': bid_amount})

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call. dtype=object keeps the values as
    # the plain Python objects collected above.
    return pd.DataFrame(records, columns=_BID_COLUMNS, dtype=object)
        

#data = (scrape_biddings(402516064006))
#data = (scrape_biddings(383755629593))

In [29]:
def process_itemlist_and_scrape(itemlist, output_path):
    """Scrape the bid history of every item and save the results in CSV chunks.

    A chunk file ``<output_path><k>.csv`` is written each time the index
    reaches a non-zero multiple of 1000 (``k = index / 1000``); the remaining
    items are written after the loop to ``<output_path><ceil(last/1000)>.csv``.
    Because index 0 is not a flush point, the first chunk covers indices
    0-1000 (1001 items) and every later chunk covers 1000 items.

    Parameters
    ----------
    itemlist : pandas.Series
        Item numbers. The index is used for progress reporting and chunk
        numbering, so it is expected to be the integers 0..n-1.
    output_path : str
        Path prefix of the chunk files; its directory is created if missing.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide os

    columns = ['Itemnumber','Title','Ending Time', 'Timestamp', 'Bidder', 'feedback_score', 'Bid Amount']

    # Make sure the target directory exists before hours of scraping are spent.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    def _write_chunk(frames, chunk_number):
        """Concatenate the scraped frames and write them as one chunk file."""
        filled = [frame for frame in frames if not frame.empty]
        if filled:
            chunk = pd.concat(filled, ignore_index=True)
        else:
            chunk = pd.DataFrame(columns=columns)
        chunk.to_csv(output_path + str(chunk_number) + '.csv', quoting=csv.QUOTE_ALL)

    pending = []      # frames scraped since the last chunk was written
    last_ind = None   # stays None when itemlist is empty

    for ind, item in itemlist.items():
        pending.append(scrape_biddings(item))
        last_ind = ind
        if (ind%50 == 0):
            print('Current index: ' + str(ind))
        if (ind%1000 == 0 and ind != 0):
            _write_chunk(pending, int(ind/1000))
            pending = []
            print('Items ' + str(ind - 999) + ' - ' + str(ind) + ' processed and saved!')

    # Write the remainder only if something is left. When the last index was
    # itself a flush point, the trailing write used to replace the chunk that
    # had just been saved with an empty file; an empty itemlist used to fail
    # with a NameError on `ind`.
    if pending:
        _write_chunk(pending, int(math.ceil(last_ind/1000)))
    print('Succeeded!')

### Preparation of item lists

In [165]:
# Load the saved antiques URLs and keep only the last 12 characters of each,
# which the displayed result shows to be the numeric item number.
itemlist_antiques = (
    pd.read_csv('urls_antiques.csv')['URL']
    .str.slice(start=-12)
)
itemlist_antiques

0         224281728768
1         184540354108
2         174541405657
3         363190833780
4         233823195168
              ...     
123883    154146366938
123884    383780417525
123885    274568379654
123886    174508441879
123887    254752029537
Name: URL, Length: 123888, dtype: object

In [12]:
# Load the saved computer URLs and keep only the last 12 characters of each,
# which the displayed result shows to be the numeric item number.
itemlist_computers = (
    pd.read_csv('urls_computers.csv')['URL']
    .str.slice(start=-12)
)
itemlist_computers

0        333762762502
1        284129210813
2        264966024795
3        224202309051
4        274581752358
             ...     
45469    143805985854
45470    392992800201
45471    203218253476
45472    203217042935
45473    274544277360
Name: URL, Length: 45474, dtype: object

In [None]:
# Item numbers of the second computer scrape, restricted to items that were
# not already part of the first run (itemlist_computers) and re-indexed from 0
# so that the chunk numbering in process_itemlist_and_scrape starts over.
itemlist_computers2 = (
    pd.read_csv('urls_computers2.csv')['URL']
    .str.slice(start=-12)
    .loc[lambda ids: ~ids.isin(itemlist_computers)]
    .reset_index(drop=True)
)

itemlist_computers2

### Process itemlists and save results to csv

In [None]:
# Scrape the bid history of every antiques item; results are written in chunks
# to biddingdata/antiques/biddingdata_antiques_<k>.csv.
process_itemlist_and_scrape(itemlist_antiques, 'biddingdata/antiques/biddingdata_antiques_')

In [192]:
# Scrape the bid history of every computer item from the first run; results
# are written in chunks to biddingdata/computers/biddingdata_computers_<k>.csv.
process_itemlist_and_scrape(itemlist_computers, 'biddingdata/computers/biddingdata_computers_')

Current index: 35050
Current index: 35100
Current index: 35150
Current index: 35200
Current index: 35250
Current index: 35300
Current index: 35350
Current index: 35400
Current index: 35450
Current index: 35500
Current index: 35550
Current index: 35600
Current index: 35650
Current index: 35700
Current index: 35750
Current index: 35800
Current index: 35850
Current index: 35900
Current index: 35950
Current index: 36000
Items 35001 - 36000 processed and saved!
Current index: 36050
Current index: 36100
Current index: 36150
Current index: 36200
Current index: 36250
Current index: 36300
Current index: 36350
Current index: 36400
Current index: 36450
Current index: 36500
Current index: 36550
Current index: 36600
Current index: 36650
Current index: 36700
Current index: 36750
Current index: 36800
Current index: 36850
Current index: 36900
Current index: 36950
Current index: 37000
Items 36001 - 37000 processed and saved!
Current index: 37050
Current index: 37100
Current index: 37150
Current index: 

In [None]:
# Scrape the bid history of the additional computer items from the second run.
# The chunks go to a separate directory (biddingdata/computers2/) but use the
# same file name prefix as the first run.
process_itemlist_and_scrape(itemlist_computers2, 'biddingdata/computers2/biddingdata_computers_')