In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs

### Web Scraping #1: Getting a list of best-selling GPUs from newegg.ca

In [2]:
# Scrape three result pages of Newegg.ca's desktop graphics card listing and
# collect one record per product into four parallel lists (brand, name,
# price_list, shipping_list). The Order=3 query parameter presumably selects
# the best-selling ordering -- TODO confirm against the site.
page_list = list(range(1, 4))
brand = []
name = []
price_list = []
shipping_list = []

# looping through 3 pages
for page in page_list:
    # NOTE: despite its name, newegg_url holds the HTTP response, not a URL.
    newegg_url = requests.get(
        'https://www.newegg.ca/Desktop-Graphics-Cards/SubCategory/ID-48/Page-{}?Tid=7709&Order=3'.format(page),
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    newegg_url.raise_for_status()
    # turn it into a BeautifulSoup object - lxml is the parser
    soup = bs.BeautifulSoup(newegg_url.text, 'lxml')
    # grab each product
    containers = soup.find_all('div', {'class': 'item-container'})

    # Every field is read from inside the product's own container, so a product
    # that is skipped (or lacks a field) can never shift the other lists out of
    # alignment with each other.
    for container in containers:
        try:
            brand_img = container.div.div.a.img
            product_img = container.a.img
        except AttributeError:
            # An intermediate tag is missing, so this is not a regular product tile.
            continue
        if brand_img is None or product_img is None:
            continue

        brand_title = brand_img.get('title')
        product_title = product_img.get('title')
        price_item = container.find('li', {'class': 'price-current'})
        if brand_title is None or product_title is None or price_item is None:
            continue
        # The price is split into a <strong> part and a <sup> part; both are
        # concatenated below, so skip tiles where either one is absent.
        if price_item.strong is None or price_item.sup is None:
            continue

        # First word of the shipping label; empty string when there is no label.
        shipping_item = container.find('li', {'class': 'price-ship'})
        shipping_words = shipping_item.text.split() if shipping_item is not None else []

        brand.append(brand_title)
        name.append(product_title)
        price_list.append(price_item.strong.text + price_item.sup.text)
        shipping_list.append(shipping_words[0] if shipping_words else '')

# sanity check: all four lists must have the same length
print(len(shipping_list))
print(len(price_list))
print(len(brand))
print(len(name))

108
108
108
108


In [3]:
# Convert the scraped price strings to floats, dropping thousands separators.
# Going through str() keeps the cell idempotent: on a second run price_list
# already holds floats, which have no .replace() method.
price_list = [float(str(x).replace(',', '')) for x in price_list]

In [4]:
# Assemble the four scraped lists into a DataFrame (one column per list)
# and print its first 15 rows.
frame = dict(Name=name, Brand=brand, Price=price_list, Shipping=shipping_list)
gpu_df = pd.DataFrame(data=frame)
print(gpu_df.head(n=15))

                                                 Name     Brand   Price  \
0   GIGABYTE GeForce RTX 2070 DirectX 12 GV-N2070W...  GIGABYTE  549.99   
1   MSI GeForce GTX 1660 SUPER DirectX 12 GTX 1660...       MSI  338.99   
2   MSI GeForce GTX 1660 DirectX 12 GTX 1660 VENTU...       MSI  289.99   
3   EVGA GeForce RTX 2060 KO ULTRA GAMING Video Ca...      EVGA  459.99   
4   MSI GeForce RTX 2070 SUPER DirectX 12 RTX 2070...       MSI  789.99   
5   MSI GeForce RTX 2060 DirectX 12 RTX 2060 GAMIN...       MSI  519.99   
6   MSI GeForce RTX 2070 DirectX 12 RTX 2070 ARMOR...       MSI  559.99   
7   MSI GeForce RTX 2060 DirectX 12 RTX 2060 VENTU...       MSI  459.99   
8   GIGABYTE GeForce GTX 1650 SUPER WINDFORCE OC 4...  GIGABYTE  229.99   
9   GIGABYTE Radeon RX 580 GAMING 8G (rev. 2.0) Gr...  GIGABYTE  224.99   
10  MSI GeForce RTX 2070 DirectX 12 RTX 2070 VENTU...       MSI  609.99   
11  EVGA GeForce RTX 2070 SUPER KO GAMING Video Ca...      EVGA  704.99   
12  GIGABYTE GeForce GTX 

These are the 15 best-selling GPUs on Newegg.ca.

### Web Scraping #2: Steam Sale

In [5]:
# Scrape Steam's search page restricted to specials and collect, per result,
# the game title, discount percentage, original price and discounted price
# into four parallel lists.
# The query is passed via params: the original URL contained a literal,
# never-filled '{}' format placeholder as the value of `specials`.
# NOTE: despite its name, steam_url holds the HTTP response, not a URL.
steam_url = requests.get(
    'https://store.steampowered.com/search/',
    params={'specials': 1},
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
steam_url.raise_for_status()
soup = bs.BeautifulSoup(steam_url.text, 'lxml')
all_container = soup.find_all('div', {'class': 'responsive_search_name_combined'})

game_list = []
discount_list = []
original_price = []
dis_price_list = []

for container in all_container:
    # the second descendant tag of the container carries the title text
    game_list.append(container.find_all()[1].text)

    # The fourth <div> under the container holds both the discount badge and
    # the price line; look it up once and reuse it.
    price_block = container.find_all('div')[3]

    # percentage of discount
    discount_div = price_block.div
    discount_span = discount_div.span if discount_div is not None else None
    if discount_span is None:
        discount_list.append('No Discount')
    else:
        discount_list.append(discount_span.text.replace('-', ''))

    # discounted (current) price: last whitespace-separated token of the price line
    price_line = price_block.find_all('div')[1]
    price_tokens = price_line.text.strip().split()
    current_price = price_tokens[-1] if price_tokens else '0'
    dis_price_list.append(current_price)

    # original price: the struck-through value when present. Without one the
    # item is not discounted, so the original price equals the current price
    # (recording '0' here would later yield a negative discounted amount).
    strike = price_line.strike
    strike_tokens = strike.text.split() if strike is not None else []
    original_price.append(strike_tokens[-1] if strike_tokens else current_price)

# checking if the number of items match
print(len(dis_price_list))
print(len(original_price))
print(len(discount_list))
print(len(game_list))

50
50
50
50


In [6]:
# Turn both price lists into float arrays so the discounted amount can be
# computed with a single element-wise subtraction.
original_price = np.asarray(list(map(float, original_price)))
dis_price_list = np.asarray(list(map(float, dis_price_list)))

frame = {
    'Game': game_list,
    'Original Price': original_price,
    'Discounted Amount': original_price - dis_price_list,
    'Discounted Price': dis_price_list,
    'Discount %': discount_list,
}

# Largest absolute saving first, with a fresh 0..n-1 index.
steam_df = (
    pd.DataFrame(frame)
    .sort_values(by='Discounted Amount', ascending=False)
    .reset_index(drop=True)
)
steam_df.head(20)

Unnamed: 0,Game,Original Price,Discounted Amount,Discounted Price,Discount %
0,Darksiders Blades & Whip Franchise Pack,166.46,131.64,34.82,79%
1,Wolfenstein Alt History Collection,144.96,103.6,41.36,71%
2,DOOM Franchise Bundle,190.45,97.03,93.42,51%
3,Dishonored: Complete Collection,109.99,77.0,32.99,70%
4,Elder Scrolls Summer Bundle,94.97,68.0,26.97,72%
5,RAGE 2,79.99,64.0,15.99,80%
6,The Elder Scrolls V: Skyrim VR,79.99,56.0,23.99,70%
7,Fallout 4: Game of the Year Edition,79.99,56.0,23.99,70%
8,Darksiders III,79.99,53.6,26.39,67%
9,DOOM Eternal,79.99,40.0,39.99,50%


These are the 20 games with the largest absolute discount (original price minus sale price) in the current sale.