### Imports

In [1]:
import bs4
import urllib3
from bs4 import BeautifulSoup as soup
import re
import csv

### Setting the url for scrapping 

In [2]:
myurl = 'http://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'

### Creating pool manager for managing http requests

In [3]:
http = urllib3.PoolManager()

In [4]:
# Get request to extract the contents from the website
pg = http.request(url = myurl, method='GET')



### Beautiful Soup object for parsing the web page 

In [5]:
page_soup = soup(pg.data, 'lxml')

In [6]:
# Getting the header information
page_soup.h1.string

'Video Cards & Video Devices'

In [7]:
# Printing the intial paragraph on the web page
page_soup.p.string

'Newegg.com - A great place to buy computers, computer parts, electronics, software, accessories, and DVDs online. With great prices, fast shipping, and top-rated customer service - once you know, you Newegg.'

In [8]:
#grabs each product
containers = page_soup.find_all('div', {'class':'item-container'})

In [9]:
# Check number of products obtained in the response
len(containers)

12

### Sample an item in the list and extract it's information

In [10]:
container = containers[0]

** Brand **

In [11]:
# Brand for the product
container.div.div.a.img['title']

'MSI'

** Product Name **

In [12]:
# Product Name
title_container = container.find_all('a', {'class':'item-title'})
title_container[0].text

'MSI GeForce GTX 1080 Ti DirectX 12 GTX 1080 Ti SEA HAWK X Video Card'

** Product Price **

In [13]:
price_container = container.find_all('li', {'class':'price-current'})
price = price_container[0].text.strip().replace('\n', '')
price

'|$749.99\xa0\r            –'

In [14]:
matcher = re.search(pattern=r'(\$[0-9]+(\.[0-9]{2})?)', string=price)
if matcher:
    print(matcher.group())

$749.99


### Loop through all containers to get the product information

In [15]:
data = []
for container in containers:
    brand = container.div.div.a.img['title']
    title_container = container.find_all('a', {'class':'item-title'})
    product_name = title_container[0].text
    shippinng_container = container.find_all('li', {'class':'price-ship'})
    shipping = shippinng_container[0].text.strip()
    
    price_container = container.find_all('li', {'class':'price-current'})
    price_string = price_container[0].text.strip().replace('\n', '')
    matcher = re.search(pattern=r'(\$[0-9]+(\.[0-9]{2})?)', string=price)
    if matcher:
        price = matcher.group()
    else:
        price = price_string
        
    data.append((brand, product_name, price, shipping))  

** Write the information to csv file **

In [16]:

# open a csv file with append, so old data will not be erased
with open('products.csv', 'a') as csv_file:
 writer = csv.writer(csv_file)
 # The for loop
 for brand, name, price, ship in data:
     writer.writerow([brand, name, price, shipping])

### Verifying the data extracted from the website

In [17]:
import pandas as pd
data = pd.read_csv('products.csv', names=['Brand', 'Product', 'Price', 'Shipping'])

In [18]:
data

Unnamed: 0,Brand,Product,Price,Shipping
0,MSI,MSI GeForce GTX 1080 Ti DirectX 12 GTX 1080 Ti...,$249.99,$4.99 Shipping
1,ASUS,ASUS ROG GeForce GTX 1070 STRIX-GTX1070-O8G-GA...,$249.99,$4.99 Shipping
2,EVGA,EVGA GeForce GTX 1060 GAMING DirectX 12 06G-P4...,$249.99,$4.99 Shipping
3,GIGABYTE,GIGABYTE GeForce GTX 1070 DirectX 12 GV-N1070G...,$249.99,$4.99 Shipping
4,ZOTAC,"ZOTAC GeForce GTX 1060 AMP!, ZT-P10600B-10M, 6...",$249.99,$4.99 Shipping
5,XFX,XFX Radeon RX 580 DirectX 12 RX-580P427D6 GTS ...,$249.99,$4.99 Shipping
6,EVGA,EVGA GeForce GTX 1070 Ti SC GAMING 08G-P4-5671...,$249.99,$4.99 Shipping
7,MSI,MSI Radeon RX 570 DirectX 12 RX 570 GAMING X 4...,$249.99,$4.99 Shipping
8,GIGABYTE,"GIGABYTE GeForce GTX 1080 Ti Turbo 11GD, GV-N1...",$249.99,$4.99 Shipping
9,ZOTAC,ZOTAC GeForce GTX 1080 Ti AMP Extreme Core 11G...,$249.99,$4.99 Shipping
