# WebCrawl
# by TeeJ
# 01-26-2019
# Goal: The purpose of this program is to scrape pricing data from the web.
# I aim to run it throughout the day on my Raspberry Pi and collect pricing
# data on items that I am interested in. For fun.
# To run this script, type the following:
# > python3 amznscrape.py "Data Science"
# Note: "Data Science" can be replaced with any search term you would like to scrape Amazon for.
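# Optionally, a retry count can be passed as a second argument (defaults to 3):
# > python3 amznscrape.py "Data Science" 3
# For the Raspberry Pi use case, a minimal cron entry could look like the line
# below (the path is an assumption, not part of this repo):
# 0 */6 * * * cd /home/pi/amznscrape && python3 amznscrape.py "Data Science"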
# import packages
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import sys
import datetime as dt
# QA parameters (development examples; the live values are built at runtime)
# search_word = "Data Science"
# site = "https://www.amazon.com/s/ref=nb_sb_noss_2?url=search--alias%3Daps&field-keywords=Data+Science&rh=i%3Aaps%2Ck%3AData+Science"
# site = 'https://www.amazon.com/s/ref=sr_pg_3?rh=i%3Aaps%2Ck%3AData+Science&page=3&keywords=Data+Science&ie=UTF8&qid=1548565321'
# Code Below
def main():
    '''
    type > python3 amznscrape.py {search_word} [tries]
    to kick off the program
    output: file in the data/ directory
    '''
    # invoke header
    print('~|~|~|'*10)
    print('><> '*4 + 'Running Amznscrape by TeeJ' + ' <><'*4)
    print('~|~|~|'*10)
    # establish the search word and the optional retry count
    search_word = sys.argv[1]
    tries = int(sys.argv[2]) if len(sys.argv) > 2 else 3
    final = scrape_amzn(search_word, tries=tries)
    print(final)
    # output as csv
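    # note: the data/ directory must already exist; pandas' to_csv will not create it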
    final.to_csv('data/' + search_word.replace(" ", "_").lower() + '_amznscrape_' + str(dt.date.today()) + '.csv')
    print()
    # print('Results...')
    # print(final[['Name', 'Paperback', 'Price']][final.notnull()])
    # prompt everything worked
    print('~|~|~|'*10)
    print("Successfully ran!")
    print('~|~|~|'*10)
def scrape_amzn(search_word, page=1, df=None, tries=3):
    '''
    input: search_word, page, dataframe, number of retries
    output: dataframe [Name, Price, Recorded_On, Search_Word, Link] of an Amazon search
    '''
    # avoid a mutable default argument: build a fresh frame on the first call
    if df is None:
        df = pd.DataFrame(columns=['Name', 'Price', 'Recorded_On', 'Search_Word', 'Link'])
    print()  # blank line for sanity
    # establish link
    if page == 1:
        site = "https://www.amazon.com/s/ref=nb_sb_noss_2?url=search--alias%3Daps&field-keywords=" + search_word.replace(" ", "+") + "&rh=i%3Aaps%2Ck%3A" + search_word.replace(" ", "+")
    else:
        print("Going to page " + str(page) + "...")
        site = 'https://www.amazon.com/s/ref=sr_pg_' + str(page) + '?rh=i%3Aaps%2Ck%3A' + search_word.replace(" ", "+") + '&page=' + str(page) + '&keywords=' + search_word.replace(" ", "+") + '&ie=UTF8'
    # extract data from the site
    html = requests.get(site).text
    soup = BeautifulSoup(html, 'html5lib')
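    # html5lib parses the page the way a browser would, which copes better with
    # Amazon's messy markup; note that Amazon may throttle plain requests that
    # lack browser-like headers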
    # search for results
    print('Scraping ' + site + ' for data...')
    scrape = soup.find_all('div', id=re.compile('result'))
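    # each search hit sits in a <div> whose id contains 'result'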
    # print('The html code has been successfully downloaded...')
    # filter down results
    result = []
    for div in scrape:
        result.extend(div.find_all('a', {'class': re.compile('.*.')}))  # any anchor carrying a class attribute
    print('Parsing the data from the webscrape for the amount and link of the item...')
    df_old = df
    # add data to the data frame
    for line in result:
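        # 'a-offscreen' spans hold the price text; skip "from $X" range prices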
        if line.find_all('span', {'class': 'a-offscreen'}) and line.text.split()[0][:4] != "from":
            try:
                skip_dict = skip_scrape(line['href'], tries=tries)
            except Exception:
                print()
                print("!! Skip Scrape Failed !!")
                print(line['href'])
                print('The format of the html site does not match the data we are scraping for.')
                print()
                skip_dict = {'Failed SS': 'True'}
            row = {'Name': line['href'].split('/')[3]  # derive the name by splitting the link
                   , 'Price': line.text.split()[0]
                   , 'Recorded_On': str(dt.date.today())
                   , 'Search_Word': search_word
                   , 'Link': line['href']
                   , **skip_dict}
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    # if no new rows were added, the page scrape likely failed: retry or stop
    if len(df_old.index) == len(df.index):
        if tries:  # retry on error (sometimes a dropped connection ends the recursion)
            print('Something went wrong! Retrying ' + str(tries) + ' more times...')
            time.sleep(5)
            return scrape_amzn(search_word, page, df, tries - 1)
        else:
            time.sleep(5)
            return df
    else:
        print('Waiting to search for next web page...')
        for i in range(4):
            print('+'*45)
        time.sleep(5)
        return scrape_amzn(search_word, page + 1, df, tries)
def skip_scrape(site, tries=100):
    '''
    Input: site and number of tries
    Output: dictionary with more data
    '''
    # QA Inputs:
    # filepath = 'data/data_science_amzn_scrape_2019-01-29.csv'
    # df = pd.read_csv(filepath)
    # site = df['Link'][0]
    print('Attempting to find more data through a skip scrape for the following site:')
    print(site)
    result = []
    i = 0
    # pull the site for the specified number of tries (the connection can fail)
    while not result:
        print()
        print("Skip scrape try " + str(i + 1))
        # extract data from the site
        html = requests.get(site).text
        soup = BeautifulSoup(html, 'html5lib')
        # print(soup)
        # search for results
        print('Skip Scraping ' + site + ' for data...')
        scrape = soup.find_all('div', {'class': "content"})
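        # product details sit in <div class="content"> blocks as <li><b>Key:</b> Value items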
        # print('The html code has been successfully downloaded...')
        # filter down results
        for div in scrape:
            result.extend(div.find_all('li'))
        i = i + 1
        if i == tries:
            print()
            print("!! XXXXXXXXXXXXXXXXXXXXX - ERROR - XXXXXXXXXXXXXXXXXXXXX !!")
            print("Unable to grab data for " + str(site))
            break
        # print(result)
        time.sleep(5)
    # pull in the extra data
    data = {'Link': site}
    # strip the tags, split on '</b> ', and drop the trailing ':' from the key
    for i in range(min(5, len(result))):  # guard: the page may return fewer than 5 items
        item = str(result[i]).replace('<li><b>', '').replace('</li>', '').split("</b> ")
        if len(item) == 2:
            data[item[0][:-1]] = item[1]
    print("The additional data has been added by skip scrape:")
    print(data)
    print()
    return data
#skip_scrape('dummy', 12) #test script: SUCCESS
# Execute Script Here
if __name__ == '__main__':
    main()