from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests

warnings.filterwarnings("ignore")

class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        # Timestamp used to build unique output file names.
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        # Output files are written next to this script.
        self.storagePath = os.path.dirname(os.path.abspath(__file__))

        # Default browser-like headers for plain HTTP requests.
        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        """Fetch a URL with the requests session; return the page HTML or None."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(
                url, headers=headers, params=params, timeout=self.timeout
            )

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # Follow the redirect manually and retry once.
                response = self.reqSession.get(
                    response.headers["Location"], headers=headers, timeout=self.timeout
                )
                response.raise_for_status()
                if response.status_code == 200:
                    return response.text

            if response.status_code == 503:
                # Blocked or rate-limited (e.g. Amazon bot detection); give up on this URL.
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        """Extract link hrefs via XPath; prefix relative URLs with the site root."""
        try:
            name = doc.xpath("".join(xpath))
            for i in range(len(name)):
                if name[i].startswith("/"):
                    name[i] = website + name[i]
            return name

        except Exception as e:
            print("Error in getting {}: {}".format(xpath, e))
            return None

    def get_selenium_driver(self):
        """Build a headless Chrome driver with images, extensions and logging disabled."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        driver = webdriver.Chrome(options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        """Load a URL in headless Chrome and return it as an lxml document, or None."""
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            time.sleep(waiting_time)
            doc = html.fromstring(driver.page_source)
            # quit() (rather than close()) also shuts down the chromedriver process.
            driver.quit()
            return doc

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None

    def get_xpath_data(self, doc, xpath):
        """Return the raw result of evaluating the XPath against the document."""
        try:
            return doc.xpath(xpath)

        except Exception as e:
            print("Error in getting {}: {}".format(xpath, e))
            return None

    def slow_page_scroll(self, driver, speed):
        """Scroll down in 1000px steps so lazy-loaded content has time to render."""
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        """Combine scraped DataFrames, drop duplicates on unique_id, and write CSV or JSON."""
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        # Fall back to the default storage path when none is given.
        path = storagePath or self.storagePath
        if storageFormat == "csv":
            df_combined.to_csv(
                os.path.join(path, "{}_{}.csv".format(name, self.stamp)),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                os.path.join(path, "{}_{}.json".format(name, self.stamp)),
                orient="records",
            )

    def cleanData(self, array):
        """Strip whitespace, drop empty strings, remove non-ASCII characters and newlines."""
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array
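
# Usage sketch (an assumption, not part of the original module): a minimal example of
# how this class might be driven. It fetches a couple of placeholder URLs in parallel
# with multiprocessing.Pool (imported above), falls back to Selenium when the plain
# request is blocked, and stores the results. All URLs and XPaths below are hypothetical.
if __name__ == "__main__":
    scraper = SeleniumScraper(timeout=10)

    # Hypothetical target pages.
    urls = [
        "https://www.example.com/page1",
        "https://www.example.com/page2",
    ]

    # Plain HTTP fetches in parallel; each worker returns the page HTML or None.
    with Pool(processes=2) as pool:
        pages = pool.map(scraper.fetch_request_normal, urls)

    rows = []
    for url, page in zip(urls, pages):
        # Fall back to headless Chrome when the plain request was blocked.
        doc = html.fromstring(page) if page else scraper.fetch_request_selenium(url)
        if doc is None:
            continue
        # Placeholder XPath; replace with selectors for the real target site.
        titles = scraper.cleanData(scraper.get_xpath_data(doc, "//h1//text()") or [])
        rows.append(pd.DataFrame({"url": [url], "title": [titles[0] if titles else None]}))

    if rows:
        # Writes example_<timestamp>.csv next to this script.
        scraper.data_storage(rows, unique_id="url", name="example", storageFormat="csv")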