-
Notifications
You must be signed in to change notification settings - Fork 3
/
web_scraping.py
121 lines (82 loc) · 3.63 KB
/
web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import time
import requests
import random
from bs4 import BeautifulSoup as webpage_parser
# Browser identities the scraper can present. All Chrome entries share one
# layout and differ only in the Chrome version, so they are built from a
# single template; the lone Firefox identity is spelled out in full.
_CHROME_ON_LINUX = (
    'Mozilla/5.0 (X11; Linux x86_64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/{version} '
    'Safari/537.36'
)
_FIREFOX_ON_UBUNTU = (
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) '
    'Gecko/20100101 '
    'Firefox/55.0'
)
user_agent_list = [
    _CHROME_ON_LINUX.format(version='57.0.2987.110'),
    _CHROME_ON_LINUX.format(version='61.0.3163.79'),
    _FIREFOX_ON_UBUNTU,
    _CHROME_ON_LINUX.format(version='61.0.3163.91'),
    _CHROME_ON_LINUX.format(version='62.0.3202.89'),
    _CHROME_ON_LINUX.format(version='63.0.3239.108'),
]

# One identity is drawn when this code runs and is then reused by every
# request that sends `headers`.
user_agent = random.choice(user_agent_list)

# Request headers shared by all scraping functions in this file.
headers = {
    'User-Agent': user_agent,
    'Accept': 'text/html',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
def lotto(timeout: float = 30):
    """Scrape the latest Lotto draw from the National Lottery results page.

    Downloads the results page, prints the response object and a one-line
    summary of the draw, and hands back the matched page elements.

    Args:
        timeout: Seconds to wait on the server, passed straight to
            ``requests.get``. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        A ``(draw_id, titleHead, date, resultBalls)`` tuple of the parsed
        elements themselves (not their text); read ``.text`` on each one
        for the displayed value.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when ``timeout`` expires).
        AttributeError: If an expected element is absent from the page,
            because ``find`` returns ``None`` for a missing element and the
            next lookup is then made on ``None``.
    """
    lotto_site = "https://www.nationallottery.co.za/results/lotto"
    print(f'{lotto_site} Web Scrapper!')

    source = requests.get(lotto_site, headers=headers, timeout=timeout)
    print(source)
    soup = webpage_parser(source.text, "lxml")

    # The draw identifier sits in the detail view, apart from the header box.
    draw_id = soup.find('div', class_='blockWrap resDetailView').find('div', class_='title')

    # Title, date and winning numbers all hang off the same inner header block.
    header_box = soup.find('div', class_='headerBox clearFix')
    inner_header = header_box.find('div', class_="w50 fl").find('div', class_="innerHeaderBlock")
    title_head = inner_header.find('div', class_="titleHead")
    result_balls = inner_header.find('div', class_="resultBalls")
    draw_date = inner_header.find('div', class_="dateWrap").find("span", class_="date")

    print(f'{draw_id.text} {title_head.text} date {draw_date.text} : {result_balls.text}')
    return draw_id, title_head, draw_date, result_balls
def lotto_history_columns(timeout: float = 30):
    """Scrape the column headings of the lotto-history results table.

    Downloads the history page, reads the label of each of the four header
    columns, prints them comma-separated and returns them.

    Args:
        timeout: Seconds to wait on the server, passed straight to
            ``requests.get``. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        A tuple of four strings: the labels of header columns 1 to 4, in
        page order (the original code names them draw ID, date, game type
        and winning numbers).

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when ``timeout`` expires).
        AttributeError: If an expected element is absent from the page,
            because ``find`` returns ``None`` for a missing element and the
            next lookup is then made on ``None``.
    """
    lotto_site = "https://www.nationallottery.co.za/lotto-history"
    print(f'{lotto_site} Web Scrapper!')

    source = requests.get(lotto_site, headers=headers, timeout=timeout).text
    soup = webpage_parser(source, "lxml")
    header_row = soup.find('div', class_='tableHead').find('div', class_='tableRow')

    # Column 1 is looked up through an extra "inlineGroup" wrapper; the
    # remaining columns are searched for their label directly.
    first_column = header_row.find('div', class_='col col1')
    game_draw_id_col = (
        first_column.find('div', class_='inlineGroup').find('div', class_='labelName').text
    )
    game_date_col, game_type_col, game_winning_numbers_col = (
        header_row.find('div', class_=f'col col{position}')
        .find('div', class_='labelName')
        .text
        for position in (2, 3, 4)
    )

    print(game_draw_id_col, ",", game_date_col, ",", game_type_col, ",", game_winning_numbers_col)
    return game_draw_id_col, game_date_col, game_type_col, game_winning_numbers_col
def lotto_history_row(timeout: float = 30):
    """Fetch the lotto-history page and print the markup of its table wrapper.

    This is exploratory: it dumps the pretty-printed ``div.tableWrap``
    element to stdout and returns nothing. Per-row extraction is not
    implemented yet.

    Args:
        timeout: Seconds to wait on the server, passed straight to
            ``requests.get``. Without it a stalled connection would block
            the caller indefinitely.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when ``timeout`` expires).
        AttributeError: If the page has no ``div.tableWrap`` element,
            because ``find`` then returns ``None``.
    """
    lotto_site = "https://www.nationallottery.co.za/lotto-history"
    print(f'{lotto_site} Web Scrapper!')

    source = requests.get(lotto_site, headers=headers, timeout=timeout)
    soup = webpage_parser(source.content, "html5lib")

    # NOTE(review): this pause happens after the response has already been
    # downloaded and parsed, so it cannot change what `soup` contains.
    # Presumably meant as a politeness delay -- confirm or remove.
    time.sleep(2)

    table_wrap = soup.find('div', class_='tableWrap')
    print(table_wrap.prettify())
    # TODO: narrow the lookup to the nested div.tableBody, then walk each
    # div.tableRow inside it and collect the row data.
# Script entry point: runs the lotto-history table dump.
# NOTE(review): no `if __name__ == "__main__":` guard is visible, so this call
# presumably also fires (and hits the network) when the file is imported --
# confirm that is intended.
lotto_history_row()