-
Notifications
You must be signed in to change notification settings - Fork 0
/
epg_scraper.py
84 lines (70 loc) · 2.96 KB
/
epg_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
# Subject page whose "Paper" and "Module" drop-downs are walked below.
url = "https://epgp.inflibnet.ac.in/Home/ViewSubject?catid=0d/1X9CWmyPf9Hgtlh1uyw=="

# Seconds to wait after changing a drop-down selection before reading the page.
PAGE_SETTLE_SECONDS = 5
# Seconds to sleep before each successive re-read of the iframe; one initial
# attempt plus one retry per entry.
RETRY_DELAYS = (5, 5, 10, 10)
# Upper bound, in seconds, on a single download request so a hung server
# cannot stall the whole run.
DOWNLOAD_TIMEOUT = 60


def _safe_name(text):
    """Return *text* with path separators replaced by underscores.

    The option text is used verbatim as a folder or file name; a separator in
    it would otherwise be interpreted as a nested directory.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            text = text.replace(separator, "_")
    return text


def _find_iframe_src(driver):
    """Return the ``src`` of the iframe inside element ``ifrmet``, or None.

    The lookup is tried once immediately and then once more after each delay
    in ``RETRY_DELAYS``, printing ``Reattempting N...`` before every retry.
    A WebDriver error or a missing/empty ``src`` counts as a failed attempt.
    """
    for attempt in range(len(RETRY_DELAYS) + 1):
        if attempt:
            print(f"Reattempting {attempt}...")
            time.sleep(RETRY_DELAYS[attempt - 1])
        try:
            src = (
                driver.find_element(by=By.ID, value="ifrmet")
                .find_element(by=By.TAG_NAME, value="iframe")
                .get_attribute("src")
            )
        except WebDriverException:
            # Element not there (yet) or went stale; fall through to retry.
            continue
        if src:
            return src
    return None


def _download(src, path):
    """Fetch *src* and write the response body to *path*.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (so an error page is never saved).
    """
    response = requests.get(src, timeout=DOWNLOAD_TIMEOUT)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)


driver = webdriver.Chrome()
# (paper name, module name) pairs that could not be fetched.
not_downloaded = []
try:
    driver.get(url)
    select = Select(driver.find_element(by=By.ID, value="Paper"))
    paper_names = [option.text for option in select.options]
    # Entry 0 of each drop-down is skipped (presumably a placeholder option).
    for i, paper_name in enumerate(paper_names[1:], start=1):
        select.select_by_index(i)
        time.sleep(PAGE_SETTLE_SECONDS)
        # Re-locate the module list each time; it is read after the paper
        # selection has had time to take effect.
        select2 = Select(driver.find_element(by=By.ID, value="Module"))
        folder = _safe_name(paper_name)
        print("Created Folder: ", paper_name)
        os.makedirs(folder, exist_ok=True)
        module_names = [option.text for option in select2.options]
        for k, module_name in enumerate(module_names[1:], start=1):
            select2.select_by_index(k)
            time.sleep(PAGE_SETTLE_SECONDS)
            src = _find_iframe_src(driver)
            if src is None:
                # No URL is available here, so none is printed.
                print(f"Could not Download: \t{module_name}\n")
                not_downloaded.append((paper_name, module_name))
                continue
            print(f"Downloading:\t{module_name}\n\t\t{src}\n")
            target = os.path.join(folder, f"{_safe_name(module_name)}.pdf")
            try:
                _download(src, target)
            except requests.RequestException:
                # Best effort: record the failure and move on to the next module.
                print(f"Could not Download: \t{module_name}\n\t\t{src}\n")
                not_downloaded.append((paper_name, module_name))
finally:
    # Always close the browser, even if scraping aborted with an error.
    driver.quit()

# One tab-separated "paper<TAB>module" line per failed download.
with open("not_downloaded.txt", "w", encoding="utf-8") as f:
    for paper_name, module_name in not_downloaded:
        f.write(f"{paper_name}\t{module_name}\n")