-
Notifications
You must be signed in to change notification settings - Fork 0
/
imgur_taginfo_to_url_list.py
92 lines (76 loc) · 3.15 KB
/
imgur_taginfo_to_url_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Script to extract Imgur URLs from OpenStreetMap taginfo data."""
import requests
import re
import os
from datetime import datetime
def download_json(url):
    """Fetch *url* and return its body parsed as JSON.

    The request uses a 10-second timeout. Any status code other than 200
    raises requests.RequestException.
    """
    reply = requests.get(url, timeout=10)
    if reply.status_code != 200:
        raise requests.RequestException(f"Failed to download JSON from {url}")
    return reply.json()
def process_data(data):
    """Collect normalized i.imgur.com URLs from Taginfo result entries.

    Each entry in *data* is indexed by 'value'; entries whose value does not
    contain the text 'imgur' are skipped. Every i.imgur.com link found is
    returned as an https:// URL with any query string removed and "_d.webp"
    rewritten to ".webp". Duplicates are kept, in the order encountered.
    """
    # Only the i.imgur.com/... part is captured; an optional scheme, "www."
    # and web.archive.org snapshot prefix in front of it are matched but dropped.
    link_pattern = re.compile(
        r'(?:https?://)?(?:www\.)?(?:web\.archive\.org\/web\/\d{14}\/)?(i\.imgur\.com/[\w./?=&%-]+)'
    )

    collected = []
    for entry in data:
        value = entry['value']
        if 'imgur' not in value:
            continue
        for link in link_pattern.findall(value):
            cleaned = re.sub(r'(\?.*)', '', f'https://{link}')  # drop the query string
            cleaned = re.sub(r'(_d\.webp)', '.webp', cleaned)  # "_d.webp" -> ".webp"
            collected.append(cleaned)
    return collected
def merge_and_deduplicate(new_urls, recent_filename):
    """Return the sorted, duplicate-free union of *new_urls* and a saved list.

    The saved list is read from *recent_filename*, one URL per line; if that
    file does not exist, only *new_urls* contributes to the result.
    """
    previous = []
    if os.path.exists(recent_filename):
        with open(recent_filename, 'r', encoding='utf-8') as handle:
            previous = handle.read().splitlines()

    # A set drops the duplicates; sorting gives a stable, readable order.
    return sorted(set(previous + new_urls))
def save_imgur_urls(imgur_urls, output_directory, base_filename):
    """Write the merged URL list to a dated snapshot file and to recent.txt.

    *imgur_urls* is combined with the current contents of recent.txt (via
    merge_and_deduplicate), and the resulting list is written, one URL per
    line, to both "<base_filename> as of <YYYY MM DD>.txt" and "recent.txt"
    inside *output_directory*, which is created if it is missing.
    """
    date_stamp = datetime.now().strftime("%Y %m %d")
    snapshot_path = os.path.join(output_directory, f"{base_filename} as of {date_stamp}.txt")
    recent_path = os.path.join(output_directory, "recent.txt")

    # The directory has to exist before either file is opened for writing.
    os.makedirs(output_directory, exist_ok=True)

    merged = merge_and_deduplicate(imgur_urls, recent_path)
    payload = "".join(f"{link}\n" for link in merged)

    # Identical content goes to the dated snapshot first, then replaces recent.txt.
    for target in (snapshot_path, recent_path):
        with open(target, 'w', encoding='utf-8') as sink:
            sink.write(payload)
def main():
    """Fetch every Taginfo result page for "imgur" and save the URLs found.

    Pages through the Taginfo search-by-value API, extracts the Imgur URLs
    from each page with process_data, and passes the combined list to
    save_imgur_urls, which writes it under the "URL lists" directory.
    """
    base_url = (
        "https://taginfo.openstreetmap.org/api/4/search/by_value?"
        "query=imgur&sortname=count_all&sortorder=desc&rp=999&page="
    )
    output_directory = "URL lists"
    base_filename = "all IMGUR urls"

    imgur_urls = []
    page = 1
    while True:
        print(f"Processing Taginfo results page {page}...")
        json_data = download_json(f"{base_url}{page}")
        page_items = json_data["data"]
        imgur_urls.extend(process_data(page_items))

        # Stop once the pages fetched so far cover the reported total. An empty
        # page also ends the loop, so an unexpected response (for example "rp"
        # reported as 0) cannot keep the script requesting pages forever.
        if not page_items or json_data["page"] * json_data["rp"] >= json_data["total"]:
            break
        page += 1

    print("Saving Imgur URLs...")
    save_imgur_urls(imgur_urls, output_directory, base_filename)
    print("Done! Imgur URLs updated.")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()