In [227]:
import requests, os, re, hashlib 
from urllib.parse import urlparse
from glob import glob
from bs4 import BeautifulSoup
from src.azurekey import *
from src.websites import *

In [290]:
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

def parse_url(url):
    url_temp = urlparse(url)
    netloc = url_temp.netloc
    if netloc[:4] == "www.":
        netloc = netloc[4:]
    if len(netloc) == 0:
        return None
    if url_temp.scheme == "https" or url_temp.scheme == "http":
        return url_temp.scheme + "://" + netloc + "/"
    else:
        return "https://" + netloc + "/"

def encode_url(url):
    if parse_url(url):
        return hashlib.md5(parse_url(url).encode('utf-8')).hexdigest()
    return None
    
def search(search_term, azureKey, offset=0):
    # print('searching using bing: "'+search_term+'"')
    search_url = "https://api.bing.microsoft.com/v7.0/search"
    subscription_key = azureKey
    assert subscription_key
    headers = {"Ocp-Apim-Subscription-Key" : subscription_key}
    params  = {"q": search_term, "responseFilter": "Webpages","count":50, "offset":offset, "safeSearch":"Strict"}
    response = requests.get(search_url, headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()
    return [v["url"] for v in search_results["webPages"]["value"]]

In [344]:
req = requests.get("https://www.worldometers.info/geography/alphabetical-list-of-countries/", headers=headers)
soup = BeautifulSoup(req.text)
country_list = []
for i in soup.findAll("td", {"style":"font-weight: bold; font-size:15px"}):
    country_list.append(i.text)

In [268]:
website_set = set()

In [345]:
for country in country_list:
    for j in range(3):
        search_results = search(country, azureKey, offset=j)
        for i in search_results:
            i_temp = parse_url(i)
            if i_temp not in website_tested:
                website_set.add(i_temp)

HTTPError: 403 Client Error: Quota Exceeded for url: https://api.bing.microsoft.com/v7.0/search?q=South+Sudan&responseFilter=Webpages&count=50&offset=0&safeSearch=Strict

In [346]:
len(website_set)

3278

In [354]:
print(website_set)

{'https://kramer.fr/', 'https://co.ambafrance.org/', 'https://airchina.fr/', 'https://partirou.com/', 'http://revanimal.org/', 'https://shantitravel.com/', 'https://engelvoelkers.com/', 'http://fiji.to/', 'http://antigua-barbuda.mynews.club/', 'https://andalucia.org/', 'https://thoiry.net/', 'http://panama-street.sitew.fr/', 'https://fr.ehotelsreviews.com/', 'http://bia-niger.com/', 'https://all-andorra.com/', 'https://france.tv/', 'https://monsieurtshirt.com/', 'https://bankofamerica.com/', 'http://cedcameroun.org/', 'https://mbr.mt/', 'https://en.unesco.org/', 'http://samoa.travel/', 'https://data.bnf.fr/', 'https://fr.yahoo.com/', 'https://mediclinic.ae/', 'https://eu4.paradoxwikis.com/', 'https://aljadeed.tv/', 'https://south-korea.education/', 'https://news.cgtn.com/', 'https://rugbypass.com/', 'http://ib-bank.com/', 'https://g4s.com/', 'https://cafr.ebay.ca/', 'https://usinenouvelle.com/', 'http://ems.com.cn/', 'https://guinea.opendataforafrica.org/', 'https://nh-hotels.fr/', 'ht

In [312]:
print("Webpages tested: {}, original webpages saved: {}, blured webpages saved: {}.".format(len(website_tested), len(glob("webpage_dataset/original/*.png")), len(glob("webpage_dataset/blur/*.png"))))

Webpages tested: 790, original webpages saved: 487, blured webpages saved: 467.


In [351]:
count = 0

In [357]:
from ipywidgets import IntProgress
from IPython.display import display
import time

max_count = len(website_set)
f = IntProgress(value=count, min=0, max=max_count) # instantiate the bar
display(f) # display the bar
if not os.path.exists("webpage_dataset/original/"):
    os.makedirs("webpage_dataset/original/")
if not os.path.exists("webpage_dataset/blur/"):
    os.makedirs("webpage_dataset/blur/")

for i, website in enumerate(website_set):
    if i >= count:
        req = requests.get("http://127.0.0.1:8889/?url={}&waiting=1&blur=true".format(website), headers=headers)
        count += 1
    f.value += 1

IntProgress(value=236, max=3278)

KeyboardInterrupt: 

In [372]:
img_original = glob("webpage_dataset/original/*.png")
original_set = set([os.path.split(x)[1] for x in img_original])

for i in glob("webpage_dataset/blur/*.png"):
    if os.path.split(i)[1] not in original_set:
        os.remove(i)