In [7]:
import requests
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

In [8]:
%%capture
#Scrape the sitemap of phonedb.net
response = requests.get('https://phonedb.net/sitemap/')
soup = BeautifulSoup(response.content, 'html.parser')

In [9]:
#regex filter to select particular phone brands; it is matched against the
# text of every <loc> element of the sitemap
brand_filter = "(iphone.*cn)|(huawei(?!.*pad).*cn)|(samsung.*cn)|(xiaomi.*cn)"
#alternative filters (swap in to narrow the selection):
#brand_filter = "iphone.([8-9]|1[1-4]|x).*cn"
#brand_filter = "huawei..*cn"

brand_pattern = re.compile(brand_filter)
url_locs = soup.find_all("loc", string=brand_pattern)
#number of sitemap entries that passed the filter
print(len(url_locs))

1067


In [10]:
#check there is no pad device, we only want phone devices:
# one "!!!" line is printed for every selected entry whose text contains "pad"
for loc in url_locs:
    if "pad" not in loc.text:
        continue
    print("!!!")
    

In [11]:
#Scrape specific specs pages listed in the sitemap urls
import time
def scrape(url_locs, timeout=30, delay=0.0):
    """Download and parse the detailed-specs page of every given sitemap entry.

    Args:
        url_locs: iterable of objects whose ``.text`` is a page URL (here the
            ``<loc>`` elements selected from the sitemap).
        timeout: seconds passed to ``requests.get`` for each request, so a
            stalled connection raises instead of blocking forever.
        delay: optional pause in seconds after each request; ``0`` (the
            default) adds no per-request pause.

    Returns:
        A list with one ``BeautifulSoup`` document per entry, in input order.

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status. Pages already fetched in this call are lost,
            so keep batches small.
    """
    phone_soups = []
    for loc in tqdm(url_locs):
        url = loc.text + "&d=detailed_specs#section14"
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx here rather than handing an error page to the parser.
        response.raise_for_status()
        phone_soups.append(BeautifulSoup(response.content, 'html.parser'))
        if delay:
            time.sleep(delay)
    # NOTE(review): this pause runs once per call (after the whole batch), not
    # once per request; kept as in the original. Use `delay` to throttle
    # individual requests.
    time.sleep(5)
    return phone_soups

In [None]:
#divide scrape tasks over different cells to avoid 
# having to restart from the beginning when exceptions 
# occur due to internet connection issues  

# try multithreaded scraping or running several scrape
#  scripts if number of phone models > 2000

# crawler speed: about 3min per 100 phone models 

In [12]:
#accumulator for the parsed spec pages; re-running this cell resets it to empty
phone_soups = []

In [13]:
#batch: sitemap entries 0-199 (re-running this cell appends the batch again)
phone_soups.extend(scrape(url_locs[0:200]))

100%|██████████| 200/200 [03:14<00:00,  1.03it/s]


In [14]:
#batch: sitemap entries 200-399 (re-running this cell appends the batch again)
phone_soups.extend(scrape(url_locs[200:400]))

100%|██████████| 200/200 [03:09<00:00,  1.06it/s]


In [15]:
#batch: sitemap entries 400-599 (re-running this cell appends the batch again)
phone_soups.extend(scrape(url_locs[400:600]))

100%|██████████| 200/200 [03:07<00:00,  1.07it/s]


In [16]:
#batch: sitemap entries 600-799 (re-running this cell appends the batch again)
phone_soups.extend(scrape(url_locs[600:800]))

100%|██████████| 200/200 [03:07<00:00,  1.07it/s]


In [17]:
#batch: sitemap entries 800-999 (re-running this cell appends the batch again)
phone_soups.extend(scrape(url_locs[800:1000]))

100%|██████████| 200/200 [03:05<00:00,  1.08it/s]


In [18]:
#batch: sitemap entries from 1000 to the end (re-running this cell appends the batch again)
phone_soups.extend(scrape(url_locs[1000:]))

100%|██████████| 67/67 [01:01<00:00,  1.09it/s]


In [19]:
#how many pages have been collected so far
n_phone_models = len(phone_soups)
print("Number of phone models: ", n_phone_models)

Number of phone models:  1067


In [20]:
#parse html page and convert it to structured text
def parse_phonedb_html(phone_soup):
    """Turn the first <table> of a page into a list of [label, value] rows.

    Every <tr> under the first <table> is inspected:

    * one <td>: kept only if its text contains "Brief"; stored as
      ["Brief Info", text with every "Brief" removed];
    * two <td>: stored as [label, value]. If the label contains no ASCII
      letter (e.g. it is empty) the row is a continuation and its value is
      appended to the previous row's value, separated by ", ";
    * any other number of cells: ignored.

    Newlines and non-breaking spaces are removed from all cell text, and
    colons are also removed from labels.

    Args:
        phone_soup: parsed page offering ``find("table")`` (a BeautifulSoup
            document in this notebook).

    Returns:
        list of two-element lists ``[label, value]`` of strings.

    Raises:
        ValueError: if the page contains no <table> element.
    """
    table_soup = phone_soup.find("table")
    if table_soup is None:
        # Previously this surfaced as an AttributeError on None further down.
        raise ValueError("no <table> element found in page")

    table = []
    for tr in table_soup.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) == 1:
            col = tds[0].text.replace("\n", "").replace("\xa0", "")
            if re.search("Brief.*", col):
                table.append(["Brief Info", col.replace("Brief", "")])
        elif len(tds) == 2:
            label_col = tds[0].text.replace("\n", "").replace("\xa0", "").replace(":", "")
            field_col = tds[1].text.replace("\n", "").replace("\xa0", "")
            # A label without letters continues the previous row. The `table`
            # guard keeps a continuation row that comes first from raising
            # IndexError; it is stored as its own row instead.
            if table and re.search("^[^a-zA-Z]*$", label_col):
                table[-1][1] += ", " + field_col
            else:
                table.append([label_col, field_col])
    return table

def table_to_text(table):
    """Render rows as text, one "label: value" line per row.

    Only the first two items of each row are used, and every line (including
    the last) ends with a newline. An empty table gives an empty string.
    """
    return "".join(entry[0] + ": " + entry[1] + '\n' for entry in table)

# for row in parse_phonedb_html(phone_soups[2]):
#     print(row[0], ": ", row[1])

In [21]:
#build dataset as dict: model name -> one-element list with the flattened spec text
phonedb_dataset = {}

for phone_soup in phone_soups:
    table = parse_phonedb_html(phone_soup)
    #the key joins the values of the first two parsed rows with a space
    model_name = " ".join([table[0][1], table[1][1]])
    #real newlines are replaced by the three-part text: space, backslash-n, space
    flattened_specs = table_to_text(table).replace("\n", " \\n ")
    phonedb_dataset[model_name] = [flattened_specs]

#print("\n".join(phonedb_dataset.keys()))
#print(phonedb_dataset[list(phonedb_dataset.keys())[1]][0])

In [22]:
#print(phonedb_dataset)

In [23]:
#convert to json and write it to phonedb_dataset.json in the working directory
import json
with open('phonedb_dataset.json', mode='w') as json_file:
    json_file.write(json.dumps(phonedb_dataset))