# scraping projecthub.arduino.cc

In [1]:
# Example request to the project-listing API: page 1, 60 results per page,
# ordered by "trending", with every other filter left empty.
link = "https://projecthub-api.arduino.cc/api/v1/projects/?category=&difficulty=&name=&ordering=trending&page=1&page_size=60&tag=&type="

In [13]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import os
import json

## Get project links

In [5]:
### Define a function that returns the project page links listed on one results page
def get_projects_link(page_num,
              page_size=100):
  """Return the project page URLs listed on one page of the listing API.

  Requests one page of results (ordered by "trending") and builds a
  ``https://projecthub.arduino.cc/<username>/<slug>`` URL for each result.
  Results whose ``posted_by`` field is null are skipped, because the URL
  cannot be built without a username.

  Args:
    page_num: Value sent as the ``page`` query parameter (the callers in this
      notebook start at 1).
    page_size: Value sent as the ``page_size`` query parameter.

  Returns:
    A list of project page URLs, in the order the API returned the results.

  Raises:
    requests.HTTPError: If the API answers with an error status code.
  """
  link = f"https://projecthub-api.arduino.cc/api/v1/projects/?category=&difficulty=&name=&ordering=trending&page={page_num}&page_size={page_size}&tag=&type="

  # The endpoint returns JSON, so decode the body directly. Passing it
  # through an HTML parser first can change the payload (markup and entities
  # inside string values get interpreted) before the JSON decoder sees it.
  response = requests.get(link, timeout=30)
  response.raise_for_status()
  data_content = response.json()["data"]["results"]

  domain = "https://projecthub.arduino.cc/"
  sub_links = []
  for project in data_content:
    author = project["posted_by"]
    if author is None:
      continue
    sub_links.append(domain + author["username"] + "/" + project["slug"])

  # Report how many results on this page had a usable author.
  print(len(sub_links))
  return sub_links

In [7]:
# Fetch the first listing page (60 projects) and preview the first ten links.
sub_links = get_projects_link(page_size=60, page_num=1)
sub_links[:10]

60


['https://projecthub.arduino.cc/rebos/airfuel-controller-for-lpg-engines-fa17bd',
 'https://projecthub.arduino.cc/me_rk/diy-automatic-plant-watering-system-no-microcontroller-needed-ad1e54',
 'https://projecthub.arduino.cc/Aswinth/soil-moisture-sensor-with-arduino-91c818',
 'https://projecthub.arduino.cc/AndreaRichetta/low-power-library-for-arduino-c33-6a7c6a',
 'https://projecthub.arduino.cc/petros_mpla/helios-cube-80a316',
 'https://projecthub.arduino.cc/metehocax/arduino-plug-and-make-kit-all-modulino-demo-99ad53',
 'https://projecthub.arduino.cc/lucasfernando/ultrasonic-sensor-with-arduino-complete-guide-284faf',
 'https://projecthub.arduino.cc/mad_mcu/how-to-play-rock-paper-scissor-with-a-time-of-flight-sensor-3d27ec',
 'https://projecthub.arduino.cc/lee_curiosity/a-beginners-guide-controlling-an-arduino-car-with-a-remote-5d4418',
 'https://projecthub.arduino.cc/angadiameya007/bluetooth-controlled-car-with-hc-05-module-e90493']

## Get all image links and save them to a file

In [8]:
def get_image_urls_from_page(link):
  """Collect image URLs from a single project page.

  The page data is read from the JSON embedded in the
  ``<script id="__NEXT_DATA__">`` tag. Image URLs come from the ``<img>``
  tags of the project description. When the description is missing or holds
  no ``<img>`` tag at all, the URL of the first attachment is used instead.
  URLs containing ".blob" or ".gif" are dropped.

  Args:
    link: URL of the project page.

  Returns:
    A list of image URLs. It is empty when nothing usable is found or when
    the page has no ``__NEXT_DATA__`` script.
  """
  def is_wanted(url):
    return ".blob" not in url and ".gif" not in url

  response = requests.get(link, timeout=30)
  soup = BeautifulSoup(response.content, "html.parser")

  next_data = soup.find("script", attrs={'id': "__NEXT_DATA__"})
  if next_data is None:
    # Without the embedded JSON there is nothing to extract from this page.
    return []
  data = json.loads(next_data.text)["props"]["pageProps"]["data"]

  description = data["project_description"]
  if description is not None:
    imgs = BeautifulSoup(description, "html.parser").find_all("img")
    if imgs:
      # An <img> without a src (or with an empty one) is skipped rather than
      # raising KeyError or yielding an empty URL.
      sources = (img.get("src") for img in imgs)
      return [src for src in sources if src and is_wanted(src)]

  # Fallback: only the first attachment is considered.
  attachments = data["attachments"]
  if attachments and is_wanted(attachments[0]["url"]):
    return [attachments[0]["url"]]
  return []

In [9]:
# Gather the image URLs of every project link collected so far.
image_urls = []
for link in sub_links:
  image_urls.extend(get_image_urls_from_page(link))

In [10]:
# Preview the first ten collected image URLs.
image_urls[:10]

['https://projects.arduinocontent.cc/c1aed6af-2ead-406b-a415-406478af75e9.jpg',
 'https://projects.arduinocontent.cc/21317194-5a96-46a1-9eba-fd9e3c68884d.jpg',
 'https://projects.arduinocontent.cc/8d831dad-f49c-444b-95b7-9fde29d99d66.jpg',
 'https://projects.arduinocontent.cc/810c52aa-5ea2-4e09-80cb-483b416fb71e.jpg',
 'https://projects.arduinocontent.cc/246c7991-9d1a-4c21-afe7-32c88a8ffe52.jpg',
 'https://projects.arduinocontent.cc/ba95fd76-828a-4e45-a59e-6dac0a1112bb.jpg',
 'https://circuitdigest.com/sites/default/files/inlineimages/u5/circuit-diagram-of-plant-watering.jpg',
 'https://circuitdigest.com/sites/default/files/inlineimages/u5/components-of-plant-watering.jpg',
 'https://circuitdigest.com/sites/default/files/inlineimages/u5/plant-watering-setup.jpg',
 'https://circuitdigest.com/sites/default/files/inlineimages/u4/Soil-Moisture-Sensor-Pinout.png']

In [None]:
from tqdm import tqdm


# Walk the listing pages one by one and accumulate every project link.
number_of_pages = 55
page_size = 100
links = []
for num in tqdm(range(number_of_pages)):
  # The loop index starts at 0, while the first page requested is page 1.
  links.extend(get_projects_link(page_size=page_size, page_num=num + 1))

In [11]:
def save_lines(filename,
              lines,
              mode='w'):
  """Write each string in ``lines`` to ``filename``, one per line.

  Args:
    filename: Path of the file to write.
    lines: Iterable of strings; a newline is appended to each one.
    mode: Mode passed to ``open`` ('w' overwrites, 'a' appends).
  """
  # The ``with`` block closes the file on exit, so no explicit close() call
  # is needed.
  with open(filename, mode) as f:
    f.writelines(line + "\n" for line in lines)

def get_lines(filename):
  """Read ``filename`` and return its lines as a list of strings.

  Each returned string keeps its trailing newline, if it has one.
  """
  with open(filename, 'r') as handle:
    lines = list(handle)
  return lines

In [None]:
# Resolve every project page to its image URLs. The URLs are appended to the
# links file after each page, so whatever was collected is already on disk if
# the loop stops early.
image_links_file = "/content/drive/MyDrive/scraping_arduino/image_links.txt"
for project_link in tqdm(links):
  page_image_urls = get_image_urls_from_page(project_link.strip())
  save_lines(filename=image_links_file,
             lines=page_image_urls,
             mode="a")

In [None]:
# Load the image URLs back from the links file; each entry still carries its
# trailing newline.
image_urls = get_lines("/content/drive/MyDrive/scraping_arduino/image_links.txt")

## Download images

In [15]:
from urllib.parse import urlparse

images_dir = "/content/images/"
# open() does not create missing directories, so make sure the target exists.
os.makedirs(images_dir, exist_ok=True)

# Files are named after the 1-based position of the URL in image_urls, so a
# skipped entry leaves a gap instead of renumbering the images after it.
for count, url in enumerate(tqdm(image_urls), start=1):
  url = url.strip()
  if not url or url == "/":
    continue

  # Take the extension from the URL path only, so a query string or fragment
  # never ends up in the file name.
  _, image_extension = os.path.splitext(urlparse(url).path)

  try:
    image = requests.get(url, timeout=30)
    # Do not save an HTTP error page under an image file name.
    image.raise_for_status()
  except requests.RequestException as error:
    # One dead link should not abort the whole download run.
    tqdm.write(f"Skipping {url}: {error}")
    continue

  image_path = os.path.join(images_dir, str(count) + image_extension)
  with open(image_path, "wb") as f:
    f.write(image.content)



281it [00:35,  7.90it/s]
