## Scraping Data from the Web
- Source: Hacker News (https://news.ycombinator.com/)
- Goal: extract the story links that have at least 100 points (votes)

In [None]:
# install required packages
!pip install beautifulsoup4
!pip install requests

In [51]:
# import the library
from bs4 import BeautifulSoup
import requests

In [59]:
# define a function that downloads a page, parses its HTML and selects the elements matching a CSS selector
def html_parser(url, class_att, timeout=10):
  """
  Download a web page and return the elements that match a CSS selector.
  : param url: address of the page to download
  : param class_att: CSS selector handed to BeautifulSoup's select(), e.g. ".titleline"
  : param timeout: default = 10, seconds to wait for the server before giving up
  : return: list-like collection of the matching elements (empty when nothing matches)
  : raises requests.exceptions.RequestException: on connection failure, timeout or HTTP error status
  """
  # get response from the target url; without a timeout the call can block forever
  res = requests.get(url, timeout=timeout)

  # stop on 4xx/5xx answers instead of parsing an error page and silently returning nothing
  res.raise_for_status()

  # use BeautifulSoup to parse the html file
  soup = BeautifulSoup(res.text, 'html.parser')

  # use the select method to get the desired class
  return soup.select(class_att)

In [60]:
# define a function that sorts a list of dictionaries
def sort_dict(ld, col_att, ascending = True):
  """
  Return a new list holding the dictionaries of ld, ordered by the value each one stores under col_att.
  : param ld: list of dictionaries to be sorted (left unmodified)
  : param col_att: key whose value is used for the ordering
  : param ascending: default = True, set to False for a descending sort
  : return: new list of the same dictionaries in the requested order
  """
  def pick_value(record):
    # value the ordering is based on
    return record[col_att]

  # work on a copy so the caller's list keeps its order
  ordered = list(ld)
  ordered.sort(key=pick_value, reverse=not ascending)
  return ordered

In [86]:
# define a function that extracts the needed data
def create_custom_hn(links, subtext, min_score=100):
    """
    Function that extracts the title, link and score of every story that reached min_score
    :param links: list of title elements extracted from the web; each is expected to wrap an <a> tag
    :param subtext: list of subtext elements extracted from the web, paired with links by position
    :param min_score: default = 100, lowest score (inclusive) a story needs in order to be kept
    :return: list of dictionaries with the keys 'title', 'link' and 'score'
    """
    hn = []
    # zip pairs the two lists by position and stops at the shorter one,
    # so a length mismatch cannot raise an IndexError
    for item, sub in zip(links, subtext):
        anchor = item.a
        if anchor is None:
            # entry without a link, nothing to report
            continue

        vote = sub.select('.score')
        if not vote:
            # some entries carry no score at all
            continue

        try:
            # the text reads "123 points" or "1 point"; the first token is the number
            score = int(vote[0].getText().split()[0])
        except (IndexError, ValueError):
            # empty or non-numeric score text: skip the entry rather than abort the scrape
            continue

        if score >= min_score:
            hn.append({'title': anchor.getText(), 'link': anchor.get('href'), 'score': score})
    return hn

In [100]:
import time
from urllib.parse import urljoin

# main execution of files to get whole list of links from hn
url_main = 'https://news.ycombinator.com/'
url = url_main
hn = []

# to avoid too frequent scraping from the page causing bad effect we put on hold 5 secs between pages
delay_seconds = 5

loop_is_on = True

while loop_is_on:
  # download every page only once: one request instead of three keeps the load on the
  # site low and guarantees that titles and scores come from the same snapshot
  res = requests.get(url, timeout=10)
  res.raise_for_status()
  soup = BeautifulSoup(res.text, 'html.parser')

  # get list of links under the class of titleline
  links = soup.select(".titleline")

  # get list of points of vote under the class of score (we first extract the subtext as there are occurences of no vote)
  subtext = soup.select(".subtext")

  # get the url for next page which is under class of morelink
  morelink = soup.select(".morelink")

  # extract data and put into list of dictionaries
  hn.extend(create_custom_hn(links, subtext))

  # if next page exist, continue the loop
  next_href = morelink[0].get('href') if len(morelink) else None
  if next_href:
    # urljoin resolves the relative href against the site root
    url = urljoin(url_main, next_href)
    # wait only when another request is going to follow
    time.sleep(delay_seconds)
  else:
    loop_is_on = False

# sort once at the end; sort_dict returns a new list, so the result has to be assigned
hn = sort_dict(hn, 'score', False)

In [101]:
# put the data into one dataframe for better view
import pandas as pd

# hn is a list of records, which the DataFrame constructor accepts directly
# (from_dict is documented for dict input); naming the columns keeps the table
# layout stable even when no story passed the score filter
hn_table = pd.DataFrame(hn, columns=['title', 'link', 'score'])
hn_table

Unnamed: 0,title,link,score
0,OpenAI is now everything it promised not to be...,https://www.vice.com/en/article/5d3naz/openai-...,1570
1,Introducing ChatGPT and Whisper APIs,https://openai.com/blog/introducing-chatgpt-an...,1336
2,Jailbreak Chat: A collection of ChatGPT jailbr...,https://www.jailbreakchat.com,1099
3,The Camera-Shy Hoodie,https://www.macpierce.com/the-camera-shy-hoodie,775
4,Keep your AI claims in check,https://www.ftc.gov/business-guidance/blog/202...,754
...,...,...,...
171,Make JDK source code UTF-8,https://bugs.openjdk.org/browse/JDK-8301971,102
172,Hacking with Style: TrueType VT220 Font (2009),http://sensi.org/~svo/glasstty/,101
173,Who rules Earth? Wild mammals far outweighed b...,https://www.science.org/content/article/who-ru...,101
174,W3C Beta Website,https://beta.w3.org/,101
