# Web Scrape

### Import Libraries & Init URL

In [4]:
import requests
import urllib.request
import time
import pandas as pd
from bs4 import BeautifulSoup

# display progress bar (tqdm>=4.23.4 | pandas==0.24.0)
from tqdm import tqdm_notebook as tqdm 

In [97]:
# init request
# baseUrl is the archive root; the relative links scraped later are joined onto it
baseUrl = "http://www.it.kmitl.ac.th/~teerapong/news_archive"
homeUrl = f"{baseUrl}/index.html"

# fetch the archive index page
# - timeout: requests waits forever by default, which would hang the kernel on a stalled server
# - raise_for_status: fail here on an HTTP error instead of parsing an error page below
response = requests.get(homeUrl, timeout=30)
response.raise_for_status()

### Extract Month URL

In [114]:
# parse html
# hand bs4 the raw bytes: from_encoding is ignored (with a warning) when the markup
# is an already-decoded str such as response.text
soup = BeautifulSoup(response.content, "html.parser", from_encoding="utf-8")

In [115]:
# create month url
# every <li> that carries an <a href> contributes one url (presumably one per month page);
# list items without a link are skipped instead of raising
li_group = soup.find_all('li')
month_links = (li.find('a', href=True) for li in li_group)
month_url_group = [f"{baseUrl}/{link['href']}" for link in month_links if link is not None]

### Extract Article Properties 

In [135]:
# use url each month to fetch article
# the three lists are index-aligned: entry i of each one describes the same article
article_category = []
article_title = []
article_url = []

# placeholder texts that mark an entry as unavailable
category_missing = "N/A"
title_missing = "Article no longer available in archive"

# loop each month
for month in month_url_group:

    # init soup (raw bytes, so that from_encoding is actually honoured by bs4)
    month_resp = requests.get(month)
    month_soup = BeautifulSoup(month_resp.content, "html.parser", from_encoding="utf-8")

    category_group = month_soup.find_all('td', {'class': 'category'})
    title_group = month_soup.find_all('td', {'class': 'title'})

    # the pairing below assumes one category cell per title cell, in the same order
    # NOTE(review): confirm against the page markup
    if len(category_group) != len(title_group):
        print(f"Warning: {month} has {len(category_group)} category cells "
              f"but {len(title_group)} title cells")

    # append category, title & url together so a skipped entry can never
    # shift one list relative to the others
    for category, title in zip(category_group, title_group):
        category_text = category.getText().strip()
        title_text = title.getText().strip()

        # article category or title not available -> skip the whole entry
        if category_text == category_missing or title_text == title_missing:
            continue

        article_category.append(category_text)
        article_title.append(title_text)
        article_url.append(f"{baseUrl}/{title.find('a')['href']}")


# display article properties length (check length is match)
print("Category: " + str(len(article_category)))
print("Title: " + str(len(article_title)))
print("Url: " + str(len(article_url)))

Category: 1408
Title: 1408
Url: 1408


### Extract Article Content

In [134]:
# use article url to fetch article content
article_content = []

# only the first 100 article urls are fetched
articles_to_fetch = article_url[:100]

# loop each article; wrapping the list in tqdm sizes the progress bar from the
# urls actually processed and advances it once per iteration
for article in tqdm(articles_to_fetch):

    # raw bytes, so that from_encoding is actually honoured by bs4
    article_resp = requests.get(article)
    soup = BeautifulSoup(article_resp.content, "html.parser", from_encoding="utf-8")

    current_content = []

    # the last <p> of the page is left out
    # NOTE(review): presumably a footer / navigation paragraph — confirm
    article_group = soup.find_all('p')
    for content in article_group[:-1]:

        # check <p> is empty?
        if content.text == "":
            continue
        current_content.append(content.text.rstrip("\n\r"))

    # join each <p> to raw string (no separator) and append to article_content
    article_content.append(''.join(current_content))

# display article content length
print("Content: " + str(len(article_content)))

HBox(children=(IntProgress(value=0), HTML(value='')))


Content: 100


### Write Files

In [140]:
# write article_title.txt (one title per line)
# the with-statement closes the file on exit, so no explicit close() is needed
with open("./datastore/article_title.txt", "w", encoding="utf-8") as file:
    file.writelines("%s\n" % row for row in article_title)

# write article_content.txt (one article per line)
with open("./datastore/article_content.txt", "w", encoding="utf-8") as file:
    file.writelines("%s\n" % row for row in article_content)

### Read File

In [145]:
# read article_title.txt into a list of lines
# the with-statement closes the file on exit, so no explicit close() is needed
with open("./datastore/article_title.txt", "r", encoding="utf-8") as file:
    title_raw = file.read().splitlines()

# read article_content.txt into a list of lines
with open("./datastore/article_content.txt", "r", encoding="utf-8") as file:
    content_raw = file.read().splitlines()

In [150]:
# build one DataFrame per text file: each line read from disk becomes a row
df_title = pd.DataFrame(data=title_raw)
df_content = pd.DataFrame(data=content_raw)

In [151]:
# print the title DataFrame as plain text
print(df_title)

                                                      0
0     21st-Century Sports: How Digital Technology Is...
1                      Asian quake hits European shares
2                        BT offers free net phone calls
3                     Barclays shares up on merger talk
4                      Barkley fit for match in Ireland
5                                Bellamy under new fire
6                     Benitez 'to launch Morientes bid'
7                     Benitez delight after crucial win
8                           Big war games battle it out
9                     British Library gets wireless net
10                    Brizzel to run AAA's in Sheffield
11                      Bush budget seeks deep cutbacks
12                       Bush to get 'tough' on deficit
13                         Cable offers video-on-demand
14                     Cabs collect mountain of mobiles
15                       Camera phones are 'must-haves'
16                      Card fraudsters 'targeti