In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive/Pattern/. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pythainlp

In [None]:
import pythainlp
from pythainlp import word_tokenize

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib
import os
import re
import urllib.request
import json
from tqdm import tqdm

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.firefox import GeckoDriverManager

In [None]:
def preprocess_sentence(x):
    """Tokenize ``x`` with ``word_tokenize`` and return the cleaned tokens.

    Every token is stripped of surrounding whitespace; tokens that are empty
    after stripping are discarded. Token order is preserved.
    """
    stripped_tokens = (token.strip() for token in word_tokenize(x))
    return [token for token in stripped_tokens if token != ""]

In [None]:
def preprocess_list(x, drop_last=4):
    """Tokenize a list of sentences into one flat token list.

    Args:
        x: Sequence of sentence strings (e.g. the paragraphs of one article).
        drop_last: Number of trailing sentences to discard before tokenizing.
            Defaults to 4, the value that used to be hard-coded here.
            NOTE(review): presumably this removes footer/credit paragraphs
            added by the news sites -- confirm per source.

    Returns:
        The concatenated tokens of every kept sentence, in order. If ``x`` has
        ``drop_last`` or fewer sentences the result is an empty list.

    Raises:
        ValueError: If ``drop_last`` is negative.
    """
    if drop_last < 0:
        raise ValueError("drop_last must be >= 0")

    # Slice by an explicit end index: ``x[:-0]`` would wrongly yield nothing.
    kept = x[:max(len(x) - drop_last, 0)]

    output = []
    for sentence in kept:
        output.extend(preprocess_sentence(sentence))
    return output

In [None]:
def process_data(file_path, out_path):
    """Tokenize a scraped JSON-lines file and write the labelled dataset.

    Args:
        file_path: JSON-lines input; each record needs a ``Title`` string and a
            ``Detail`` list of strings.
        out_path: Destination JSON-lines file (overwritten).

    Each output line is one record serialized with ``Series.to_json`` holding
    the tokenized ``Title``, the tokenized ``Detail`` and a constant
    ``Document Tag`` of ``"Fact News"``.
    """
    df = pd.read_json(path_or_buf=file_path, lines=True)

    df["Title"] = df["Title"].apply(preprocess_sentence)
    df["Detail"] = df["Detail"].apply(preprocess_list)
    df["Document Tag"] = "Fact News"

    # The context manager closes the file even if serialization fails midway.
    # Rows are serialized directly, so no scratch "json" column is needed and
    # an empty frame simply produces an empty file.
    with open(out_path, "w", encoding="utf8") as file_data:
        for _, row in df.iterrows():
            file_data.write(row.to_json() + "\n")

## kapook

In [None]:
def _write_kapook_links(html_source, out_file):
    """Write every href inside the ``ul.hits2`` listing to ``out_file``; return the count."""
    listing = BeautifulSoup(html_source, 'html.parser').find("ul", {"class": "hits2"})
    if listing is None:
        raise ValueError("kapook listing <ul class='hits2'> not found in page source")
    links = listing.find_all("a", href=True)
    for link in links:
        out_file.write(link["href"].strip() + "\n")
    return len(links)


def scrape_kapook_url():
    """Collect article URLs from the kapook health news listing.

    Drives a headless Firefox, clicks the "load more" anchor until it is no
    longer displayed, and appends the listing's links to ``kapook_url.txt``
    every 100 clicks as a checkpoint. When paging ends, the final set of links
    is appended to ``kapook_url_done.txt``; if the final page cannot be parsed,
    the raw HTML is written there instead so nothing is lost.

    The WebDriver is always shut down, including on errors.
    """
    base_url = "https://health.kapook.com/news"
    loadmore_xpath = "//a[@id='loadmore']"
    # Give up after this many failures in a row instead of retrying forever.
    max_consecutive_errors = 5

    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('--headless')
    firefox_options.add_argument('--no-sandbox')
    firefox_options.add_argument('--disable-dev-shm-usage')

    # NOTE(review): recent Selenium 4 releases dropped the `executable_path`
    # keyword in favour of a Service object -- confirm against the installed version.
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=firefox_options)

    try:
        driver.get(base_url)
        loadmore = driver.find_element(by=By.XPATH, value=loadmore_xpath)
        j = 0
        consecutive_errors = 0

        while loadmore.is_displayed():
            try:
                loadmore.click()
                time.sleep(0.5)  # let the newly requested items render
                if j % 100 == 0:
                    print("-- {} --".format(j))
                    with open("kapook_url.txt", "a") as f:
                        _write_kapook_links(driver.page_source, f)

                j += 1
                loadmore = driver.find_element(by=By.XPATH, value=loadmore_xpath)
                consecutive_errors = 0

            except Exception as e:
                print(e)
                consecutive_errors += 1
                if consecutive_errors >= max_consecutive_errors:
                    break

        # Grab the page source while the session is still alive.
        html_source = driver.page_source
    finally:
        driver.quit()

    with open("kapook_url_done.txt", "a") as f:
        try:
            print(_write_kapook_links(html_source, f))
        except Exception as e:
            # Fall back to dumping the raw page so the crawl is not lost.
            print(e)
            f.write(html_source)

In [None]:
def clean_duplicate_kapook(file_url, out_path="out_url_process.txt"):
    """De-duplicate kapook article URLs and reduce them to article ids.

    Args:
        file_url: Text file with one URL per line, expected to look like
            ``.../view<ID>.html``.
        out_path: File the unique ids are appended to, one per line
            (defaults to the previously hard-coded ``out_url_process.txt``).

    Returns:
        The set of unique ids, i.e. the text between the first ``view`` and a
        trailing ``.html``.

    Blank lines and lines without ``view`` are skipped instead of raising
    ``IndexError``. The ``.html`` suffix is removed only when present, rather
    than blindly chopping the last five characters.
    """
    suffix = ".html"
    url = set()

    with open(file_url, "r") as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue

            _, marker, tail = line.partition("view")
            if not marker:
                print("Skipping unrecognized url: {}".format(line))
                continue

            if tail.endswith(suffix):
                tail = tail[:-len(suffix)]
            url.add(tail)

    print("There are {} urls".format(len(url)))

    # Sorted so the output file is deterministic across runs.
    with open(out_path, "a") as out:
        for _url in sorted(url):
            out.write(_url + "\n")

    return url

In [None]:
def scrape_kapook_data(
    url_path="/content/drive/MyDrive/Pattern/out_url_process.txt",
    data_path="/content/drive/MyDrive/Pattern/kapook_data.json",
):
    """Download kapook articles and store them as JSON lines.

    Args:
        url_path: File with one article id per line (as written by
            ``clean_duplicate_kapook``).
        data_path: Output JSON-lines file (overwritten). Each line holds
            ``{"Title": str, "Detail": [str, ...]}``.

    Articles that fail to download or parse are reported and skipped. Both
    files are closed even if the crawl is interrupted.
    """
    base_url = "https://health.kapook.com/view{}.html"
    # Chunks containing these credit markers are dropped entirely.
    skip_markers = ("เรียบเรียงข้อมูลโดย", "ขอขอบคุณภาพประกอบจาก")
    # Text from this marker onwards is cut off the chunk.
    cut_marker = "อ่านรายละเอียดเพิ่มเติมจาก"

    with open(url_path, "r", encoding="utf8") as file_url, open(data_path, "w", encoding="utf8") as file_data:
        for url in tqdm(file_url):
            url = url.strip()
            if url == "":
                continue  # a blank line would otherwise request ".../view.html"

            try:
                with urllib.request.urlopen(base_url.format(url)) as _res:
                    res = _res.read().decode("utf8")
                soup = BeautifulSoup(res, 'html.parser')

                title = soup.find("h1", {"itemprop": "headline"})
                title = title.get_text().strip()

                body = soup.find("div", {"class": "content"})

                # Remove anchors so link text does not end up in the article body.
                for anchor in body("a"):
                    anchor.extract()

                content = []
                for chunk in body.get_text().split(" "):
                    _text = chunk.strip()
                    if _text == "":
                        continue
                    if any(marker in _text for marker in skip_markers):
                        continue
                    if cut_marker in _text:
                        _text = _text[:_text.index(cut_marker)]
                    content.append(_text)

                data = {
                    "Title": title,
                    "Detail": content
                }

                file_data.write(json.dumps(data, ensure_ascii=False) + "\n")

            except Exception as e:
                print("\n")
                print(url)
                print(e)
                continue

In [None]:
# Run the kapook article crawl (network-bound; writes kapook_data.json on Drive).
scrape_kapook_data()

In [None]:
# Input (raw scrape) and output (tokenized dataset) locations for kapook.
file_path = "/content/drive/MyDrive/Pattern/kapook_data.json"
out_path = "/content/drive/MyDrive/Pattern/dataset/kapook_dataset.json"
# The processing step is disabled here; uncomment to regenerate the dataset file.
# process_data(file_path, out_path)

In [None]:
# Sanity check: load the dataset at `out_path` and show summary statistics.
# This reads whatever file already exists at `out_path`.
check = pd.read_json(path_or_buf=out_path, lines=True)
check.describe()

## sanook

In [None]:
def scrape_sanook_url():
    """Collect article URLs from the sanook health listing using headless Firefox.

    Clicks the first button (looked up by its ``jsx-1854747484 button`` class),
    then keeps clicking the pagination button while it is displayed. Every 100
    clicks the links found so far are appended to ``sanook_url_py.txt``; at the
    end the full list is appended to ``sanook_url_done_py.txt``. Paging stops
    at the first error.

    Note: this notebook defines ``scrape_sanook_url`` a second time further
    down (a Chrome-based variant); whichever cell ran last is the one in effect.

    The WebDriver is always shut down, including on errors.
    """
    base_url = "https://www.sanook.com/health/"
    loadmore_xpath = "//button[@class='jsx-3493116903 bg-color-health pagination typeDefault']"

    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('--headless')
    firefox_options.add_argument('--no-sandbox')
    firefox_options.add_argument('--disable-dev-shm-usage')

    # NOTE(review): recent Selenium 4 releases dropped the `executable_path`
    # keyword in favour of a Service object -- confirm against the installed version.
    driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=firefox_options)

    def dump_links(path):
        """Append every article link on the current page to ``path``; return the count."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        urls = soup.find_all("a", {"class": "jsx-1104899621 EntryListImage"}, href=True)
        with open(path, "a") as f:
            f.write("\n")
            for _url in urls:
                f.write(_url["href"].strip() + "\n")
        return len(urls)

    try:
        driver.get(base_url)

        driver.find_element(by=By.XPATH, value="//button[@class='jsx-1854747484 button']").click()
        loadmore = driver.find_element(by=By.XPATH, value=loadmore_xpath)
        j = 0

        while loadmore.is_displayed():
            try:
                loadmore.click()
                time.sleep(0.5)  # let the newly requested items render
                if j % 100 == 0:
                    print("-- {} --".format(j))
                    print(dump_links("sanook_url_py.txt"))

                j += 1
                loadmore = driver.find_element(by=By.XPATH, value=loadmore_xpath)

            except Exception as e:
                print(e)
                break

        # Final dump happens while the session is still alive; report failures
        # instead of swallowing them.
        try:
            print(dump_links("sanook_url_done_py.txt"))
        except Exception as e:
            print(e)
    finally:
        driver.quit()

In [None]:
def clean_duplicate(file_url):
    """Read sanook article URLs from a file and return the unique article paths.

    Args:
        file_url: Text file with one URL per line, expected to start with
            ``https://www.sanook.com/health/``.

    Returns:
        Set of the URL parts after that prefix, without trailing slashes
        (e.g. ``"12345"`` for ``https://www.sanook.com/health/12345/``).

    Blank lines, lines without the expected prefix, and the bare listing URL
    are skipped. The prefix and trailing slash are removed explicitly instead
    of via a fixed ``[30:-1]`` slice, so a URL without a trailing slash no
    longer loses its last character.
    """
    prefix = "https://www.sanook.com/health/"
    url = set()

    with open(file_url, "r") as f:
        for line in f:
            line = line.strip()

            if line == "":
                continue

            if not line.startswith(prefix):
                print("Skipping unrecognized url: {}".format(line))
                continue

            _url = line[len(prefix):].rstrip("/")
            if _url:
                url.add(_url)

    print("There are {} urls".format(len(url)))

    return url

In [None]:
def scrape_sanook_data(
    file_url="/content/drive/MyDrive/Pattern/sanook_url_colab.txt",
    data_path="/content/drive/MyDrive/Pattern/dataset/sanook_data.json",
):
    """Download sanook health articles and store them as JSON lines.

    Args:
        file_url: Text file of article URLs; de-duplicated via ``clean_duplicate``.
        data_path: Output JSON-lines file (overwritten). Each line holds
            ``{"Title": str, "Detail": [str, ...]}`` where ``Detail`` is the
            non-empty text of every ``p``/``h3``/``li`` in the article body.

    Articles that fail to download or parse are reported and skipped. The
    output file is closed even if the crawl is interrupted.
    """
    urls = clean_duplicate(file_url)
    base_url = "https://www.sanook.com/health/{}/"

    with open(data_path, "w", encoding="utf8") as file_data:
        for url in tqdm(urls):
            try:
                with urllib.request.urlopen(base_url.format(url)) as _res:
                    res = _res.read().decode("utf8")
                soup = BeautifulSoup(res, 'html.parser')

                title = soup.find("h1", {"class": "jsx-2761676397 title"})
                title = title.get_text().strip()

                body = soup.find("div", {"class": "jsx-3647499928 jsx-3717305904"})
                content = []
                for node in body.find_all(["p", "h3", "li"]):
                    text = node.text.strip()
                    if text != "":
                        content.append(text)

                data = {
                    "Title": title,
                    "Detail": content
                }

                file_data.write(json.dumps(data, ensure_ascii=False) + "\n")

            except Exception as e:
                print("\n")
                print(url)
                print(e)
                continue

In [None]:
# Run the sanook article crawl (network-bound; writes sanook_data.json on Drive).
scrape_sanook_data()

In [None]:
# Tokenize the raw sanook scrape into the final dataset file.
# Note: this rebinds the `file_path` / `out_path` names used by the kapook cells above.
file_path = "/content/drive/MyDrive/Pattern/dataset/sanook_data.json"
out_path = "/content/drive/MyDrive/Pattern/dataset/sanook_dataset.json"
process_data(file_path, out_path)

In [None]:
# Sanity check: load the dataset at `out_path` and preview the first rows.
check = pd.read_json(path_or_buf=out_path, lines=True)
check.head()

In [None]:
# Summary statistics of the frame loaded into `check`.
check.describe()

In [None]:
def scrape_sanook_url():
    """Collect article URLs from the sanook health listing using headless Chrome.

    Clicks the first button (looked up by its ``jsx-1854747484 button`` class),
    then keeps clicking the pagination button while it is displayed. Progress
    is printed every 100 clicks; once more than 1000 clicks have been made,
    each of those checkpoints also appends the links found so far to
    ``sanook_url_py.txt``. At the end the full list is appended to
    ``sanook_url_done_py.txt``. Paging stops at the first error.

    Note: this is the second definition of ``scrape_sanook_url`` in this
    notebook; it shadows the earlier Firefox-based one once this cell has run.

    The WebDriver is always shut down, including on errors.
    """
    base_url = "https://www.sanook.com/health/"
    loadmore_xpath = "//button[@class='jsx-3493116903 bg-color-health pagination typeDefault']"

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # NOTE(review): passing the driver path positionally is the legacy Selenium
    # API; newer releases expect a Service object -- confirm the installed version.
    driver = webdriver.Chrome('chromedriver', options=chrome_options)

    def dump_links(path):
        """Append every article link on the current page to ``path``; return the count."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        urls = soup.find_all("a", {"class": "jsx-1104899621 EntryListImage"}, href=True)
        with open(path, "a") as f:
            f.write("\n")
            for _url in urls:
                f.write(_url["href"].strip() + "\n")
        return len(urls)

    try:
        driver.get(base_url)

        driver.find_element(by=By.XPATH, value="//button[@class='jsx-1854747484 button']").click()
        loadmore = driver.find_element(by=By.XPATH, value=loadmore_xpath)
        j = 0

        while loadmore.is_displayed():
            try:
                loadmore.click()
                time.sleep(0.5)  # let the newly requested items render
                if j % 100 == 0:
                    print("-- {} --".format(j))

                    # Only checkpoint to disk late in the crawl.
                    if j > 1000:
                        print(dump_links("sanook_url_py.txt"))

                j += 1
                loadmore = driver.find_element(by=By.XPATH, value=loadmore_xpath)

            except Exception as e:
                print(e)
                break

        # Final dump happens while the session is still alive; report failures
        # instead of swallowing them.
        try:
            print(dump_links("sanook_url_done_py.txt"))
        except Exception as e:
            print(e)
    finally:
        driver.quit()

In [None]:
# Run the sanook URL crawl; this resolves to the most recently executed
# definition of scrape_sanook_url (the notebook defines it twice).
scrape_sanook_url()

## matichon

In [None]:
def scrape_matichon_url(max_page=98):
    """Collect article URLs from the matichon health & beauty listing.

    Args:
        max_page: Number of listing pages to visit (defaults to the previously
            hard-coded 98). The first page is the base URL, later pages are
            ``<base>/page/<n>``.

    Every article link found is appended to ``matichon_url.txt``. A page that
    fails to download or parse is reported with its page number and skipped;
    it no longer aborts the whole crawl. The output file is always closed.
    """
    base_url = "https://www.matichon.co.th/lifestyle/health-beauty"

    with open("matichon_url.txt", "a") as f:
        for page in tqdm(range(0, max_page)):
            page_url = base_url if page == 0 else base_url + "/page/" + str(page + 1)

            try:
                with urllib.request.urlopen(page_url) as _res:
                    res = _res.read().decode("utf8")
                soup = BeautifulSoup(res, 'html.parser')

                main_column = soup.find("div", {"class": "td-pb-span8 td-main-content"})
                blogs = main_column.find_all("div", {"class": "td-module-thumb"})

                for blog in blogs:
                    link = blog.find("a", {"class": "ud-module-link"})
                    # Skip thumbnails without a usable link instead of
                    # abandoning the rest of the page.
                    if link is None or not link.get("href"):
                        continue
                    f.write(link["href"].strip() + "\n")

            except Exception as e:
                print(e)
                print(page + 1)
                continue

In [None]:
# Run the matichon URL crawl (network-bound; appends to matichon_url.txt).
scrape_matichon_url()

In [None]:
def scrape_matichon_data(
    url_path="/home/natthanon/pattern_project/matichon_url.txt",
    data_path="/home/natthanon/pattern_project/matichon_data.json",
):
    """Download matichon articles and store them as JSON lines.

    Args:
        url_path: Text file with one article URL per line.
        data_path: Output JSON-lines file (overwritten). Each line holds
            ``{"Title": str, "Detail": [str, ...]}`` where ``Detail`` is the
            text of every non-empty ``<p>`` in the article body that contains
            no link.

    The defaults are the original developer-local paths; pass your own when
    running elsewhere. Articles that fail to download or parse are reported
    and skipped. Both files are closed even if the crawl is interrupted.
    """
    with open(url_path, "r") as file_url, open(data_path, "w", encoding="utf8") as file_data:
        for url in tqdm(file_url):
            # Lines read from a file keep their trailing newline, which must
            # not be passed on to urlopen.
            url = url.strip()
            if url == "":
                continue

            try:
                with urllib.request.urlopen(url) as _res:
                    res = _res.read().decode("utf8")
                soup = BeautifulSoup(res, 'html.parser')

                title = soup.find("h1", {"class": "entry-title"})
                title = title.get_text().strip()

                body = soup.find("div", {"itemprop": "articleBody"})
                content = []
                for paragraph in body.find_all(["p"]):
                    if paragraph.find_all(["a"]):
                        continue  # paragraphs containing links are excluded
                    text = paragraph.text.strip()
                    if text != "":
                        content.append(text)

                data = {
                    "Title": title,
                    "Detail": content
                }

                file_data.write(json.dumps(data, ensure_ascii=False) + "\n")

            except Exception as e:
                print(e)
                continue

## bbc

In [None]:
def scrape_bbc_url(max_page=100):
    """Collect article URLs from the BBC Thai topic listing.

    Args:
        max_page: Number of listing pages to visit, starting at page 1
            (defaults to the previously hard-coded 100).

    Each ``a.qa-story-cta-link`` href is prefixed with ``https://www.bbc.com``
    and appended to ``bbc_url.txt``. A page that fails to download or parse is
    reported and skipped; it no longer aborts the whole crawl. The output file
    is always closed.
    """
    base_url = "https://www.bbc.com/thai/topics/cyx5kz25zxdt/page/"
    _url = "https://www.bbc.com"

    with open("bbc_url.txt", "a") as f:
        for page in tqdm(range(max_page)):
            try:
                res = requests.get(base_url + str(page + 1))
                soup = BeautifulSoup(res.text, 'html.parser')

                # href=True skips anchors without a target instead of raising.
                for blog in soup.find_all("a", {"class": "qa-story-cta-link"}, href=True):
                    f.write(_url + blog["href"] + "\n")

            except Exception as e:
                print(e)
                continue

In [None]:
# Run the BBC Thai URL crawl (network-bound; appends to bbc_url.txt).
scrape_bbc_url()

In [None]:
def scrape_bbc_data(url_path="/content/bbc_url.txt", data_path="/content/bbc_data.json"):
    """Download BBC Thai articles and store them as JSON lines.

    Args:
        url_path: Text file with one article URL per line.
        data_path: Output JSON-lines file (overwritten). Each line holds
            ``{"title": str, "content": [str, ...]}`` where ``content`` is the
            text of every non-empty ``p``/``h2``/``li`` inside ``<main>`` that
            contains no ``span`` or ``a``.

    NOTE(review): the keys here are lower-case ``title``/``content``, while
    ``process_data`` reads ``Title``/``Detail`` -- confirm which processing
    step consumes this file.

    Articles that fail to download or parse are reported and skipped. Both
    files are closed even if the crawl is interrupted.
    """
    with open(url_path, "r") as file_url, open(data_path, "w", encoding="utf8") as file_data:
        for url in tqdm(file_url):
            url = url.strip()
            if url == "":
                continue

            try:
                res = requests.get(url)
                soup = BeautifulSoup(res.text, "html.parser")
                soup = soup.find("main", {"role": "main"})

                title = soup.find("h1", {"id": "content"})
                title = title.get_text().strip()
                print(title)

                content = []
                for node in soup.find_all(["p", "h2", "li"]):
                    if node.find_all(["span", "a"]):
                        continue  # elements wrapping spans or links are excluded
                    text = node.text.strip()
                    if text != "":
                        content.append(text)

                data = {
                    "title": title,
                    "content": content
                }

                file_data.write(json.dumps(data, ensure_ascii=False) + "\n")

            except Exception as e:
                print(e)

In [None]:
# Run the BBC Thai article crawl (network-bound; writes /content/bbc_data.json).
scrape_bbc_data()

## pptvhd36

In [None]:
def scrape_pptvhd36_url(max_page=141):
    """Collect article URLs from the pptvhd36 health news listing.

    Args:
        max_page: Number of listing pages to visit, starting at ``?page=1``
            (defaults to the previously hard-coded 141).

    Article links are taken from the second ``div.pptv-grid`` on each page and
    appended to ``pptvhd36_url.txt``. A page that fails to download, lacks
    that grid, or cannot be parsed is reported and skipped; it no longer
    aborts the whole crawl. The output file is always closed.
    """
    base_url = "https://www.pptvhd36.com/news/%E0%B8%AA%E0%B8%B8%E0%B8%82%E0%B8%A0%E0%B8%B2%E0%B8%9E?page="

    with open("pptvhd36_url.txt", "a") as f:
        for page in tqdm(range(max_page)):
            try:
                res = requests.get(base_url + str(page + 1))
                soup = BeautifulSoup(res.text, 'html.parser')

                grids = soup.find_all("div", {"class": "pptv-grid"})
                # The article cards live in the second grid; indexing happens
                # inside the try so a missing grid only skips this page.
                blogs = grids[1].find_all("div", {"class": "pptv-col-3@m pptv-col-6@s"})

                for blog in blogs:
                    link = blog.find("a", {"class": "content-item__thumb"})
                    if link is None or not link.get("href"):
                        continue
                    f.write(link["href"] + "\n")

            except Exception as e:
                print(e)
                print(page + 1)
                continue

In [None]:
# scrape_pptvhd36_url()

In [None]:
def scrape_pptvhd36_data(
    url_path="/content/drive/MyDrive/Pattern/pptvhd36_url.txt",
    data_path="/content/drive/MyDrive/Pattern/pptvhd36_data.json",
):
    """Download pptvhd36 articles and store them as JSON lines.

    Args:
        url_path: Text file with one article URL per line.
        data_path: Output JSON-lines file (overwritten). Each line holds
            ``{"title": str, "subcontent": str, "content": [str, ...]}``:
            the headline, the excerpt paragraph, and the text of every
            non-empty ``p``/``li`` in ``#content-section`` (except the last
            one) that contains no ``a`` or ``section``.

    Returns:
        None. The previous ``return out`` referenced an undefined name, which
        raised ``NameError`` after the crawl and left both files unclosed.

    Articles that fail to download or parse are reported and skipped. Both
    files are closed even if the crawl is interrupted.
    """
    with open(url_path, "r") as file_url, open(data_path, "w", encoding="utf8") as file_data:
        for url in tqdm(file_url):
            # Drop the trailing newline from the file line before requesting it.
            url = url.strip()
            if url == "":
                continue

            try:
                res = requests.get(url)
                soup = BeautifulSoup(res.text, "lxml")

                title = soup.find("h1", {"class": "section--head-line__title [ heading --large@m --small@s --tiny color-black bold ]"})
                title = title.get_text().strip()

                subcontent = soup.find("div", {"class": "content-details__body"})
                subcontent = subcontent.find("section", {"class": "content-details__section section section--excerpt content-container color-black"})
                subcontent = subcontent.p.get_text().strip()

                content = []
                nodes = soup.find(id="content-section").find_all(["p", "li"])
                # The final element is deliberately left out, as before.
                for node in nodes[:-1]:
                    if node.find_all(["a", "section"]):
                        continue
                    text = node.text.strip()
                    if text != "":
                        content.append(text)

                data = {
                    "title": title,
                    "subcontent": subcontent,
                    "content": content
                }

                file_data.write(json.dumps(data, ensure_ascii=False) + "\n")

            except Exception as e:
                print(e)

In [None]:
# Run the pptvhd36 article crawl (network-bound; writes pptvhd36_data.json on Drive).
scrape_pptvhd36_data()

In [None]:
def process_pptvhd36_data(file_path, out_path):
    """Tokenize the scraped pptvhd36 JSON-lines file and write the labelled dataset.

    Args:
        file_path: JSON-lines input with ``title`` (str), ``subcontent`` (str)
            and ``content`` (list of str) per record.
        out_path: Destination JSON-lines file (overwritten).

    Each output line is one record serialized with ``Series.to_json`` holding
    ``Title`` (tokenized title), ``Detail`` (tokenized excerpt followed by the
    tokenized content) and a constant ``Document Tag`` of ``"Fact News"``.
    The three raw input columns are not written.
    """
    df = pd.read_json(path_or_buf=file_path, lines=True)

    df["Title"] = df["title"].apply(preprocess_sentence)
    subcontent_tokens = df["subcontent"].apply(preprocess_sentence)
    content_tokens = df["content"].apply(preprocess_list)

    # Concatenate the two token lists row by row (excerpt first).
    df["Detail"] = pd.Series(
        [head + body for head, body in zip(subcontent_tokens, content_tokens)],
        index=df.index,
        dtype=object,
    )
    df["Document Tag"] = "Fact News"
    df = df.drop(columns=["title", "subcontent", "content"])

    # The context manager closes the file even if serialization fails midway.
    with open(out_path, "w", encoding="utf8") as file_data:
        for _, row in df.iterrows():
            file_data.write(row.to_json() + "\n")