In [8]:
import re
import json
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd


In [9]:
class testKoreaboo:
    """Scraper that collects article titles and body text from Koreaboo news pages.

    Results accumulate in ``self.data['web']``: a dict keyed by article URL whose
    values are ``{'title': str, 'content': str}``.  ``scrap`` fills it and
    ``save_to_json`` writes it to disk.
    """

    # Tags whose contents are never article text; removed before extraction.
    UNUSED_TAGS = ('script', 'style', 'noscript', 'head', 'footer', 'iframe')
    # Seconds to wait for a response.  Without a timeout a stalled connection
    # would block its worker thread, and therefore scrap(), indefinitely.
    REQUEST_TIMEOUT = 30
    # Anything that looks like an HTML tag (non-greedy, does not span newlines).
    _TAG_RE = re.compile('<.*?>')
    # Runs of spaces, newlines, tabs and carriage returns.
    _BLANK_RE = re.compile('[ \n\t\r]+')

    def __init__(self):
        """Create an empty result store."""
        self.data = {'web': {}}
        # Legacy attributes: they were (re)created empty on every article fetch
        # and never filled.  Kept so existing attribute access keeps working,
        # but no longer touched from worker threads.
        self.list_url = []
        self.list_title = []
        self.list_content = []

    def remove_unuse_tag(self, bs: 'BeautifulSoup'):
        """Detach every tag listed in ``UNUSED_TAGS`` from ``bs``.

        The soup is modified in place and also returned for convenience.
        """
        # find_all returns a materialised list, so extracting while looping is safe.
        for node in bs.find_all(list(self.UNUSED_TAGS)):
            node.extract()
        return bs

    def clean_html(self, html: str):
        """Return ``html`` with tag-like spans removed and whitespace normalised.

        Newlines, tabs and carriage returns are treated as word separators
        (collapsed to a single space together with neighbouring spaces) rather
        than deleted, so words on adjacent lines are not glued together.
        Leading and trailing whitespace is stripped.
        """
        without_tags = self._TAG_RE.sub('', html)
        return self._BLANK_RE.sub(' ', without_tags).strip()

    def _fetch_soup(self, session, url):
        """Download ``url`` and return its cleaned soup, or ``None`` on failure.

        ``None`` is returned when the request raises a ``requests`` error or
        the response status is not OK.
        """
        try:
            response = session.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as error:
            print('request failed', url, repr(error))
            return None
        with response:
            if not response.ok:
                return None
            soup = BeautifulSoup(response.text, 'html.parser')
        return self.remove_unuse_tag(soup)

    def _run_concurrently(self, worker, urls):
        """Call ``worker(session, url)`` for every URL, one thread per URL.

        All calls share one ``requests.Session``.  Returns once every call has
        finished; an exception raised by a worker is reported with ``print``
        instead of being silently discarded.
        """
        urls = list(urls)
        if not urls:
            # ThreadPoolExecutor rejects max_workers=0 with ValueError.
            return
        with requests.Session() as session:
            with ThreadPoolExecutor(max_workers=len(urls)) as executor:
                futures = {executor.submit(worker, session, target): target
                           for target in urls}
            # Leaving the executor block has waited for all tasks to complete.
            for future, target in futures.items():
                error = future.exception()
                if error is not None:
                    print('failed', target, repr(error))

    def data_in_link(self, session, url):
        """Fetch one article and store its title and text under ``self.data['web'][url]``.

        The record is built in local variables and stored with a single
        assignment, so concurrent calls cannot mix their content and no
        half-filled record is left behind.  Pages without an ``<h1>`` or a
        ``div.entry-content`` are skipped.
        """
        soup = self._fetch_soup(session, url)
        if soup is None:
            return
        print('test url', url)

        heading = soup.find('h1')
        body = soup.find('div', {'class': 'entry-content'})
        if heading is None or body is None:
            print('skip url (no article markup)', url)
            return

        # Each paragraph is followed by one space, as in the stored format so far.
        content = ''.join(f'{self.clean_html(p.text)} ' for p in body.find_all('p'))
        self.data['web'][url] = {'title': heading.text, 'content': content}

    def data_domain(self, session, url):
        """Fetch one listing page and scrape every article it links to.

        The first ``<a>`` inside each ``section.kbos-page`` is taken as the
        article link; sections without a usable ``href`` are ignored.
        """
        soup = self._fetch_soup(session, url)
        if soup is None:
            return

        news_link = []
        for section in soup.find_all('section', {'class': 'kbos-page'}):
            anchor = section.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                news_link.append(href)

        self._run_concurrently(self.data_in_link, news_link)

    def scrap(self, first_page=1, last_page=10):
        """Scrape the news listing pages ``first_page`` .. ``last_page`` (inclusive).

        With the defaults this covers pages 1 to 10.
        """
        pages_list = [f'https://www.koreaboo.com/news/page/{i}/'
                      for i in range(first_page, last_page + 1)]
        self._run_concurrently(self.data_domain, pages_list)

    def save_to_json(self, path='Koreaboo.json'):
        """Write ``self.data`` as indented JSON to ``path`` (UTF-8 encoded)."""
        with open(path, 'w', encoding="UTF-8") as file:
            json.dump(self.data, file, indent=4)


    # def dataframe_web(self):

    #     data_web = [[self.list_url
    #                     ]]

    #     web_frame = pd.DataFrame(data=data_web,
    #                         columns=['link'])
    #     return web_frame

    # def twitter_to_xlsx(self, data):
    #     data.to_excel("hey_test.xlsx", engine="openpyxl", index=False)


In [10]:
# Create the scraper; its result store starts out as {'web': {}}.
data = testKoreaboo()
data

<__main__.testKoreaboo at 0xf691598>

In [11]:
# Fetch listing pages 1-10 of koreaboo.com/news and the articles linked from them (needs network access).
data.scrap()

test url https://www.koreaboo.com/news/2ne1-cl-dara-sandara-park-coachella-2022-sexy-funny/
test url https://www.koreaboo.com/news/ateez-surpasses-1-billion-streams-spotify/
test url https://www.koreaboo.com/news/twice-sana-chaeyoung-flirting-date-cute-moments/
test url test url https://www.koreaboo.com/news/bts-jin-chris-martin-coldplay-guitar-friendship/
https://www.koreaboo.com/news/aespa-winter-fan-harry-styles-live-performance-coachella/
test urltest urltest url   https://www.koreaboo.com/news/shin-dong-yup-confesses-once-casted-in-adult-film/https://www.koreaboo.com/news/bts-v-instagram-profile-photo-comeback-hint/
https://www.koreaboo.com/news/got7-bambam-confirm-full-group-comeback-twitter/

test url https://www.koreaboo.com/news/got7-maknae-line-bambam-yugyeom-leader-jay-b-dance-practice-comeback/
test url https://www.koreaboo.com/news/aespa-unreleased-english-song-lifes-too-short-coachella-debut/


In [12]:
# data.dataframe_web()

In [13]:
# Write everything collected in data.data to Koreaboo.json in the working directory.
data.save_to_json()

In [14]:
# hey = data.dataframe_web()
# data.twitter_to_xlsx(hey)