In [1]:
!pip install sports_news_collector

Collecting sports_news_collector
  Downloading https://files.pythonhosted.org/packages/ed/d7/aed0a1fda52594eb5f3c9c16312b3a6963140fceb4389e8af7efdbe8775e/sports_news_collector-0.1-py3-none-any.whl
Collecting pretty-html-table
  Downloading https://files.pythonhosted.org/packages/00/64/6c8ebfebfe8c07106faf42ce9b51d3f4f378be10b011a59866df5e11b4d0/pretty_html_table-0.9.dev0.tar.gz
Collecting pororo
[?25l  Downloading https://files.pythonhosted.org/packages/78/ab/f409aab13ba2a4e2576d2ea4b877396029c617d17553edbbb9ba64cf4ee9/pororo-0.4.2-py3-none-any.whl (256kB)
[K     |████████████████████████████████| 266kB 5.3MB/s 
Collecting fairseq>=0.10.2
[?25l  Downloading https://files.pythonhosted.org/packages/15/ab/92c6efb05ffdfe16fbdc9e463229d9af8c3b74dc943ed4b4857a87b223c2/fairseq-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 7.7MB/s 
[?25hCollecting nltk>=3.5
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive (requires the `google.colab` package imported above).
# NOTE(review): no cell visible in this notebook reads from or writes to the mount — confirm it is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import requests
from bs4 import BeautifulSoup

In [5]:
class NewsCrawler(object):
    """Collect article bodies linked from a sports section front page.

    Calling an instance downloads ``<default_url><event>/index.nhn``, gathers
    the article links found in the headline lists, downloads every article and
    returns the bodies that pass a few sanity filters.

    Args:
        event: Section name appended to ``default_url`` (e.g. ``'wfootball'``).
        default_url: Site root; article links are resolved against it.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    # Bodies with this many characters or fewer are discarded by ``__call__``.
    MIN_NEWS_LENGTH = 150

    # The page layout differs per sport and per press outlet, so one set of
    # markers cannot cover every case.
    # Start markers: the body begins right after the last one seen before an end
    # marker. Original author's note: unlike other sports, domestic baseball puts
    # the reporter name at the very end, in a "[xxx reporter]" form. Currently empty.
    START_MARKERS = ()
    # End markers: the body is cut at the first of these characters. Original
    # author's note: in domestic baseball a "ⓒ" precedes the reporter name after
    # the leading photo (that character is not in this tuple).
    END_MARKERS = ('▶', '@', '┌')

    def __init__(
            self,
            event: str,
            default_url: str = 'https://sports.news.naver.com/',
            timeout: float = 10.0,
    ) -> None:
        self.event = event
        self.default_url = default_url
        self.timeout = timeout

    def __call__(self) -> list:
        """Return the list of usable article bodies for ``self.event``.

        An article is kept when its stripped body is non-empty, does not start
        with ``'if'`` and is longer than ``MIN_NEWS_LENGTH`` characters.
        """
        news_list = list()

        for url in self._url_crawling(self.default_url, self.event):
            news = self._news_crawling(url)
            # Guard before .strip(): a missing body must be skipped, not crash.
            if not news:
                continue
            news = news.strip()
            if news[:2] != 'if' and len(news) > self.MIN_NEWS_LENGTH:
                news_list.append(news)

        return news_list

    def _url_crawling(self, default_url: str, event: str) -> list:
        """Return the article URLs linked from the section front page.

        Links are read from the ``home_news_list division`` and
        ``home_news_list`` ``<ul>`` elements. Either list may be absent.
        URLs are resolved against ``default_url`` and returned without
        duplicates, in first-seen order.
        """
        from urllib.parse import urljoin

        html = requests.get(default_url + event + '/index.nhn', timeout=self.timeout)
        soup = BeautifulSoup(html.text, 'html.parser')

        url_list = list()
        seen = set()
        # The second lookup matches any <ul> carrying the class, so it can
        # return the same element as the first one; `seen` removes the repeats.
        for class_name in ('home_news_list division', 'home_news_list'):
            container = soup.find('ul', class_=class_name)
            if container is None:
                continue
            for anchor in container.find_all('a'):
                href = anchor.get('href')  # anchors without href are skipped
                if not href:
                    continue
                # urljoin avoids the "//" produced by plain concatenation and
                # leaves already-absolute links intact.
                full_url = urljoin(default_url, href)
                if full_url not in seen:
                    seen.add(full_url)
                    url_list.append(full_url)

        return url_list

    def _news_crawling(self, url: str) -> str:
        """Download one article and return its trimmed body text.

        Returns an empty string when the page has no element with
        ``id='newsEndContents'``.
        """
        html = requests.get(url, timeout=self.timeout)
        soup = BeautifulSoup(html.text, 'html.parser')

        container = soup.find(id='newsEndContents')
        if container is None:
            return ''

        return self._trim_body(container.text)

    @classmethod
    def _trim_body(cls, text: str) -> str:
        """Cut ``text`` at the first end marker, starting after the last start marker before it."""
        start_idx = 0
        for idx, char in enumerate(text):
            if char in cls.START_MARKERS:
                start_idx = idx + 1
            if char in cls.END_MARKERS:
                return text[start_idx:idx]
        return text[start_idx:]

In [4]:
from smtplib import SMTP
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import getpass

In [6]:
class EmailSender(object):
    """Send a news digest through Gmail's SMTP server.

    The sender address and password are read from the module-level names
    ``from_email_id`` and ``from_email_pw`` when the instance is called, so
    both must be defined before the first call.

    Args:
        event: Section key (e.g. ``'wfootball'``); it is mapped to its Korean
            display name for the subject line.

    Raises:
        KeyError: If ``event`` is not one of the known section keys.
    """

    def __init__(
            self,
            event: str,
    ) -> None:
        events = {'kbaseball': '국내야구', 'wbaseball': '해외야구', 'kfootball': '국내축구', 'wfootball': '해외축구',
                    'basketball': '농구', 'volleyball': '배구', 'golf': '골프', 'general': '일반', 'esports': 'e스포츠 & 게임'}

        self.event = events[event]
        self.subject = events[event] + ' 뉴스를 전달해드리겠습니다.'
        self.to_email = 'seomk9896@gmail.com'  # a single address, or a list/tuple of addresses for several recipients
        self.basic_text = '오늘 하루도 화이팅 하시고, 많은 피드백 부탁드립니다 !\n\n'

    def __call__(self, body):
        """Send ``body`` (HTML) preceded by the plain-text greeting.

        Args:
            body: HTML markup attached as the second part of the message.
        """
        # Accept both a single address and a collection of addresses.
        if isinstance(self.to_email, str):
            recipients = [self.to_email]
        else:
            recipients = list(self.to_email)

        message = MIMEMultipart()
        message['Subject'] = self.subject
        message['From'] = from_email_id
        message['To'] = ', '.join(recipients)

        message.attach(MIMEText(self.basic_text, "plain"))
        message.attach(MIMEText(body, "html"))
        msg_body = message.as_string()

        # The context manager closes the connection even when login or
        # sendmail raises; previously quit() was only reached on success.
        with SMTP('smtp.gmail.com', 587) as server:
            server.starttls()
            server.login(from_email_id, from_email_pw)
            server.sendmail(from_email_id, recipients, msg_body)

            print(self.event + ' 뉴스를 전달하였습니다.')

In [7]:
from pororo import Pororo
from pandas import DataFrame
from pretty_html_table import build_table
import datetime
import pandas as pd

In [17]:
class SportsNewsCollector(object):
    """Crawl a sports section, summarise each article and e-mail the digest.

    Args:
        event: Section key; must be one of ``available_events()``.
        lang: Language code passed to both Pororo models (lower-cased).
        default_url: Site root handed to ``NewsCrawler``.

    Raises:
        KeyError: If ``event`` is not an available event.
    """

    def __init__(
            self,
            event: str,
            lang: str = 'ko',
            default_url: str = 'https://sports.news.naver.com/',
    ) -> None:
        # Validate before building the models so a bad event fails immediately
        # instead of after two model loads.
        if event not in self.available_events():
            raise KeyError('Unknown events : {}, available events are {}'.format(event, self.available_events()))
        self.event = event

        self.lang = lang.lower()
        self.default_url = default_url
        self.summary = Pororo(task='summarization', model='abstractive', lang=self.lang)
        self.sentiment_analysis = Pororo(task="sentiment", model="brainbert.base.ko.nsmc", lang=self.lang)

    def collect(self) -> None:
        """Run one round: crawl, build the digest table, send it by e-mail."""
        crawler = NewsCrawler(self.event, self.default_url)
        news_list = crawler()

        news_info = self._make_news_info(news_list)

        sender = EmailSender(self.event)
        sender(news_info)

    def _make_news_info(self, news_list: list) -> str:
        """Summarise every article and render the result with ``build_table``.

        Each row holds the current timestamp, the article summary and the
        sentiment predicted for that summary. The return value is whatever
        ``build_table`` produces; ``collect`` passes it on as the HTML body.
        """
        # strftime keeps second precision regardless of the microsecond value;
        # slicing str(now) by a fixed width breaks when microsecond == 0.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        news_info = {'datetime': [], 'summary': [], 'sentiment': []}
        for news in news_list:
            summary = self.summary(news)
            news_info['datetime'].append(timestamp)
            news_info['summary'].append(summary)
            # Sentiment is computed on the summary, not on the full article.
            news_info['sentiment'].append(self.sentiment_analysis(summary))

        table = pd.DataFrame(news_info)
        return build_table(table, 'orange_light')

    @staticmethod
    def available_events() -> list:
        """Return the section keys accepted by the constructor."""
        return ['kbaseball', 'wbaseball', 'kfootball', 'wfootball', 'basketball', 'volleyball', 'golf', 'general', 'esports']

In [9]:
# Sender account; EmailSender reads these two module-level names when it sends.
from_email_id = 'seomk9896@gmail.com'
# Asked for once here, outside the classes, so the password does not have to be typed in again and again.
from_email_pw = getpass.getpass(f"{from_email_id}'s password: ")

seomk9896@gmail.com's password: ··········


In [18]:
from time import sleep
# Pause between two collection rounds, in seconds. The pause is added on top of
# however long a round takes (the recorded output shows about four minutes between rounds).
POLL_INTERVAL_SECONDS = 100

print(SportsNewsCollector.available_events())
collector = SportsNewsCollector(event='wfootball')

# Runs until the cell is interrupted.
while True:
    # strftime gives second precision even when microsecond == 0, where
    # slicing str(now) by a fixed width would cut into the time itself.
    date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("뉴스를 가져옵니다. 현재 시각은 ", date, "입니다.")
    collector.collect()
    sleep(POLL_INTERVAL_SECONDS)

['kbaseball', 'wbaseball', 'kfootball', 'wfootball', 'basketball', 'volleyball', 'golf', 'general', 'esports']
뉴스를 가져옵니다. 현재 시각은  2021-04-04 11:30:47 입니다.
해외축구 뉴스를 전달하였습니다.
뉴스를 가져옵니다. 현재 시각은  2021-04-04 11:34:47 입니다.
해외축구 뉴스를 전달하였습니다.
뉴스를 가져옵니다. 현재 시각은  2021-04-04 11:38:50 입니다.
해외축구 뉴스를 전달하였습니다.
뉴스를 가져옵니다. 현재 시각은  2021-04-04 11:42:51 입니다.
해외축구 뉴스를 전달하였습니다.


KeyboardInterrupt: ignored

In [19]:
# Spin forever so this cell never completes.
# NOTE(review): presumably a keep-the-session-alive trick taken from the linked question — confirm.
# It busy-waits (no sleep) and only starts once the endless loop in the previous cell has been interrupted.
while True:pass
# https://stackoverflow.com/questions/57113226

KeyboardInterrupt: ignored