1. BeautifulSoup을 사용하여 'https://www.tensorflow.org' 페이지에서 링크를 타고 들어갈 수 있는 모든 링크 데이터 수집
2. 데이터 매핑
3. 번역 모델에 문장 넣기

In [5]:
!pip install requests
!pip install bs4

Collecting bs4
  Obtaining dependency information for bs4 from https://files.pythonhosted.org/packages/51/bb/bf7aab772a159614954d84aa832c129624ba6c32faa559dfb200a534e50b/bs4-0.0.2-py2.py3-none-any.whl.metadata
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [240]:
# Use BeautifulSoup to collect every link that can be followed from the 'https://www.tensorflow.org' page
import requests
from bs4 import BeautifulSoup

class LanguageLinksScraper:
    """Collect ``https://www.`` links from a single page for one language.

    The page at ``base_url`` is requested as-is, or with ``?hl=<language_code>``
    appended when a language code is given.

    Args:
        base_url: URL of the page whose anchors are collected.
        language_code: Value for the ``hl`` query parameter. When falsy, the
            page is requested without an ``hl`` parameter.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller indefinitely.
    """

    def __init__(self, base_url, language_code=None, timeout=30):
        self.base_url = base_url
        self.language_code = language_code
        self.timeout = timeout
        # hrefs collected so far, de-duplicated, in first-seen order.
        self.link_list = []

    def get_links(self):
        """Fetch the page and store its ``https://www.`` hrefs in ``link_list``.

        Only anchors whose ``href`` contains ``"https://www."`` are kept.
        Duplicates are removed while preserving first-seen order. A response
        with a status other than 200 leaves ``link_list`` untouched and emits
        a ``RuntimeWarning`` instead of failing silently.

        Raises:
            requests.exceptions.RequestException: Propagated from
                ``requests.get`` (connection errors, timeouts, ...).
        """
        url = self.base_url
        if self.language_code:
            url += f"?hl={self.language_code}"

        response = requests.get(url, timeout=self.timeout)

        if response.status_code != 200:
            # Local import keeps the block self-contained.
            import warnings

            warnings.warn(
                f"Skipping {url}: unexpected HTTP status {response.status_code}",
                RuntimeWarning,
                stacklevel=2,
            )
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Anchors without an href yield None and are skipped by the truthiness test.
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if href and "https://www." in href:
                self.link_list.append(href)

        # dict.fromkeys drops duplicates while keeping insertion order.
        self.link_list = list(dict.fromkeys(self.link_list))

    def get_language_links_dict(self):
        """Return ``{language_code: link_list}`` for this scraper."""
        return {self.language_code: self.link_list}


# Build one LanguageLinksScraper per language, keyed by language code.
# The "en" entry is created without a language code, so its URL carries no
# ``hl`` parameter; every other entry passes its own code.
language_scrapers = {"en": LanguageLinksScraper("https://www.tensorflow.org")}
for language_code in (
    "es-419",
    "fr",
    "id",
    "it",
    "pl",
    "pt-br",
    "vi",
    "tr",
    "ru",
    "he",
    "ar",
    "fa",
    "hi",
    "bn",
    "th",
    "zh-cn",
    "zh-tw",
    "ja",
    "ko",
):
    language_scrapers[language_code] = LanguageLinksScraper(
        "https://www.tensorflow.org", language_code=language_code
    )

# Fetch the links for every language, in dictionary order.
for scraper in language_scrapers.values():
    scraper.get_links()

# Map each language code to the list of links its scraper collected.
language_links_dict = {}
for language_code, scraper in language_scrapers.items():
    language_links_dict[language_code] = scraper.link_list


In [241]:
# Logic for removing the language code that follows `hl` in a URL
from urllib.parse import urlparse, parse_qs, urlencode

def remove_hl(url):
    """Return *url* with its ``hl`` query parameter removed.

    All other query parameters are kept, including those with an empty value
    (e.g. ``?a=&hl=en`` becomes ``?a=``). Scheme, host, path and fragment are
    left as parsed. The remaining query string is re-encoded with
    ``urlencode``.

    Args:
        url: The URL string to normalise.

    Returns:
        The URL string without any ``hl`` parameter.
    """
    parsed_url = urlparse(url)
    # keep_blank_values=True: otherwise parse_qs silently drops parameters
    # such as "a=" and the result would lose more than just "hl".
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)
    query_params.pop('hl', None)
    # doseq=True expands the value lists produced by parse_qs back into
    # repeated key=value pairs.
    new_query = urlencode(query_params, doseq=True)
    return parsed_url._replace(query=new_query).geturl()

In [242]:
# Use the 'en' links (with `hl` stripped) as the reference set, and for every
# other language keep only the links whose `hl`-stripped form is in that set.
base_key = 'en'
base_values = {remove_hl(url) for url in language_links_dict[base_key]}

for lang, urls in language_links_dict.items():
    if lang == base_key:
        # The reference language itself is left untouched.
        continue
    modified_urls = [url for url in urls if remove_hl(url) in base_values]
    language_links_dict[lang] = modified_urls

In [253]:
# Drop YouTube links and links that cannot be used for alignment.
# The tensorflow.org links listed below exist only in certain languages,
# so they are removed as well.
for lang, urls in language_links_dict.items():
    language_links_dict[lang] = [
        url
        for url in urls
        if url != 'https://www.tensorflow.org/?hl=en'
        and not any(
            fragment in url
            for fragment in (
                "https://www.youtube.com",
                'https://www.tensorflow.org/api/stable',
                'https://www.tensorflow.org/resources/libraries-extensions',
            )
        )
    ]

In [254]:
import pickle
# Serialise the per-language link mapping to disk with pickle.
with open('베링랩/scraper_link.pickle', 'wb') as f:
    pickle.dump(language_links_dict, f)

In [255]:
# Number of links kept for 'en' after filtering (displayed as the cell output).
len(language_links_dict['en'])

21