In [1]:
import requests, re, bs4

In [2]:
class QuizletParser3:
    """
    Parser for https://quizlet.com/

    Downloads a study-set page, extracts every flashcard on it and sorts the
    cards into ``flashcards`` (both sides short enough) and ``dropped`` (at
    least one side longer than ``max_length``).

    :param url: url for your study set
    """
    HEADERS = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # 'br' is intentionally not advertised: requests/urllib3 only decode
        # Brotli when an optional brotli package is installed; without it the
        # body would arrive undecoded and no flashcards would be found.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        # Placeholder value; replace with a real browser cookie before running.
        'cookie': 'yourcookie',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; CrOS x86_64 12239.92.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.136 Safari/537.36',
    }

    # Seconds to wait for the server before giving up instead of hanging forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str) -> None:
        self.max_length = None
        self.htmlflashcards = None
        self.transl_parser = None
        self.phrase_parser = None
        self.flashcards = []
        self.url = url
        self.dropped = []

    def parse_page_(self) -> None:
        """
        Download ``self.url`` and store the flashcard elements in ``self.htmlflashcards``.

        :raises requests.HTTPError: if the server answers with an error status
        :raises requests.Timeout: if no answer arrives within ``REQUEST_TIMEOUT`` seconds
        """
        raw_site_page = requests.get(self.url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        raw_site_page.raise_for_status()
        soup = bs4.BeautifulSoup(raw_site_page.text, 'html.parser')
        self.htmlflashcards = soup.find_all(class_='SetPageTerms-term')

    @staticmethod
    def _lang_code(side) -> str:
        """Return the text after the last '-' in the last CSS class of the side's first <span>."""
        # presumably that class looks like 'lang-ko' -- verify against the live markup
        return side.span['class'][-1].split('-')[-1]

    def parse_flashcard_(self, htmlcard):
        """
        Extract one flashcard from its HTML element.

        :param htmlcard: flashcard element (anything whose ``str()`` is the card's HTML)
        :return: tuple ``(phrase, meaning, iso_lang, liter_transl)`` where
            ``iso_lang`` is ``'<front language>-<back language>'``
        :raises IndexError: if the card has fewer than two ``SetPageTerm-sideContent`` elements
        """
        soup = bs4.BeautifulSoup(str(htmlcard), 'html.parser')
        sides = soup.find_all(class_='SetPageTerm-sideContent')
        front, back = sides[0], sides[1]

        phrase = self.phrase_parser(front.text) if self.phrase_parser else front.text

        if self.transl_parser:
            # A custom translation parser must return (meaning, literal translation).
            meaning, liter_transl = self.transl_parser(back.text)
        else:
            meaning = back.text
            liter_transl = self.translate_(phrase)

        iso_lang = self._lang_code(front) + '-' + self._lang_code(back)
        return phrase, meaning, iso_lang, liter_transl

    def translate_(self, phrase: str) -> str:
        """Stub for a literal translation of ``phrase``; currently always returns ''."""
        return ''

    def get_flashcards(self, max_length=30, transl_parser=None, phrase_parser=None) -> list:
        """
        Fetch the study set and return the flashcards whose sides are short enough.

        Every call starts from empty ``flashcards`` / ``dropped`` lists, so
        calling it again on the same instance does not duplicate cards.

        :param max_length: maximum allowed length of both phrase and meaning
        :param transl_parser: optional callable applied to the back-side text,
            returning ``(meaning, literal translation)``
        :param phrase_parser: optional callable applied to the front-side text,
            returning the phrase
        :return: list of dicts with keys ``phrase``, ``meaning``, ``iso_lang``
            and ``literal_transl``; over-long cards go to ``self.dropped``
        """
        self.max_length = max_length
        self.parse_page_()
        self.transl_parser = transl_parser
        self.phrase_parser = phrase_parser

        # Reset only after a successful fetch so a failed refresh keeps the old results.
        self.flashcards = []
        self.dropped = []

        for card in self.htmlflashcards:
            phrase, meaning, iso_lang, literal_transl = self.parse_flashcard_(htmlcard=card)
            record = {'phrase': phrase,
                      'meaning': meaning,
                      'iso_lang': iso_lang,
                      'literal_transl': literal_transl, }
            fits = (len(meaning) <= self.max_length) and (len(phrase) <= self.max_length)
            (self.flashcards if fits else self.dropped).append(record)
        return self.flashcards

In [3]:
import gspread
def gsheets_writer(flashcards: list, sheetname: str,
                   key_file: str = 'yandex-translator-stories-0bec25d5f8ae.json',
                   spreadsheet: str = "Yandex-translator-stories") -> None:
    """
    Append flashcards as rows to a worksheet of a Google spreadsheet.

    Each row is ``[iso_lang, phrase, meaning, literal_transl]``.

    :param flashcards: iterable of dicts with keys ``iso_lang``, ``phrase``,
        ``meaning`` and ``literal_transl``
    :param sheetname: title of the worksheet to append to
    :param key_file: path to the service-account JSON key file
    :param spreadsheet: title of the spreadsheet to open
    :raises KeyError: if a card lacks one of the required keys
    """
    # Build the rows first so a malformed card fails before any network traffic.
    to_write = [[card['iso_lang'], card['phrase'], card['meaning'], card['literal_transl']]
                for card in flashcards]
    if not to_write:
        # Nothing to append: skip authentication and the API round-trip.
        return
    gc = gspread.service_account(filename=key_file)
    sh = gc.open(spreadsheet)
    sh.worksheet(sheetname).append_rows(to_write)

In [4]:
# Quizlet study sets whose URL slugs name Korean idioms.
URLS = [
    'https://quizlet.com/kr/522520082/korean-idioms-flash-cards/',
    'https://quizlet.com/188110920/everyday-korean-idioms-flash-cards/',
    'https://quizlet.com/557032830/korean-idioms-flash-cards/',
    'https://quizlet.com/610629931/korean-idioms-flash-cards/',
    'https://quizlet.com/674111879/korean-idioms-flash-cards/',
    'https://quizlet.com/530092028/korean-idioms-flash-cards/',
    'https://quizlet.com/30691219/korean-idioms-flash-cards/',
    'https://quizlet.com/ru/714305857/korean-idioms-flash-cards/',
]

In [5]:
# Quizlet study sets whose percent-encoded slug decodes to "사자성어".
sajasangeo = [
    'https://quizlet.com/gb/524126993/%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
    'https://quizlet.com/44351272/%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
    'https://quizlet.com/kr/555960251/atomy-%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
    'https://quizlet.com/24559972/topik-advanced-%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
]

In [6]:
# Further "사자성어" study sets, kept apart because they are scraped with
# phrase_parser=omit_brackets (presumably their phrases carry a
# parenthesised annotation -- confirm against the pages).
sajasangeo_brackets = [
    'https://quizlet.com/613767034/%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
    'https://quizlet.com/604076855/%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
    'https://quizlet.com/643381448/%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
    'https://quizlet.com/583247963/%EC%82%AC%EC%9E%90%EC%84%B1%EC%96%B4-flash-cards/',
]

In [20]:
# Scrape every study set in URLS: cards that pass the length filter are
# appended to the 'ko-en' worksheet, the rest to the 'dropped' worksheet.
for url in URLS:
    qp = QuizletParser3(url)
    # get_flashcards() must run before qp.dropped is read: it fills both lists.
    gsheets_writer(qp.get_flashcards(), 'ko-en')
    gsheets_writer(qp.dropped, 'dropped')

In [7]:
def omit_brackets(text:str) -> str:
    """
    Return the part of ``text`` that precedes the first parenthesis.

    Everything from the first ``(`` or ``)`` onwards is discarded and the
    remaining text is stripped, so ``'phrase (note)'`` yields ``'phrase'``
    rather than ``'phrase '`` with a dangling space.

    :param text: raw phrase, optionally followed by a parenthesised note
    :return: the phrase without the note and without surrounding whitespace
    """
    # '[^()]*' can match the empty string, so re.match never returns None here.
    m = re.match(r'[^()]*', text)
    return m.group(0).strip()

In [8]:
def character_meaning(text:str) -> tuple:
    """
    Split a definition of the form ``'<literal>. <figurative>.'`` into its parts.

    The text before the first period is the literal meaning and the text
    between the first and second period is the figurative meaning; anything
    after a second period is ignored. When ``text`` contains no period it is
    returned whole as the figurative meaning with an empty literal meaning,
    instead of raising ``IndexError``.

    :param text: back-side text of a flashcard
    :return: tuple ``(figurative, literal)``
    """
    parts = text.split('.')
    if len(parts) < 2:
        # No period at all: there is nothing to separate.
        return text.strip(), ''
    literal = parts[0].strip()
    figur = parts[1].strip()
    return figur, literal

In [28]:
# Scrape every study set in sajasangeo with the default parsers: cards that
# pass the length filter go to 'ko-en', the rest to 'dropped'.
# NOTE(review): character_meaning is defined above but is not passed as
# transl_parser here -- confirm whether that is intended.
for url in sajasangeo:
    qp = QuizletParser3(url)
    # get_flashcards() must run before qp.dropped is read: it fills both lists.
    gsheets_writer(qp.get_flashcards(), 'ko-en')
    gsheets_writer(qp.dropped, 'dropped')

In [29]:
# Scrape every study set in sajasangeo_brackets, passing each front-side text
# through omit_brackets before it is length-checked and stored.
for url in sajasangeo_brackets:
    qp = QuizletParser3(url)
    # get_flashcards() must run before qp.dropped is read: it fills both lists.
    gsheets_writer(qp.get_flashcards(phrase_parser=omit_brackets), 'ko-en')
    gsheets_writer(qp.dropped, 'dropped')

In [8]:
# Quizlet study sets whose cards are written to the 'ko-ru' worksheet.
ru = [
    'https://quizlet.com/kr/234657270/Корейские-идиомы-flash-cards/',
    'https://quizlet.com/118205116/Корейские-идиомы-flash-cards/',
    'https://quizlet.com/265701684/korean-idioms-100-part-1-1-33-flash-cards/',
    'https://quizlet.com/ru/542201869/%D0%9A%D0%BE%D1%80%D0%B5%D0%B9%D1%81%D0%BA%D0%B8%D0%B5-%D1%84%D1%80%D0%B0%D0%B7%D0%B5%D0%BE%D0%BB%D0%BE%D0%B3%D0%B8%D0%B7%D0%BC%D1%8B-%D0%B8-%D0%BF%D0%BE%D1%81%D0%BB%D0%BE%D0%B2%D0%B8%D1%86%D1%8B-flash-cards/',
]

In [9]:
# Scrape every study set in ru: cards that pass the length filter are appended
# to the 'ko-ru' worksheet, the rest to the shared 'dropped' worksheet.
for url in ru:
    qp = QuizletParser3(url)
    # get_flashcards() must run before qp.dropped is read: it fills both lists.
    gsheets_writer(qp.get_flashcards(), 'ko-ru')
    gsheets_writer(qp.dropped, 'dropped')

In [9]:
# One-off scrape of a single study set into the 'ko-en' / 'dropped' worksheets.
# NOTE(review): this URL is also the last entry of URLS, so running both this
# cell and the URLS loop appends the same cards twice -- confirm which one
# should stay.
missed_url = 'https://quizlet.com/ru/714305857/korean-idioms-flash-cards/'
qp = QuizletParser3(missed_url)
# get_flashcards() must run before qp.dropped is read: it fills both lists.
gsheets_writer(qp.get_flashcards(), 'ko-en')
gsheets_writer(qp.dropped, 'dropped')