In [1]:
import requests
from bs4 import BeautifulSoup
import pycantonese
import re

# Data Soruce: https://cantonese.ca/ 

corpus = pycantonese.hkcancor()

url = "https://cantonese.ca/"
main_html = requests.get(url).text
main_soup = BeautifulSoup(main_html, "lxml")


In [2]:
class Group:
    def __init__(self, name, link):
        self.name = name
        self.link = link
        self.html = requests.get(url+self.link).text
        self.soup = BeautifulSoup(self.html, "lxml")
        self.word_dict = {}
    # add a to dic method https://stackoverflow.com/questions/34997174/how-to-convert-list-of-model-objects-to-pandas-dataframe/41762270 

In [3]:
groups = []

for link in main_soup.find_all('a'):
    if re.search(r"[\u4e00-\u9fff]", link.decode_contents()) or re.search(r"[^a-zA-Z0-9\s]", link.decode_contents()):
        continue
    groups.append(Group(link.decode_contents(), link.get("href")))

del groups[0:4]
del groups[-2:]

In [4]:
del_gps = ["Comparisons and similes", "Romanization conversion", "Time", "Zodiac", "Idioms and Metapthors", "Links", "Smoking", "Measure words", "Prefixes and suffixes", "Slang", "Swearing"]


for gp in groups:

    # remove unwanted groups
    if gp.name in del_gps:
        groups.remove(gp)
        continue

    gp.word_dict["English"] = []
    gp.word_dict["Cantonese"] = []

    for l in gp.soup.find_all('tr'):
        td = l.findChildren("td")[:-1]

        
        # for Colours --> special order
        if gp.name == "Colours":
            if len(td) < 2:
                continue
            if td[3].get_text() == "white":
                gp.word_dict["English"].append(td[3].get_text())
                gp.word_dict["Cantonese"].append(td[4].get_text())
            else:
                gp.word_dict["English"].append(td[2].get_text())
                gp.word_dict["Cantonese"].append(td[3].get_text())
        else:
            # wrong length or no eng and ch
            if len(td) < 2 or (len(td[0]) == 0 and len(td[1]) == 0):
                continue
            
            gp.word_dict["English"].append(td[0].get_text())
            if not(re.search(r"[\u4e00-\u9fff]", td[1].get_text())):
                gp.word_dict["Cantonese"].append('')
            else: 
                gp.word_dict["Cantonese"].append(td[1].get_text())

"""
for a in groups:
    print(a.name)
    print(a.word_dict)
    print(len(a.word_dict["Eng"]), len(a.word_dict["Ch"]))
    print()
"""

'\nfor a in groups:\n    print(a.name)\n    print(a.word_dict)\n    print(len(a.word_dict["Eng"]), len(a.word_dict["Ch"]))\n    print()\n'

In [5]:
def fix(string):
    temp = ''.join([c for c in string if not(str(c.encode('utf-8')) == "b' '")])
    ''.join(temp)

    if re.match(r'.+\n.+', temp):
        temp = re.sub(r'\n', r'/', temp)
    if re.match(r'.+/$', temp) or re.match(r'.+\n$', temp) :
        temp = temp[:-1]
    if re.match(r'\xa0', temp):
        temp = re.sub(r'\xa0', '', temp)
    return temp

for gp in groups:
    if not(bool(gp.word_dict)):#len(gp.word_dict['English']) == 0 and len(gp.word_dict['Cantonese']) == 0:
        groups.remove(gp)
        continue

    #if len(gp.word_dict['English']) < 2:
        #continue
  
    del_idx = []
    for i in range(len(gp.word_dict['English'])):
        if gp.word_dict['English'][i] == '' and gp.word_dict['Cantonese'][i] == '':
            del_idx.append(i)
            continue
        
        #gp.word_dict['English'][i] = fix(gp.word_dict['English'][i])
        if re.match(r'\xa0', gp.word_dict['English'][i]):
            gp.word_dict['English'][i] = re.sub(r'\xa0', '', gp.word_dict['English'][i])

        gp.word_dict['Cantonese'][i] = fix(gp.word_dict['Cantonese'][i])
    
    if len(del_idx) > 0:
        for i in del_idx:
            del gp.word_dict['English'][i]
            del gp.word_dict['Cantonese'][i]



In [6]:
import pandas as pd
import numpy as np

col_name = ['Group', 'English', 'Cantonese']
df = pd.DataFrame(columns=col_name)

for gp in groups:
    df_temp = pd.DataFrame.from_dict(gp.word_dict)
    df_temp['Group'] = gp.name
    col = df_temp.pop("Group")
    df_temp.insert(0, col.name, col)
    df = df.append(df_temp, ignore_index=True)

df = df.replace(r'^\s*$', np.nan, regex=True)
df


Unnamed: 0,Group,English,Cantonese
0,Common phrases,what's up?,點呀
1,Common phrases,thank you!,唔該/多謝
2,Common phrases,you're welcome,唔哂（客氣）
3,Common phrases,how are you?,你好嗎
4,Common phrases,have you eaten yet?,你食咗飯未呀？
...,...,...,...
1411,Verbs of perception,squint,
1412,Verbs of perception,stare at,
1413,Verbs of perception,taste,
1414,Verbs of perception,watch,


In [7]:
empty = np.where(pd.isnull(df))



en_yue = {'en':[], 'yue': [] ,'row': []}
yue_en = {'yue':[], 'en': [] ,'row': []}
for i in range(len(empty[0])):
    if empty[1][i] == 1:
        yue_en['row'].append(empty[0][i])
        yue_en['yue'].append(df[col_name[2]][empty[0][i]])
    else: 
        en_yue['row'].append(empty[0][i])
        en_yue['en'].append(df[col_name[1]][empty[0][i]])

In [8]:
from translator import translate

en_yue['yue'] = translate(en_yue['en'], 'en', ['yue'])
yue_en['en'] = translate(yue_en['yue'], 'yue', ['en'])

In [15]:
for i in range(len(en_yue['en'])):
    df[col_name[2]][en_yue['row'][i]] = en_yue['yue'][i][0]

for i in range(len(yue_en['yue'])):
    df[col_name[1]][yue_en['row'][i]] = yue_en['en'][i][0]

df = df.dropna()
df.to_csv('yue_en.csv', encoding='utf-8')
df

Unnamed: 0,Group,English,Cantonese
0,Common phrases,what's up?,點呀
1,Common phrases,thank you!,唔該/多謝
2,Common phrases,you're welcome,唔哂（客氣）
3,Common phrases,how are you?,你好嗎
4,Common phrases,have you eaten yet?,你食咗飯未呀？
...,...,...,...
1411,Verbs of perception,squint,斜視
1412,Verbs of perception,stare at,望住
1413,Verbs of perception,taste,味道
1414,Verbs of perception,watch,看
