# Scraping TOEIC 600 words data

In [26]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
from pprint import pprint
import os

## Finding all page urls

In [2]:
domain = 'https://tienganhtflat.com'
toeic_cat = 'https://tienganhtflat.com/blog/cat/tu-vung-toeic'

res = requests.get(toeic_cat)
soup = BeautifulSoup(res.text, parser='html.parser')
page_urls = soup.findAll('a', attrs={'data-page': True})

# Get links > remove duplicated links > sort to asc
page_urls = sorted(list(set(f"{domain}{page.get('href')}" for page in page_urls)))

In [3]:
pprint(page_urls)

['https://tienganhtflat.com/blog/cat/tu-vung-toeic?page=1&per-page=18',
 'https://tienganhtflat.com/blog/cat/tu-vung-toeic?page=2&per-page=18',
 'https://tienganhtflat.com/blog/cat/tu-vung-toeic?page=3&per-page=18']


## Finding all topic urls

In [4]:
def get_topic_urls(page_url):
    res = requests.get(page_url)
    soup = BeautifulSoup(res.text, parser='html.parser')
    topics = soup.findAll('a', {'href': re.compile('^/blog/toeic-words')})
    topics = [f"{domain}{topic.get('href')}" for topic in topics]
    return topics

topic_urls = []
for page_url in page_urls:
    page_topic_urls = get_topic_urls(page_url)
    topic_urls.append(page_topic_urls)
    
topic_urls = list(set(sum(topic_urls, [])))

In [5]:
len(topic_urls)

50

In [6]:
pprint(topic_urls)

['https://tienganhtflat.com/blog/toeic-words-computers-and-the-internet',
 'https://tienganhtflat.com/blog/toeic-words-marketing',
 'https://tienganhtflat.com/blog/toeic-words-accounting',
 'https://tienganhtflat.com/blog/toeic-words-doctor-s-office',
 'https://tienganhtflat.com/blog/toeic-words-investment',
 'https://tienganhtflat.com/blog/toeic-words-quality-control',
 'https://tienganhtflat.com/blog/toeic-words-shopping',
 'https://tienganhtflat.com/blog/toeic-words-health',
 'https://tienganhtflat.com/blog/toeic-words-warranties',
 'https://tienganhtflat.com/blog/toeic-words-renting-and-leasing',
 'https://tienganhtflat.com/blog/toeic-words-eating-out',
 'https://tienganhtflat.com/blog/toeic-words-promotions-pensions-award',
 'https://tienganhtflat.com/blog/toeic-words-apply-and-interviewing',
 'https://tienganhtflat.com/blog/toeic-words-product-development',
 'https://tienganhtflat.com/blog/toeic-words-salaries-benefits',
 'https://tienganhtflat.com/blog/toeic-words-dentist-s-offi

## Playing around a topic

In [7]:
# Extract word list for a topic url
# res = requests.get(topic_urls[0])
res = requests.get('https://tienganhtflat.com/blog/toeic-words-quality-control')
soup = BeautifulSoup(res.text, parser='html.parser')
words_list = soup.findAll('div', attrs={'class': 'col-md-12 item-content clearfix'})

In [8]:
len(words_list)

12

In [9]:
# Regex pattern for a word
pronounce_pattern = re.compile('/(.*)/')
type_pattern = re.compile('\((.*)\)')
ex_pattern = re.compile('<b>Ex:</b>(.*)\n?<br/><i>')
explain_pattern = re.compile('<b>Giải thích:</b>(.*)<br/><b>Ex:</b>')

In [51]:
def safe_regex_extract(pattern, string, group):
    match = re.search(pattern, string)
    if match:
        return match.group(group).strip()
    else:
        return np.nan

In [52]:
def extract_word(word):
    p_1 = word.find('p').text
    p_2 = word.findAll('p')[-1].__str__()
    
    # Extract with soup
    image_url = word.find('img').get('src')
    english = word.find('b').text
    vietnamese = str(word.find('i').text).capitalize()
    mp3_url = word.find('a', attrs={'href': re.compile('\.mp3$')}).get('href')
    example_vietnamese = word.findAll('p')[-1].find('i').text
    
    # Extract with regex
    example = safe_regex_extract(ex_pattern, p_2, 1)
    type = safe_regex_extract(type_pattern, p_1, 1)
    pronounce = safe_regex_extract(pronounce_pattern, p_1, 0)
    explain = safe_regex_extract(explain_pattern, p_2, 1)
    
    data = {
        'english': english,
        'type': type,
        'vietnamese': vietnamese,
        'pronounce': pronounce,
        'explain': explain,
        'example': example,
        'example_vietnamese': example_vietnamese,
        'image_url': image_url,
        'mp3_url': mp3_url,
    }
    
    return data

In [62]:
word = extract_word(words_list[0])

In [63]:
pprint(word, sort_dicts=False)

{'english': 'hospital',
 'type': 'n.',
 'vietnamese': 'Bệnh viện',
 'pronounce': '/ˈhɒspɪt(ə)l/',
 'explain': nan,
 'example': 'This hospital is very large.',
 'example_vietnamese': 'Bệnh viện này là rất lớn.',
 'image_url': 'https://audio.tflat.vn/data/cache/images/300x225/h/o/hospitals1.jpg',
 'mp3_url': 'https://audio.tflat.vn/audio/h/o/hospital.mp3'}


## Scraping in bulk

In [13]:
word_data = []
total_topics = len(topic_urls)
for index, topic_url in enumerate(topic_urls):
    res = requests.get(topic_url)
    soup = BeautifulSoup(res.text, parser='html.parser')
    words_list = soup.findAll('div', attrs={'class': 'col-md-12 item-content clearfix'})
    topic_name = soup.find('h1').text
    
    for word in words_list:
        data = extract_word(word)
        data['topic'] = topic_name
        data['topic_url'] = topic_url
        word_data.append(data)
        
    print(f"[{index+1}/{total_topics}] {topic_name}")

[1/50] TOEIC WORDS - Computers and the Internet
[2/50] TOEIC WORDS - Marketing
[3/50] TOEIC WORDS - Accounting
[4/50] TOEIC WORDS - Doctor's Office
[5/50] TOEIC WORDS - Investment
[6/50] TOEIC WORDS - Quality Control
[7/50] TOEIC WORDS - Shopping
[8/50] TOEIC WORDS - Health
[9/50] TOEIC WORDS - Warranties
[10/50] TOEIC WORDS - Renting and Leasing
[11/50] TOEIC WORDS - Eating Out
[12/50] TOEIC WORDS - Promotions, Pensions & Award
[13/50] TOEIC WORDS - Apply and Interviewing
[14/50] TOEIC WORDS - Product Development
[15/50] TOEIC WORDS - Salaries & Benefits
[16/50] TOEIC WORDS - Dentist's Office
[17/50] TOEIC WORDS - Media
[18/50] TOEIC WORDS - Music
[19/50] TOEIC WORDS - Conference
[20/50] TOEIC WORDS - Trains
[21/50] TOEIC WORDS - Ordering Lunch
[22/50] TOEIC WORDS - Contracts
[23/50] TOEIC WORDS - Office Technology
[24/50] TOEIC WORDS - Theater
[25/50] TOEIC WORDS - Office Procedures
[26/50] TOEIC WORDS - Pharmacy
[27/50] TOEIC WORDS - Selecting A Restaurant
[28/50] TOEIC WORDS - Cook

In [14]:
df = pd.DataFrame(word_data)

In [15]:
df.shape[0]

615

In [16]:
df.head(30)

Unnamed: 0,english,type,vietnamese,pronounce,explain,example,example_vietnamese,image_url,mp3_url,topic,topic_url
0,access,n.,"Quyền truy cập, sự tiếp cận",/ˈækses/,The opportunity or right to use something,I cannot easily get access to the Internet.,Tôi không thể dễ dàng có được quyền truy cập v...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/a/c/access.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
1,allocate,v.,"Cấp cho, phân phối",/ˈæləkeɪt/,To give something officially to somebody / som...,The office manager did not allocate enough mon...,Người quản lý văn phòng không cấp đủ tiền để m...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/a/l/allocate.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
2,compatible,adj.,"Tương thích, hợp nhau",/kəm'pætəbl/,Able to be used together,Because my girlfriend and I listen to the same...,Vì tôi và bạn gái tôi cùng nghe nhạc của những...,https://audio.tflat.vn/data/images_example/300...,https://audio.tflat.vn/audio/c/o/compatible.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
3,delete,v.,"Xóa đi, bỏ đi",/di'li:t/,To remove something that has been written or p...,The technicians deleted all the data on the di...,Kỹ thuật viên đã vô tình xóa mọi dữ liệu trên ...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/d/e/delete.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
4,display,v.,"Hiển thị, trình bày",/dis'plei/,To put something in a place where people can s...,The accounting program displays a current bala...,Chương trình kế toán hiển thị một con số cân b...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/d/i/display.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
5,duplicate,v.,"Sao lại, nhân đôi",/'dju:plikit/,To make an extract copy of something,I think the new word processing program will d...,Tôi nghĩ rằng chương trình xử lý văn bản mới s...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/d/u/duplicate.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
6,failure,n.,"Trượt, thất bại",/'feiljə/,Lack of success in doing or achieving something,The success or failure of the plan depends on ...,Kế hoạch thành công hay thất bại là phụ thuộc ...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/f/a/failure.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
7,figure out,v.,"Đoán ra, giải ra",/ˈfɪɡə/,To calculate an amount or the cost,"By examining all of the errors, the technician...","Bằng cách xem xét mọi lỗi, các kỹ thuật viên đ...",https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/f/i/figure_out.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
8,ignore,v.,"Bỏ qua, phớt lờ",/ig'nɔ:/,To pay no attention to something,He ignored all the things she said.,Anh ấy phớt lờ tất cả những gì cô ấy nói.,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/i/g/ignore.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...
9,search,n.,"Cuộc tìm kiếm, thăm dò",/sə:tʃ/,"An attemp to find somebody / something, especi...",Our search of the database produced very littl...,Việc tìm kiếm trong cơ sở dữ liệu của chúng tô...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/s/e/search.mp3,TOEIC WORDS - Computers and the Internet,https://tienganhtflat.com/blog/toeic-words-com...


In [17]:
df['topic'] = df['topic'].str.replace('TOEIC WORDS - ', '')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   english             615 non-null    object
 1   type                612 non-null    object
 2   vietnamese          615 non-null    object
 3   pronounce           610 non-null    object
 4   explain             586 non-null    object
 5   example             615 non-null    object
 6   example_vietnamese  615 non-null    object
 7   image_url           615 non-null    object
 8   mp3_url             615 non-null    object
 9   topic               615 non-null    object
 10  topic_url           615 non-null    object
dtypes: object(11)
memory usage: 53.0+ KB


In [64]:
df.to_excel('data/toeic_600_words.xlsx', index=False)
df.to_csv('data/toeic_600_words.csv', index=False, encoding='utf-8-sig')

## Scraping media

In [34]:
# Preparing folders
media_folder = "media"
images_folder = "media/images"
audio_folder = "media/audio"

os.makedirs(media_folder, exist_ok=True)
os.makedirs(images_folder, exist_ok=True)
os.makedirs(audio_folder, exist_ok=True)

for topic in df['topic'].unique():
    topic_image_folder = f"{images_folder}/{topic}"
    topic_audio_folder = f"{audio_folder}/{topic}"
    os.makedirs(topic_image_folder, exist_ok=True)
    os.makedirs(topic_audio_folder, exist_ok=True)

In [30]:
def download_file(url, folder):
    file_name = url.split('/')[-1]
    res = requests.get(url, stream=True)
    if res.status_code == 200:
        with open(f"{folder}/{file_name}", "wb") as f:
            for chunk in res.iter_content(chunk_size=1024): # 1KB
                f.write(chunk)
        print(f"[Downloaded] {folder}/{file_name}")
    else:
        print(f"[Error] {url}")

In [37]:
for index, row in df.iterrows():
    topic_image_folder = f"{images_folder}/{row.topic}"
    topic_audio_folder = f"{audio_folder}/{row.topic}"
    download_file(row.image_url, topic_image_folder)
    download_file(row.mp3_url, topic_audio_folder)

[Downloaded] media/images/Computers and the Internet/access1.jpg
[Downloaded] media/audio/Computers and the Internet/access.mp3
[Downloaded] media/images/Computers and the Internet/allocate1.png
[Downloaded] media/audio/Computers and the Internet/allocate.mp3
[Downloaded] media/images/Computers and the Internet/because_my_ex1_56248d157f8b9a040dc65e00.jpg
[Downloaded] media/audio/Computers and the Internet/compatible.mp3
[Downloaded] media/images/Computers and the Internet/delete2.jpg
[Downloaded] media/audio/Computers and the Internet/delete.mp3
[Downloaded] media/images/Computers and the Internet/display1.jpg
[Downloaded] media/audio/Computers and the Internet/display.mp3
[Downloaded] media/images/Computers and the Internet/duplicate2.jpg
[Downloaded] media/audio/Computers and the Internet/duplicate.mp3
[Downloaded] media/images/Computers and the Internet/failure1.png
[Downloaded] media/audio/Computers and the Internet/failure.mp3
[Downloaded] media/images/Computers and the Internet/f

## Doing some statistic

In [48]:
df['type'].value_counts(normalize=True) * 100

type
n.         41.666667
v.         37.418301
adj.       13.235294
adv.        6.372549
n,v.        0.326797
perp.       0.163399
v, n.       0.163399
phr.v.      0.163399
phr. v.     0.163399
n, v.       0.163399
n.ph.       0.163399
Name: proportion, dtype: float64

In [41]:
topic_word_count = df['topic'].value_counts()

In [49]:
topic_word_count.describe()

count    50.00000
mean     12.30000
std       0.46291
min      12.00000
25%      12.00000
50%      12.00000
75%      13.00000
max      13.00000
Name: count, dtype: float64