# TOEIC 600 Words Scraped Dataset

Learning English can be done in many ways, and not everyone wants to rely solely on language learning apps. Some prefer to access vocabulary datasets and integrate them into their own tools, such as Anki.

To support those looking for a structured TOEIC 600 words dataset and to enhance my data scraping skills, I created this project.

## Finding all page URLs

In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
from pprint import pprint
import os

In [2]:
domain = 'https://tienganhtflat.com'

# Finding all page URLs
toeic_cat = 'https://tienganhtflat.com/blog/cat/tu-vung-toeic'
res = requests.get(toeic_cat)
soup = BeautifulSoup(res.text, parser='html.parser')
page_urls = soup.findAll('a', attrs={'data-page': True}) # Page link has attribute "data-page" in <a> tag

# Get links > Remove duplicated links > Sort links
page_urls = sorted(list(set(f"{domain}{page.get('href')}" for page in page_urls)))

In [3]:
pprint(page_urls)

['https://tienganhtflat.com/blog/cat/tu-vung-toeic?page=1&per-page=18',
 'https://tienganhtflat.com/blog/cat/tu-vung-toeic?page=2&per-page=18',
 'https://tienganhtflat.com/blog/cat/tu-vung-toeic?page=3&per-page=18']


## Finding all topic urls

In [4]:
def get_topic_urls(page_url):
    res = requests.get(page_url)
    soup = BeautifulSoup(res.text, parser='html.parser')
    topics = soup.findAll('a', {'href': re.compile('^/blog/toeic-words')}) # Topic link has href start with "/blog/toeic-words"
    topics = [f"{domain}{topic.get('href')}" for topic in topics]
    return topics

topic_urls = []
for page_url in page_urls:
    page_topic_urls = get_topic_urls(page_url)
    topic_urls.append(page_topic_urls)

# Remove duplicated topic links
topic_urls = list(set(sum(topic_urls, [])))

In [5]:
print('Total topics:', len(topic_urls))

Total topics: 50


In [6]:
pprint(topic_urls)

['https://tienganhtflat.com/blog/toeic-words-inventory',
 'https://tienganhtflat.com/blog/toeic-words-theater',
 'https://tienganhtflat.com/blog/toeic-words-dentist-s-office',
 'https://tienganhtflat.com/blog/toeic-words-museums',
 'https://tienganhtflat.com/blog/toeic-words-business-planning',
 'https://tienganhtflat.com/blog/toeic-words-ordering-lunch',
 'https://tienganhtflat.com/blog/toeic-words-media',
 'https://tienganhtflat.com/blog/toeic-words-cooking-as-a-career',
 'https://tienganhtflat.com/blog/toeic-words-selecting-a-restaurant',
 'https://tienganhtflat.com/blog/toeic-words-office-procedures',
 'https://tienganhtflat.com/blog/toeic-words-promotions-pensions-award',
 'https://tienganhtflat.com/blog/toeic-words-correspondence',
 'https://tienganhtflat.com/blog/toeic-words-health',
 'https://tienganhtflat.com/blog/toeic-words-financial-statements',
 'https://tienganhtflat.com/blog/toeic-words-ordering-supplies',
 'https://tienganhtflat.com/blog/toeic-words-computers-and-the-in

## Playing around a topic

In [7]:
# Extracting word list in a topic url
res = requests.get(topic_urls[0])
soup = BeautifulSoup(res.text, parser='html.parser')
words_list = soup.findAll('div', attrs={'class': 'col-md-12 item-content clearfix'})

In [8]:
print('Total words:', len(words_list))

Total words: 12


In [9]:
# Preparing Regex pattern for a word
type_pattern = re.compile('\((.*)\)')
pronounce_pattern = re.compile('/(.*)/')
explain_pattern = re.compile('<b>Giải thích:</b>(.*)<br/><b>Ex:</b>')
example_pattern = re.compile('<b>Ex:</b>(.*)\n?<br/><i>')

In [10]:
# Crating a function to extract with regex safely
def safe_regex_extract(pattern, string, group):
    match = re.search(pattern, string)
    if match:
        return match.group(group).strip()
    else:
        return np.nan

In [11]:
# The main function to extract data of a word
def extract_word(word):
    p_1 = word.find('p').text
    p_2 = word.findAll('p')[-1].__str__()
    
    # Extracting with soup
    image_url = word.find('img').get('src')
    english = word.find('b').text # English placed in <b> tag
    vietnamese = str(word.find('i').text).capitalize() # Vietnamese placed in the first <i> tag
    audio_url = word.find('a', attrs={'href': re.compile('\.mp3$')}).get('href') # Audio URL has href end with ".mp3"
    example_vietnamese = word.findAll('p')[-1].find('i').text # Example placed in <i> tag of the second <p> tag
    
    # Extracting with regex
    type = safe_regex_extract(type_pattern, p_1, 1)
    pronounce = safe_regex_extract(pronounce_pattern, p_1, 0)
    explain = safe_regex_extract(explain_pattern, p_2, 1)
    example = safe_regex_extract(example_pattern, p_2, 1)
    
    # Arrange data
    data = {
        'english': english,
        'type': type,
        'vietnamese': vietnamese,
        'pronounce': pronounce,
        'explain': explain,
        'example': example,
        'example_vietnamese': example_vietnamese,
        'image_url': image_url,
        'audio_url': audio_url,
    }
    
    return data

In [12]:
# Testing our functions
word = extract_word(words_list[0])
pprint(word, sort_dicts=False)

{'english': 'adjustment',
 'type': 'n.',
 'vietnamese': 'Điều chỉnh, chỉnh lý',
 'pronounce': "/ə'dʤʌstmənt/",
 'explain': 'a small change made to something in order to correct or improve '
            'it',
 'example': "I've made a few adjustments to the design.",
 'example_vietnamese': 'Tôi đã thực hiện một vài điều chỉnh để thiết kế.',
 'image_url': 'https://audio.tflat.vn/data/cache/images/300x225/a/d/adjustment1.png',
 'audio_url': 'https://audio.tflat.vn/audio/a/d/adjustment.mp3'}


## Scraping in bulk

In [13]:
word_data = []
total_topics = len(topic_urls)
for index, topic_url in enumerate(topic_urls):
    res = requests.get(topic_url)
    soup = BeautifulSoup(res.text, parser='html.parser')
    words_list = soup.findAll('div', attrs={'class': 'col-md-12 item-content clearfix'})
    topic_name = soup.find('h1').text
    
    for word in words_list:
        data = extract_word(word)
        data['topic'] = topic_name
        data['topic_url'] = topic_url
        word_data.append(data)
        
    print(f"[{index+1}/{total_topics}] {topic_name}")

[1/50] TOEIC WORDS - Inventory
[2/50] TOEIC WORDS - Theater
[3/50] TOEIC WORDS - Dentist's Office
[4/50] TOEIC WORDS - Museums
[5/50] TOEIC WORDS - Business Planning
[6/50] TOEIC WORDS - Ordering Lunch
[7/50] TOEIC WORDS - Media
[8/50] TOEIC WORDS - Cooking As A Career
[9/50] TOEIC WORDS - Selecting A Restaurant
[10/50] TOEIC WORDS - Office Procedures
[11/50] TOEIC WORDS - Promotions, Pensions & Award
[12/50] TOEIC WORDS - Correspondence
[13/50] TOEIC WORDS - Health
[14/50] TOEIC WORDS - Financial Statements
[15/50] TOEIC WORDS - Ordering Supplies
[16/50] TOEIC WORDS - Computers and the Internet
[17/50] TOEIC WORDS - Invoice
[18/50] TOEIC WORDS - Shipping
[19/50] TOEIC WORDS - Taxes
[20/50] TOEIC WORDS - Quality Control
[21/50] TOEIC WORDS - Banking
[22/50] TOEIC WORDS - Shopping
[23/50] TOEIC WORDS - Marketing
[24/50] TOEIC WORDS - Property & Departments
[25/50] TOEIC WORDS - Office Technology
[26/50] TOEIC WORDS - Conference
[27/50] TOEIC WORDS - Events
[28/50] TOEIC WORDS - Hiring a

In [14]:
df = pd.DataFrame(word_data)

In [15]:
print('Total words:', df.shape[0])

Total words: 615


In [16]:
# Preview 30 rows
df.head(30)

Unnamed: 0,english,type,vietnamese,pronounce,explain,example,example_vietnamese,image_url,audio_url,topic,topic_url
0,adjustment,n.,"Điều chỉnh, chỉnh lý",/ə'dʤʌstmənt/,a small change made to something in order to c...,I've made a few adjustments to the design.,Tôi đã thực hiện một vài điều chỉnh để thiết kế.,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/a/d/adjustment.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
1,automatically,adv.,‹một cách› tự động,/ˌɔːtəˈmætɪkli/,having controls that work without needing a pe...,The door opens automatically.,Cửa mở một cách tự động.,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/a/u/automatically...,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
2,crucial,adj.,"Cốt yếu, chủ yếu, có tính quyết định",/'kru:ʃjəl/,"extremely important, because it will affect ot...",Inventory is a crucial process and must be tak...,Kiểm kê là một quá trình cốt yếu và phải được ...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/c/r/crucial.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
3,discrepancy,n.,"‹sự› khác nhau, trái ngược nhau",/dis'krepənsi/,a difference between two or more things that s...,We easily explained the discrepancy between th...,Chúng tôi dễ dàng giải thích sự khác nhau giữa...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/d/i/discrepancy.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
4,disturb,v.,"Quấy rầy, làm phiền",/dis'tə:b/,to interrupt somebody when they are trying to ...,Let's see how many products we can count in ad...,Hãy xem có bao nhiêu sản phẩm chúng ta có thể ...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/d/i/disturb.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
5,liability,n.,"Nguy cơ, điều gây khó khăn trở ngại","/,laiə'biliti/",the state of being legally responsible for som...,The slippery steps were a terrible liability f...,Những bậc thềm trơn trượt là một nguy cơ tệ hạ...,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/l/i/liability.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
6,reflection,n.,"‹sự› phản chiếu, phản xạ, hình ảnh",/rɪˈflekʃn/,a sign that shows the state or nature of somet...,She saw her reflection in the mirror.,Cô ấy nhìn ảnh của mình ở trong gương.,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/r/e/reflection.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
7,run,v.,"Chạy, vận hành",/rʌn/,to operate or function,"As long as the computer is running, you can ke...","Trong khi máy tính đang chạy, bạn có thể thực ...",https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/r/u/run.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
8,scan,v.,"Xem lướt, xem qua",/skæn/,"to look at every part of something carefully, ...",He scanned through the newspaper over breakfast.,Anh ấy đã xem qua tờ báo trong lúc ăn sáng.,https://audio.tflat.vn/data/images_example/300...,https://audio.tflat.vn/audio/s/c/scan.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...
9,subtract,v.,"Trừ đi, khấu trừ; loại ra, lấy ra khỏi",/səb'trækt/,to take a number or an amount away from anothe...,6 subtracted from 9 is 3.,9 trừ 6 được 3.,https://audio.tflat.vn/data/cache/images/300x2...,https://audio.tflat.vn/audio/s/u/subtract.mp3,TOEIC WORDS - Inventory,https://tienganhtflat.com/blog/toeic-words-inv...


In [17]:
# Removing unnecessary text in topic name
df['topic'] = df['topic'].str.replace('TOEIC WORDS - ', '')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   english             615 non-null    object
 1   type                612 non-null    object
 2   vietnamese          615 non-null    object
 3   pronounce           610 non-null    object
 4   explain             586 non-null    object
 5   example             615 non-null    object
 6   example_vietnamese  615 non-null    object
 7   image_url           615 non-null    object
 8   audio_url           615 non-null    object
 9   topic               615 non-null    object
 10  topic_url           615 non-null    object
dtypes: object(11)
memory usage: 53.0+ KB


In [19]:
df.to_excel('data/toeic_600_words.xlsx', index=False)
df.to_csv('data/toeic_600_words.csv', index=False, encoding='utf-8-sig')

## Scraping media

In [20]:
# Preparing folders
media_folder = "media"
images_folder = "media/images"
audio_folder = "media/audio"

os.makedirs(media_folder, exist_ok=True)
os.makedirs(images_folder, exist_ok=True)
os.makedirs(audio_folder, exist_ok=True)

for topic in df['topic'].unique():
    topic_image_folder = f"{images_folder}/{topic}"
    topic_audio_folder = f"{audio_folder}/{topic}"
    os.makedirs(topic_image_folder, exist_ok=True)
    os.makedirs(topic_audio_folder, exist_ok=True)

In [21]:
# The main function to download file
def download_file(url, folder):
    file_name = url.split('/')[-1]
    res = requests.get(url, stream=True)
    if res.status_code == 200:
        with open(f"{folder}/{file_name}", "wb") as f:
            for chunk in res.iter_content(chunk_size=1024): # 1KB
                f.write(chunk)
        print(f"[Downloaded] {folder}/{file_name}")
    else:
        print(f"[Error] {url}")

In [22]:
# Downloading media
for index, row in df.iterrows():
    topic_image_folder = f"{images_folder}/{row.topic}"
    topic_audio_folder = f"{audio_folder}/{row.topic}"
    download_file(row.image_url, topic_image_folder)
    download_file(row.audio_url, topic_audio_folder)

[Downloaded] media/images/Inventory/adjustment1.png
[Downloaded] media/audio/Inventory/adjustment.mp3
[Downloaded] media/images/Inventory/automatically1.jpg
[Downloaded] media/audio/Inventory/automatically.mp3
[Downloaded] media/images/Inventory/crucial1.png
[Downloaded] media/audio/Inventory/crucial.mp3
[Downloaded] media/images/Inventory/discrepancy1.jpg
[Downloaded] media/audio/Inventory/discrepancy.mp3
[Downloaded] media/images/Inventory/disturb1.png
[Downloaded] media/audio/Inventory/disturb.mp3
[Downloaded] media/images/Inventory/liability1.png
[Downloaded] media/audio/Inventory/liability.mp3
[Downloaded] media/images/Inventory/reflection2.jpg
[Downloaded] media/audio/Inventory/reflection.mp3
[Downloaded] media/images/Inventory/run1.png
[Downloaded] media/audio/Inventory/run.mp3
[Downloaded] media/images/Inventory/he_scanned_ex1_56248d157f8b9a040dc65f84.png
[Downloaded] media/audio/Inventory/scan.mp3
[Downloaded] media/images/Inventory/subtract1.png
[Downloaded] media/audio/Inven

## Doing some statistic

In [23]:
df['type'].value_counts(normalize=True) * 100

type
n.         41.666667
v.         37.418301
adj.       13.235294
adv.        6.372549
n,v.        0.326797
phr.v.      0.163399
n, v.       0.163399
n.ph.       0.163399
perp.       0.163399
v, n.       0.163399
phr. v.     0.163399
Name: proportion, dtype: float64

In [24]:
topic_word_count = df['topic'].value_counts()

In [25]:
topic_word_count.describe()

count    50.00000
mean     12.30000
std       0.46291
min      12.00000
25%      12.00000
50%      12.00000
75%      13.00000
max      13.00000
Name: count, dtype: float64