In [1]:
#!pip install selenium

### Magoosh 600 (Web Scraping)
https://toefl.magoosh.com/flashcards/vocabulary/decks

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import numpy as np
import csv
import re

In [3]:
class MagooshDataRetriever:
    def __init__(self):
        self.data = []
    
    def start_crawling(self, initial_url):
        # Set up the Selenium WebDriver
        driver = webdriver.Chrome()

        try:
            driver.get(initial_url)
            visited_words = set()

            while True:
                flashcard = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "flashcard"))
                )

                front_content = flashcard.find_element(By.CLASS_NAME, "front")
                word = front_content.find_element(By.CLASS_NAME, "flashcard-word").text.strip()

                if word in visited_words:
                    print("Already visited")
                    break

                visited_words.add(word)

                # Click the "Click to see definition and example" button
                see_definition_button = WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.CSS_SELECTOR, "a.card-footer"))
                )
                see_definition_button.click()

                self.extract_flashcard_data(flashcard)

                # Click the "I knew this word" button
                next_button = WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.CSS_SELECTOR, "a.card-footer.card-footer-success.text-center"))
                )
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a.card-footer.card-footer-success.text-center"))
                )
                driver.execute_script("arguments[0].click();", next_button)

                # Wait for the next flashcard to load
                WebDriverWait(driver, 10).until(
                    EC.staleness_of(flashcard)
                )

        except Exception as e:
            print("An error occurred:", str(e))

        finally:
            driver.quit()

        # Create a DataFrame from the data
        df = pd.DataFrame(self.data, columns=['word', 'definition', 'part of speech', 'example'])
        df['source'] = 'Magoosh'
        return df

    def extract_flashcard_data(self, flashcard):
        back_content = flashcard.find_element(By.CLASS_NAME, "back")
        word = back_content.find_element(By.CLASS_NAME, "flashcard-word").text.strip()
        
        definition_elements = back_content.find_elements(By.CLASS_NAME, "flashcard-text")
        pos_list = []
        definitions = []
        for element in definition_elements:
            text = element.text.strip()
            pos, definition = self.extract_pos_and_definition(text)
            pos_list.append(pos)
            definitions.append(definition)
        
        example_elements = back_content.find_elements(By.CLASS_NAME, "flashcard-example")
        examples = ' '.join([element.text.strip() for element in example_elements])
        
        # Append the data to the data list
        self.data.append([word, ' '.join(definitions), ', '.join(pos_list), examples])
        
    @staticmethod
    def extract_pos_and_definition(text):
        parts = text.split(':')
        pos = parts[0].strip()
        definition = ':'.join(parts[1:]).strip()
        return pos, definition

In [4]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/common-1/issue'
retriever = MagooshDataRetriever()
df = retriever.start_crawling(initial_url)
print(df)

Already visited
             word                                         definition  \
0           issue       a topic or situation to talk about a problem   
1          modify                    to make small changes or add to   
2       establish  to build, to create (a system, organization, o...   
3         concept                                   an abstract idea   
4         overall  looking at or including all pieces or factors ...   
5          assess  to look closely at and figure out the value or...   
6         whereas   although (used to contrast two different things)   
7         predict         to say something will happen in the future   
8          adjust  to change or move a little and make better or ...   
9        approach  to move closer to something or someone a way t...   
10       contrast  to be clearly different from to look specifica...   
11         method                     how to do something, a process   
12       instance                               

In [5]:
df.head()

Unnamed: 0,word,definition,part of speech,example,source
0,issue,a topic or situation to talk about a problem,"noun, noun","In order to define what jazz music is, we need...",Magoosh
1,modify,to make small changes or add to,verb,The first bicycles were slow and difficult to ...,Magoosh
2,establish,"to build, to create (a system, organization, o...",verb,The U.S. government was established in the lat...,Magoosh
3,concept,an abstract idea,noun,Some concepts within advanced mathematics are ...,Magoosh
4,overall,looking at or including all pieces or factors ...,"adjective, adverb","Some words might be new and difficult, but the...",Magoosh


In [6]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/easy/react'
retriever = MagooshDataRetriever()
df_e = retriever.start_crawling(initial_url)

df_e.head()

Already visited


Unnamed: 0,word,definition,part of speech,example,source
0,react,to respond; to act in answer to,verb,"Mothers react quickly to their babies' cries, ...",Magoosh
1,complex,complicated; having many different parts and d...,adjective,The science behind the atomic bomb used in Wor...,Magoosh
2,comment,"an observation, note, or response to say a sho...","noun, verb",The professor was angry about the negative com...,Magoosh
3,achieve,to finally do something you have been trying h...,verb,The best students know that to achieve good gr...,Magoosh
4,participate,to join or take part (in an activity),verb,The teacher was surprised when every student p...,Magoosh


In [7]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/medium/potential'
retriever = MagooshDataRetriever()
df_m = retriever.start_crawling(initial_url)

Already visited


In [8]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/medium-2/absent'
retriever = MagooshDataRetriever()
df_m2 = retriever.start_crawling(initial_url)

Already visited


In [9]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/medium-3/bulky'
retriever = MagooshDataRetriever()
df_m3 = retriever.start_crawling(initial_url)

Already visited


In [10]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/hard/deduce'
retriever = MagooshDataRetriever()
df_h = retriever.start_crawling(initial_url)

Already visited


In [11]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/hard-2/recede'
retriever = MagooshDataRetriever()
df_h2 = retriever.start_crawling(initial_url)

Already visited


In [12]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/hard-3/fracture'
retriever = MagooshDataRetriever()
df_h3 = retriever.start_crawling(initial_url)

Already visited


In [13]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/hard-4/rural'
retriever = MagooshDataRetriever()
df_h4 = retriever.start_crawling(initial_url)

Already visited


In [14]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/very-hard-1/overshadow'
retriever = MagooshDataRetriever()
df_vh = retriever.start_crawling(initial_url)

Already visited


In [15]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/very-hard-2/jeopardy'
retriever = MagooshDataRetriever()
df_vh2 = retriever.start_crawling(initial_url)

Already visited


In [16]:
initial_url = 'https://toefl.magoosh.com/flashcards/vocabulary/very-hard-3/retract-efl'
retriever = MagooshDataRetriever()
df_vh3 = retriever.start_crawling(initial_url)

Already visited


In [17]:
toefl_vocab = pd.concat([df, df_e, df_m, df_m2, df_m3, df_h, df_h2, df_h3, df_h4, df_vh, df_vh2, df_vh3])

toefl_vocab.info()

<class 'pandas.core.frame.DataFrame'>
Index: 583 entries, 0 to 49
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   word            583 non-null    object
 1   definition      583 non-null    object
 2   part of speech  583 non-null    object
 3   example         583 non-null    object
 4   source          583 non-null    object
dtypes: object(5)
memory usage: 27.3+ KB


In [18]:
toefl_vocab.head()

Unnamed: 0,word,definition,part of speech,example,source
0,issue,a topic or situation to talk about a problem,"noun, noun","In order to define what jazz music is, we need...",Magoosh
1,modify,to make small changes or add to,verb,The first bicycles were slow and difficult to ...,Magoosh
2,establish,"to build, to create (a system, organization, o...",verb,The U.S. government was established in the lat...,Magoosh
3,concept,an abstract idea,noun,Some concepts within advanced mathematics are ...,Magoosh
4,overall,looking at or including all pieces or factors ...,"adjective, adverb","Some words might be new and difficult, but the...",Magoosh


In [19]:
toefl_vocab.to_csv('magoosh_600_raw.csv',index=False)

### Barron 450 from Quizlet
https://quizlet.com/16271832/barrons-450-essential-words-for-the-toefl-flash-cards/

In [20]:
df_barron = pd.read_csv('./barron_raw.csv',header=None)
df_barron = df_barron.rename(columns={0:'word',1:'definition'})

df_barron.head()

Unnamed: 0,word,definition
0,aggravating,irritating
1,adj. making worse; annoying,
2,amusement,diversion
3,n. something that holds interest and is enjoyable,
4,conceivably,possibly


In [21]:
df_barron.tail(10)

Unnamed: 0,word,definition
1161,potential,"n. an ability, happening, or opportunity that ..."
1162,syn. possibility,
1163,propose,v. to suggest or plan to do something
1164,syn. suggest,
1165,restore,v. to give back or bring back something; to re...
1166,syn. revitalize,
1167,turbulent,"adj. to be in a disordered, disturbed or unsta..."
1168,syn. chaotic,
1169,vital,adj. of great importance; full of life
1170,syn. indispensable,


In [22]:
df1 = df_barron.iloc[0:991]
df2 = df_barron.iloc[991::2]

df1.head(10)

Unnamed: 0,word,definition
0,aggravating,irritating
1,adj. making worse; annoying,
2,amusement,diversion
3,n. something that holds interest and is enjoyable,
4,conceivably,possibly
5,adv. feasibly; believable,
6,convert,alter
7,v. to change from one form or state to another,
8,curative,healing.
9,adj. being able to restore to good condition,


In [23]:
df1 = df1.drop(columns=['definition'])
df1 = df1.dropna()

even_rows = df1.iloc[1::2].reset_index(drop=True)
df1 = df1.iloc[::2].reset_index(drop=True)

for i in range(len(df1.columns)):
    df1['definition'] = even_rows.iloc[:, i]
    
df1.head(10)

Unnamed: 0,word,definition
0,aggravating,adj. making worse; annoying
1,amusement,n. something that holds interest and is enjoyable
2,conceivably,adv. feasibly; believable
3,convert,v. to change from one form or state to another
4,curative,adj. being able to restore to good condition
5,debilitating,adj. weakening
6,deplete,v. to use up; to reduce greatly
7,finite,adj. of a certain amount; having an end; not i...
8,perceive,v. to sense; to become aware of
9,security,"n. the feeling of freedom from danger, doubt, ..."


In [24]:
df2.head(10)

Unnamed: 0,word,definition
991,ambiguous,adj. of unclear meaning; something that can be...
993,arbitrary,adj. an action or decision made with little th...
995,assert,v. to express or defend oneself strongly; to s...
997,astounding,adj. very surprising
999,astute,"adj. very intelligent, smart, clever"
1001,concur,v. to have the same opinion or draw the same c...
1003,deceptively,adv. making something appear true or good when...
1005,designate,"v. to specify, name, or select to do to a task..."
1007,determined,"adj. strong in one's opinion, firm in convicti..."
1009,elicit,v. to get the facts or withdraw out the truth


In [25]:
df = pd.concat([df1, df2], ignore_index=True)

df.head(10)

Unnamed: 0,word,definition
0,aggravating,adj. making worse; annoying
1,amusement,n. something that holds interest and is enjoyable
2,conceivably,adv. feasibly; believable
3,convert,v. to change from one form or state to another
4,curative,adj. being able to restore to good condition
5,debilitating,adj. weakening
6,deplete,v. to use up; to reduce greatly
7,finite,adj. of a certain amount; having an end; not i...
8,perceive,v. to sense; to become aware of
9,security,"n. the feeling of freedom from danger, doubt, ..."


In [26]:
df['part of speech'] = df['definition'].apply(lambda x: x.split('.')[0] if '.' in x else np.NaN)
df['definition'] = df['definition'].apply(lambda x: ' '.join(x.split('.')[1:]) if '.' in x else x)
df['source'] = 'Barron'

df.head()

Unnamed: 0,word,definition,part of speech,source
0,aggravating,making worse; annoying,adj,Barron
1,amusement,something that holds interest and is enjoyable,n,Barron
2,conceivably,feasibly; believable,adv,Barron
3,convert,to change from one form or state to another,v,Barron
4,curative,being able to restore to good condition,adj,Barron


In [27]:
df = df.dropna(axis=0)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 449 entries, 0 to 449
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   word            449 non-null    object
 1   definition      449 non-null    object
 2   part of speech  449 non-null    object
 3   source          449 non-null    object
dtypes: object(4)
memory usage: 17.5+ KB


In [28]:
df['part of speech'].unique()

array(['adj', 'n', 'adv', 'v', 'a', 'av', 'conj', 'prep'], dtype=object)

In [29]:
# Mapping of abbreviations to their full names
abbreviation_mapping = {
    'adj': 'adjective',
    'n': 'noun',
    'adv': 'adverb',
    'v': 'verb',
    'a': 'article',
    'av': 'adverb',
    'conj': 'conjunction',
    'prep': 'preposition'
}

df['part of speech'] = df['part of speech'].map(abbreviation_mapping)

df.head()

Unnamed: 0,word,definition,part of speech,source
0,aggravating,making worse; annoying,adjective,Barron
1,amusement,something that holds interest and is enjoyable,noun,Barron
2,conceivably,feasibly; believable,adverb,Barron
3,convert,to change from one form or state to another,verb,Barron
4,curative,being able to restore to good condition,adjective,Barron


In [30]:
df.to_csv('barron_450_raw.csv',index=False)

### Quizlet TOEFL Vocabulary Practice
https://quizlet.com/exams/toefl/toefl-vocabulary-e473ccd-s01

In [31]:
def convert_txt_to_csv(input_file, output_file):
    with open(input_file, 'r') as file:
        content = file.read()

    elements = content.strip().split('\n')

    rows = []
    for i in range(0, len(elements), 2):
        if i + 1 < len(elements):
            word = elements[i].strip()
            definition = elements[i + 1].strip()

            # extract the POS from the definition
            match = re.match(r'\((\w+)\)', definition)
            if match:
                pos = match.group(1)
                definition = definition.replace(f'({pos})', '').strip()
            else:
                pos = ''
            rows.append([word, pos, definition])

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['word', 'part of speech', 'definition'])
        writer.writerows(rows)

In [32]:
convert_txt_to_csv('toefl_science.txt', 'toefl_science.csv')

In [33]:
df_s = pd.read_csv('./toefl_science.csv')

df_s.head()

Unnamed: 0,word,part of speech,definition
0,acid,adjective,Something that contains acid or tastes sour.
1,cosmic,adjective,Relating to or resembling the universe.
2,dilute,verb,To make something thinner by adding water or o...
3,flow,verb,A steady movement of a liquid/gas/electricity.
4,framework,noun,The structure underlying a theory or concept.


In [34]:
convert_txt_to_csv('toefl_history.txt', 'toefl_history.csv')

In [35]:
df_h = pd.read_csv('./toefl_science.csv')

In [36]:
df = pd.concat([df_s, df_h], ignore_index=True)
df['source'] = 'Quizlet'

df

Unnamed: 0,word,part of speech,definition,source
0,acid,adjective,Something that contains acid or tastes sour.,Quizlet
1,cosmic,adjective,Relating to or resembling the universe.,Quizlet
2,dilute,verb,To make something thinner by adding water or o...,Quizlet
3,flow,verb,A steady movement of a liquid/gas/electricity.,Quizlet
4,framework,noun,The structure underlying a theory or concept.,Quizlet
...,...,...,...,...
263,scale,noun,A machine that is used to make measurements.,Quizlet
264,scale,noun,A series of values used as a reference for mea...,Quizlet
265,solution,noun,"A mixture, usually liquid, in which one or mor...",Quizlet
266,specimen,noun,"A sample used for testing, study, or examination.",Quizlet


In [37]:
df.to_csv('quizlet_raw.csv',index=False)

### Merge dataframe

In [42]:
df_magoosh = pd.read_csv('./magoosh_600_raw.csv')
df_barron = pd.read_csv('./barron_450_raw.csv')
df_quizlet = pd.read_csv('./quizlet_raw.csv')

df = pd.concat([df_magoosh, df_barron, df_quizlet], ignore_index=True)
df.drop_duplicates(subset='word', inplace=True)
df['definition'] = df['definition'].str.lower()

df

Unnamed: 0,word,definition,part of speech,example,source
0,issue,a topic or situation to talk about a problem,"noun, noun","In order to define what jazz music is, we need...",Magoosh
1,modify,to make small changes or add to,verb,The first bicycles were slow and difficult to ...,Magoosh
2,establish,"to build, to create (a system, organization, o...",verb,The U.S. government was established in the lat...,Magoosh
3,concept,an abstract idea,noun,Some concepts within advanced mathematics are ...,Magoosh
4,overall,looking at or including all pieces or factors ...,"adjective, adverb","Some words might be new and difficult, but the...",Magoosh
...,...,...,...,...,...
1159,microscope,an instrument used for examining small objects...,noun,,Quizlet
1160,nucleus,an organelle that contains the genetic materia...,noun,,Quizlet
1161,scale,a machine that is used to make measurements.,noun,,Quizlet
1163,solution,"a mixture, usually liquid, in which one or mor...",noun,,Quizlet


In [44]:
df.to_csv('toefl_1000_raw.csv',index=False)