In [1]:
import pandas as pd
import requests
from lxml import html
import time

In [2]:
# Read the GRE word sheet without treating any row as a header, then trim
# surrounding whitespace from every text cell (non-string cells pass through).
words = pd.read_excel(
    'Words from GRE Vocabulary Assessment Videos.xlsx',
    header=None,
).applymap(lambda cell: cell.strip() if isinstance(cell, str) else cell)

In [3]:
# Presumably the number of word groups in the sheet and the words per group.
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook (get_my_set hard-codes 30) — confirm whether they are still needed.
max_groups = 28
max_set_words = 30

In [31]:
# Uncomment the next line to start from an empty table instead of the saved CSV.
# ankified = pd.DataFrame(columns=['words','definition','mnemonics','image_url'])

# Resume from the scrape results saved by the to_csv cell further down.
# NOTE(review): list-valued columns presumably come back from CSV as plain
# strings (e.g. "['...']") — confirm before treating them as lists.
ankified = pd.read_csv('anki_mnemonics.csv')

In [32]:
# Display the table that was just loaded from the CSV.
ankified

Unnamed: 0,words,definition,mnemonics,image_url,Set
0,abound,(verb) be abundant or plentiful; exist in larg...,['Bounded means limited. Abound means not limi...,[],Set 1
1,amorphous,(adj) having no definite form or distinct shape,"[""In Greek 'morphe' means shape/form.\r\nSo, a...",[],Set 1
2,austere,(adj) severely simple,['AUS-australia TERE-tear(sorrow). like the re...,[],Set 1
3,belie,(verb) be in contradiction with,"['belie has lie.', 'belie ~ lie means somethin...",[],Set 1
4,capricious,(adj) changeable,"[""ca(CAR) + PRIC(PRICE)...PRICE OF cars nowada...",[],Set 1
5,cerebral,(adj) involving intelligence rather than emoti...,"['cereBRAINal', 'CEREBRAL ASSASIN']",[],Set 1
6,congenial,(adj) suitable to your needs,"['Genelia Desouza was suitable, appropriate ,...",['https://dailyvocab.com/wp-content/uploads/20...,Set 1
7,conspicuous,(adj) obvious to the eye or mind,['Con(can)+ s(see) + the + pic(picture) --> ca...,[],Set 1
8,cursory,(adj) hasty and without attention to detail; n...,['relate it to the cursor on your computer scr...,[],Set 1
9,daunting,(adj) discouraging through fear,['DAUN SOUNDS LIKE DON\r\nNTING SOUNDS LIKE AC...,['https://dailyvocab.com/wp-content/uploads/20...,Set 1


In [4]:
def get_my_set(data, set_no, set_size=30):
    '''
    Extract the words of one group from the word sheet.

    The sheet is searched column by column for a cell equal to
    'Group <set_no>'. The words are read from the same column, starting two
    rows below that cell.

    Parameters: 
    
    data:- gregmat excel sheet dataframe
    set_no:- int, set no. which you want to get
    set_size:- int, number of words to read below the header (default 30)
    
    Returns: Series of up to set_size words, re-indexed from 0

    Raises: ValueError if no cell equal to 'Group <set_no>' exists
    '''
    column_name = 'Group ' + str(set_no)
    match_col = None
    match_row = None
    for col in data.columns:
        # Compare as strings so NaN / numeric cells can never match the label,
        # and use plain equality so the label is not interpreted as a regex.
        hits = (data[col].astype(str) == column_name).to_numpy().nonzero()[0]
        if len(hits):
            # As before, the last matching column / row wins.
            match_col = col
            match_row = int(hits[-1])

    if match_col is None:
        raise ValueError("'{}' was not found in the sheet".format(column_name))

    # match_row + 2 because we don't want (Group no# or Take Test no#)
    start = match_row + 2
    # .iloc makes the positional slicing explicit regardless of the index type.
    selected = data[match_col].iloc[start:start + set_size]
    return selected.reset_index(drop=True)

In [5]:
def get_def_mnemo(word, timeout=10):
    '''
    Scrape the definition and up to three mnemonics for a word from
    mnemonicdictionary.com.

    Parameters:

    word:- str, word to look up
    timeout:- seconds to wait for the site before requests gives up (default 10)

    Returns: (definition, mnemonics) where definition is a str and mnemonics
    is a list of at most 3 str

    Raises: IndexError if the page has fewer than 13 text nodes in the
    definition block (index 12 is read below).
    '''
    # Let requests build the query string so the word is URL-encoded.
    page = requests.get('https://mnemonicdictionary.com/', params={'word': word}, timeout=timeout)
    tree = html.fromstring(page.content)
    # The definition is taken from a fixed text-node position in the first
    # result card; this depends on the site's current markup.
    definition = tree.xpath('//li[@class="media list-group-item p-4"]/div[1]//text()')[12].strip()

    # Evaluate the XPath once and inspect only the first 7 text nodes. Slicing
    # never raises, so no exception handling is needed for short pages.
    candidates = tree.xpath('//div[@class="card-text"]/p//text()')[:7]
    mnemonics = []
    for text in candidates:
        mnemonic = text.strip()
        # Skip blank nodes and the site's footer/branding text.
        if mnemonic in ('', 'Powered by', 'Mnemonic Dictionary'):
            continue
        mnemonics.append(mnemonic)
        if len(mnemonics) == 3:
            break
    return definition, mnemonics

In [6]:
def get_image(word, timeout=10):
    '''
    Look up the illustration for a word on dailyvocab.com.

    Parameters:

    word:- str, word whose photo page is requested
    timeout:- seconds to wait for the site before requests gives up (default 10)

    Returns: list with the content of every twitter:image meta tag found on
    the page (empty list when there is none)
    '''
    page_url = '{}{}{}'.format('https://dailyvocab.com/photos/', word, '/')
    # A timeout keeps one unresponsive request from hanging the whole scrape.
    page = requests.get(page_url, timeout=timeout)
    tree = html.fromstring(page.content)
    return tree.xpath('//meta[@name ="twitter:image"]/@content')

In [67]:
def add_set(data, word_list, set_no):
    '''
    Scrape definition, mnemonics and image for every word of a set and add
    them to the table.

    Parameters:

    data:- dataframe with at least a 'words' column
    word_list:- list of str, words belonging to the set
    set_no:- int, set number stored in the 'Set' column as 'Set <set_no>'

    Returns: data itself when every word is already present, otherwise a new
    dataframe with one extra row per word that was missing. Words already in
    data['words'] are not scraped again, so a partially filled set does not
    produce duplicate rows.
    '''
    known_words = set(data['words'])
    pending = [word for word in word_list if word not in known_words]
    if not pending:
        return data

    # Collect rows in a list and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copies the whole frame on every call.
    records = []
    for word in pending:
        definition, mnemonics = get_def_mnemo(word)
        image_url = get_image(word)

        print(word)
        print('Definition: {}'.format(definition))
        for index, mnemo in enumerate(mnemonics):
            print('Mnemonic {}: {}'.format(index+1, mnemo))
        print("Image url: {}".format(image_url))
        print('#'*125)

        # Pause between words to avoid hammering the scraped sites.
        time.sleep(2)
        records.append({'words':word,'definition':definition,'mnemonics':mnemonics,'image_url':image_url, 'Set':'Set '+str(set_no)})
    return pd.concat([data, pd.DataFrame(records)], ignore_index=True)

In [75]:
# Scrape sets 1 to 4 in order, feeding the growing table back into add_set.
for set_no in range(1, 5):
    current_words = list(get_my_set(words, set_no))
    ankified = add_set(ankified, current_words, set_no)

In [72]:
# Display the table after the scraping loop.
ankified

Unnamed: 0,words,definition,mnemonics,image_url,Set
0,abound,(verb) be abundant or plentiful; exist in larg...,['Bounded means limited. Abound means not limi...,[],Set 1
1,amorphous,(adj) having no definite form or distinct shape,"[""In Greek 'morphe' means shape/form.\r\nSo, a...",[],Set 1
2,austere,(adj) severely simple,['AUS-australia TERE-tear(sorrow). like the re...,[],Set 1
3,belie,(verb) be in contradiction with,"['belie has lie.', 'belie ~ lie means somethin...",[],Set 1
4,capricious,(adj) changeable,"[""ca(CAR) + PRIC(PRICE)...PRICE OF cars nowada...",[],Set 1
...,...,...,...,...,...
115,punctilious,(adj) marked by precise accordance with details,[it is very close to word PUNCTUAL...and you p...,[],Set 4
116,recondite,(adj) difficult to penetrate; incomprehensible...,"[read recondite as ""re conduct"". The professor...",[https://dailyvocab.com/wp-content/uploads/201...,Set 4
117,scrupulous,(adj) having scruples; arising from a sense of...,[it makes u think of sculpture and while build...,[],Set 4
118,tranquil,(adj) (of a body of water) free from disturban...,"[it is like tank fill,when a tank is filled ev...",[],Set 4


In [119]:
def _first_image_url(value):
    '''Reduce one image_url cell to a single URL string, or None when there is no image.'''
    if isinstance(value, list):
        # List values: keep the first URL, or None for an empty list.
        return value[0] if value else None
    if isinstance(value, str):
        # String values such as "['https://...']": remove '[', ']' and the
        # quote character (') from both ends of image_url.
        cleaned = value.strip('[]\'')
        # A stringified empty list ('[]') strips down to '', which is treated
        # as "no image" exactly like a real empty list.
        return cleaned or None
    return value


# Single pass over the column; re-running this cell leaves it unchanged.
ankified['image_url'] = ankified['image_url'].apply(_first_image_url)

In [120]:
# Count how often each image URL occurs in the image_url column.
ankified['image_url'].value_counts()

https://dailyvocab.com/wp-content/uploads/2013/09/Abate.jpg           1
https://dailyvocab.com/wp-content/uploads/2015/01/ambivalent-1.png    1
https://dailyvocab.com/wp-content/uploads/2020/03/puerile.gif         1
https://dailyvocab.com/wp-content/uploads/2020/03/Castigate.gif       1
https://dailyvocab.com/wp-content/uploads/2013/08/vacillate.jpg       1
https://dailyvocab.com/wp-content/uploads/2013/10/diffidence.jpg      1
https://dailyvocab.com/wp-content/uploads/2015/07/contrite.jpg        1
https://dailyvocab.com/wp-content/uploads/2020/01/Advocate.gif        1
https://dailyvocab.com/wp-content/uploads/2020/02/avaricious.gif      1
https://dailyvocab.com/wp-content/uploads/2020/10/Deference.gif       1
https://dailyvocab.com/wp-content/uploads/2020/03/Placate.gif         1
https://dailyvocab.com/wp-content/uploads/2013/10/emulate.jpg         1
https://dailyvocab.com/wp-content/uploads/2019/08/Flout.jpg           1
https://dailyvocab.com/wp-content/uploads/2020/07/burgeoning-1.g

In [117]:
# Save the table to the same CSV file that the load cell reads.
# NOTE(review): index = None is presumably treated like index=False (row index
# not written) — confirm against the pandas to_csv documentation.
ankified.to_csv('anki_mnemonics.csv', index = None)

0                                                   None
1                                                   None
2                                                   None
3                                                   None
4                                                   None
                             ...                        
115                                                 None
116    https://dailyvocab.com/wp-content/uploads/2013...
117                                                 None
118                                                 None
119    https://dailyvocab.com/wp-content/uploads/2013...
Name: image_url, Length: 120, dtype: object

# Requests and BS4

In [29]:
# Fetch a page to experiment with BeautifulSoup and show its HTTP status code.
# NOTE(review): image_url is never assigned at notebook level in the visible
# cells (only as a local inside functions), so this cell seems to rely on
# leftover kernel state and would fail on a fresh Restart & Run All — confirm.
page = requests.get(image_url)
page.status_code

200

In [30]:
from bs4 import BeautifulSoup

In [31]:
# Parse the fetched page text with Python's built-in HTML parser.
soup = BeautifulSoup(page.text, 'html.parser')

In [32]:
# List every <p> element on the page.
soup.find_all('p')

[<p class="site-title">
 <a href="https://dailyvocab.com/">
 
 </a>
 </p>,
 <p class="site-title">
 <a href="https://dailyvocab.com/">
 
 </a>
 </p>,
 <p><strong>Obsequious [<spa

In [33]:
# Text of the third <p> element (index 2), with markup removed.
soup.find_all('p')[2].get_text()

'Obsequious\xa0[uh b-see-kwee-uh s]'

In [14]:
# print(soup.prettify())