

<img src="https://dle.rae.es/app/doc/es/img/dle.jpg"
    style="width:100px; float: right; margin: 0 40px 40px 40px;"></img>

<hr style="margin-bottom: 40px;">

# Data analysis with RAE words and definitions
<hr style="margin-bottom: 1px;">

## 1. Theoretical introduction
####      1.1 Libraries and modules used

* Pandas: Data analysis

* NumPy: Efficient numerical computation

* Matplotlib: Data representation

* Requests and Urllib: Obtain the HTML from web-pages

* BeautifulSoup: Parse the obtained HTML from web-pages

* LXML (module): Process the HTML code

## 2 How does the program work?
#### 2.1. The words list

#### 2.2. Obtaining the data from each word
The words are obtained from: https://www.listapalabras.com/

We use a function called word_data(word) that extracts all the data from the dictionary entry of the word.

    Definitions URL: https://dle.rae.es/{'word'}
#### 2.3. Type of data extracted from each word
* Different entries
* Etymological origin
* Grammatical word type
* Definitions




## 3 The program:

In [6]:
# All the libraries

from bs4 import Tag, NavigableString, BeautifulSoup
import requests
from urllib.request import Request, urlopen
import urllib.parse 
import urllib.error
import csv

import time

In [7]:
# Output files for the scraped data.
# Both files are left open on purpose: the scraping cell further down writes
# the rows through `dic_file` / `etm_file` and closes `f` / `g` when it is done.
# They are opened with newline='' as the csv module requires for csv.writer;
# without it every row is followed by an empty line on Windows.

# Every row in the dictionary file contains:
#   |word|article_i|def_j|Abbr1; Abbr2; ...|Definition|
f = open('dictionary_RAE_all.csv', 'w', encoding='UTF8', newline='')
dic_file = csv.writer(f)
dic_file.writerow(['Word', 'Article_number', 'Def_number', 'Abbreviation', 'Definition'])

# Every row in the etymology file contains:
#   |word|article_i|Etymological origin|
# (the header text below keeps its original spelling so that files produced by
# earlier runs and anything reading that column name stay compatible)
g = open('etymology_RAE_all.csv', 'w', encoding='UTF8', newline='')
etm_file = csv.writer(g)
etm_file.writerow(['Word', 'Article Num.', 'Ethimological origin'])

40

In [8]:
# Request settings used by the scraping cell (nothing is downloaded here).

# Base address of the dictionary; the looked-up word is appended to it.
url = "https://dle.rae.es/"

# Browser-like User-Agent string sent with every request.
UA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"


In [9]:
# Load the words to be analysed.
# `words` is a list of CSV rows (each row a list of strings); the scraping
# cell reads the word itself from the first column (word[0]).
with open('words_all_list.csv', encoding="utf8", newline='') as csv_file:
    # csv.reader returns blank lines as empty rows; those (and rows whose
    # first column is empty) are dropped because there is no word to look up
    # and word[0] would otherwise raise IndexError in the scraping loop.
    words = [row for row in csv.reader(csv_file) if row and row[0].strip()]

In [None]:
# Scrape every word in `words`: download its dictionary page, then write one
# etymology row per article (when the article has an etymology paragraph) and
# one dictionary row per definition.

MAX_TRIES = 5          # download attempts per word before it is skipped
RETRY_WAIT = 5         # seconds to wait between two attempts
REQUEST_TIMEOUT = 30   # seconds before a stalled request counts as failed


def _fetch_soup(term):
    """Download and parse the dictionary page of `term`.

    Returns the parsed BeautifulSoup document, or None when all MAX_TRIES
    attempts failed with a connection-level error (e.g. HTTP 520).
    """
    req = Request(url + urllib.parse.quote(term), headers={'User-Agent': UA})
    for attempt in range(1, MAX_TRIES + 1):
        try:
            with urlopen(req, timeout=REQUEST_TIMEOUT) as response:
                return BeautifulSoup(response, 'html.parser')
        except OSError as err:
            # URLError/HTTPError, timeouts and connection resets all derive
            # from OSError; anything else is a real bug and must surface.
            if attempt == MAX_TRIES:
                print('Giving up on {!r} after {} attempts: {}'.format(term, MAX_TRIES, err))
                return None
            time.sleep(RETRY_WAIT)


def _node_text(node):
    """Return the visible text of a parsed node (plain string or tag)."""
    if isinstance(node, NavigableString):
        return str(node)
    # get_text() also copes with nested/empty tags, for which `.string` is
    # None and would otherwise end up as the literal text 'None'.
    return node.get_text()


def _etymology_text(paragraph):
    """Build the etymology text from the children of an etymology paragraph.

    Abbreviations are expanded to their `title`, sub/superscript elements are
    left out, every other node contributes its text.
    """
    pieces = []
    for child in paragraph:
        if child.name == 'abbr':
            pieces.append('{} '.format(child.get('title', child.get_text())))
        elif child.name not in ('sup', 'sub'):
            pieces.append(_node_text(child))
    return ''.join(pieces)


def _parse_definition(definition):
    """Return (abbreviations, text) for one definition paragraph."""
    text = ''
    abbrs = []
    for child in definition:
        is_tag = isinstance(child, Tag)
        # `class` is a list in BeautifulSoup and may be missing altogether.
        classes = (child.get('class') or []) if is_tag else []

        if child.name == 'span' and 'h' in classes:
            continue  # usage examples are not part of the definition

        if child.name == 'abbr':
            title = child.get('title', child.get_text())
            abbrs.append(title)
            if title == 'por ejemplo':
                # Everything after "por ejemplo" is an example; stop here.
                # This should be manually revised !!
                break
            text += str(title)
        elif child.name == 'sub':  # subindices
            text += '_{}'.format(child.get_text())
        elif child.name == 'sup':  # superindices
            text += '^{}'.format(child.get_text())
        elif child.name == 'a':  # synonyms: drop the homograph digits
            text += ''.join(ch for ch in child.get_text(' ') if not ch.isdigit())
        elif (not is_tag or child.has_attr('data-id')
              or child.name == 'i' or classes[:1] == ['u']):
            text += _node_text(child)  # all normal words
    return abbrs, text


begin_time = time.time()
failed_words = []  # words whose page could not be downloaded
n_word = 0

try:
    for n_word, word in enumerate(words, start=1):  # loop for every word to be searched
        term = word[0]
        soup = _fetch_soup(term)

        if soup is None:
            failed_words.append(term)
        else:
            # loop over the different articles of the word
            for i, header in enumerate(soup.find_all('header', class_='f'), start=1):
                article = header.parent

                # The etymology lives in a <p class="n2">, or in a
                # <p class="n3"> when there is no n2 paragraph.
                etm_par = article.find('p', class_='n2')
                if etm_par is None:
                    etm_par = article.find('p', class_='n3')
                if etm_par is not None:
                    # The ETYMOLOGY entry (exactly one row per article)
                    etm_file.writerow([term, 'Article{}'.format(i), _etymology_text(etm_par)])

                # loop over every definition of the article
                for j, definition in enumerate(article.find_all('p', class_=['j', 'j1']), start=1):
                    def_abbr, def_text = _parse_definition(definition)
                    # The DICTIONARY entry
                    dic_file.writerow([term, i, j, '; '.join(def_abbr), def_text.strip()])

        print('Computed: {}%'.format(round(n_word/len(words)*100, 3)), end = "\r")
finally:
    # Close the output files even when the run is interrupted, so the rows
    # written so far are flushed to disk.
    f.close()
    g.close()

end_time = time.time() - begin_time

if failed_words:
    print('Words that could not be downloaded ({}): {}'.format(len(failed_words), ', '.join(failed_words)))
print("Total execution time: {} minutes.".format(end_time/60))

Computed: 51.469%

## 4 Data analysis

In [None]:
# All the libraries

import numpy as np
import pandas as pd
from string import punctuation
    

In [None]:
# Transform the '; '-separated abbreviation strings into lists.

df = pd.read_csv('dictionary_RAE_c.csv')

# A definition without abbreviations is stored as an empty field, which
# read_csv returns as NaN (a float, so `.split` would fail); those cells
# become an empty list. The whole column is assigned at once instead of
# writing cell by cell through chained indexing (df[col][i] = ...), which
# pandas does not guarantee to write back into the frame.
df['Abbreviation'] = pd.Series(
    [cell.split("; ") if isinstance(cell, str) and cell else []
     for cell in df['Abbreviation']],
    index=df.index,
    dtype=object,
)

In [None]:
# Read the abbreviation table; the first row (the header) is left out.

with open('abbreviation_list_v1.csv', newline='', encoding='UTF8') as m:
    reader = csv.reader(m)
    next(reader, None)  # skip the header row, if there is one
    abr_list = [row for row in reader]

def Convert(tup, di):
    """Group two-item rows into the dictionary `di`.

    Every (key, value) pair in `tup` appends `value` to the list stored under
    `key`, creating that list on first use. `di` is modified in place and
    returned.
    """
    for key, value in tup:
        if key not in di:
            di[key] = []
        di[key].append(value)
    return di

# Abbreviation -> list of the values found for it in the abbreviation table.
abbr_dic = Convert(abr_list, {})

In [None]:
# Reformat the df: remove unnecessary abbreviations and tidy up every
# definition (inline the "(‖ ...)" notes, normalise spacing and punctuation).

# Characters stripped from the end of a definition before the final '.'
TRAILING_CHARS = punctuation + ' '


def _is_flagged(flags):
    """Tell whether an abbreviation is marked with 1 in `abbr_dic`.

    `abbr_dic` stores, per abbreviation, the list of strings read from the
    CSV, so the flag is compared as text rather than as the integer 1.
    NOTE(review): assumes the second CSV column holds 0/1 flags - confirm.
    """
    if not isinstance(flags, (list, tuple)):
        flags = [flags]
    return any(str(flag).strip() == '1' for flag in flags)


def _clean_row(abbrs, definition):
    """Return the cleaned (abbreviation list, definition text) of one row."""
    kept = []
    text = definition
    for abbr in (abbrs if isinstance(abbrs, list) else []):
        if abbr not in abbr_dic:
            kept.append(abbr)            # unknown abbreviation: leave untouched
        elif _is_flagged(abbr_dic[abbr]):
            continue                     # flagged: drop it from the list
        else:
            kept.append(abbr)
            if isinstance(text, str):
                text = text.replace(abbr, '')  # remove its text from the definition

    if not isinstance(text, str):
        return kept, text                # missing definition (NaN): nothing to clean

    # Remove extra spaces
    text = ' '.join(text.split())

    # Format the definition inline
    text = text.replace(' (‖', ',').replace(')', ', ')

    # Remove spare punctuation at the end
    text = text.rstrip(TRAILING_CHARS)

    # Ensure initial uppercase and end with '.'; an empty definition stays empty
    if text:
        text = text[0].upper() + text[1:] + '.'
    return kept, text


cleaned_rows = [_clean_row(abbrs, definition)
                for abbrs, definition in zip(df['Abbreviation'], df['Definition'])]

# Whole-column assignment instead of chained indexing (df[col][i] = ...),
# which pandas does not guarantee to write back into the frame.
df['Abbreviation'] = pd.Series([abbrs for abbrs, _ in cleaned_rows], index=df.index, dtype=object)
df['Definition'] = pd.Series([text for _, text in cleaned_rows], index=df.index, dtype=object)


In [None]:
# Store the cleaned table and preview its first 51 rows.
df.to_csv('dictionary_RAE_c_df.csv', encoding='utf-8', index=False)
df.head(51)