# Letter Counter - Contador de letras


Un pequeño notebook para contar la frecuencia de las letras en el castellano (o en cualquier idioma, siempre que le pases la lista de letras en un CSV).

Esto que estás leyendo es un Jupyter notebook. Puedes utilizarlo como documento o como herramienta ejecutable (que permite modificaciones de forma interactiva). Para saber más sobre cómo ejecutarlo, mira la documentación en [jupyter.org](https://jupyter.org/) y revisa el Makefile.


### Loading Internal Assets

You can ignore this part.


In [1]:
import unidecode
from collections import defaultdict

# Loading libraries
import pandas as pd
import matplotlib

### Configuration - Configuración

If you want to change settings, like data sources.

---

Si necesitas cambiar la configuración, como los acentos o la fuente de datos.

In [2]:

# Accent handling: True keeps "i" and "í" as two distinct letters,
# False folds both into the same letter "i".
# Left at False because the exercises do not use accented letters either.
accent_dif = False

# Excel workbooks holding the word lists (one per range of word lengths).
wordlist_fileSHORT = r"../data/Anexo I - palabras 3-9 letras.xls"
wordlist_fileLONG = r"../data/Anexo II - palabras 10-16 letras.OLD.xls"

### Statistics - Estadísticas

Restart the counters

---

Reinicia los contadores

In [3]:
## Clear up the statistics

## GLOBAL variables
# Total number of words counted so far.
word_sum = 0
# Absolute count of each letter; a letter not seen yet reads as 0.
# `int` is the idiomatic zero factory (it replaces `lambda: 0`).
letter_frequency = defaultdict(int)
# Share of each letter, as a percentage of all the counted letters.
letter_freq_percent = defaultdict(int)
 

## Obtain the list of usable words of 5 and 6 letters

Using pandas and matplotlib.

--- 

Utilizando pandas y matplotlib.



In [None]:
# Read the 5- and 6-letter word sheets from the original workbook.
## SLOW: parses the Excel file once per sheet. Run carefully.

# One pandas DataFrame per sheet; the sheets are named after the word length.
words5letters, words6letters = (
    pd.read_excel(wordlist_fileSHORT, sheet_name=sheet_label)
    for sheet_label in ("5", "6")
)

# Keep only the word and its lexical frequency.
w5l, w6l = (
    pd.DataFrame(sheet_frame, columns=["PALABRA", "FRECUENC"])
    for sheet_frame in (words5letters, words6letters)
)
w6l

### Counting the letters into the frequency database 

To build the statistics about how frequent each letter is.

In [None]:
## Obtain the full database
## SLOW: every sheet of both workbooks is parsed. Run carefully.

# Only these two columns are kept from each sheet.
word_columns = ["PALABRA", "FRECUENC"]

# (workbook, word lengths); each sheet is requested by its length as a string.
sheet_sources = [
    (wordlist_fileSHORT, range(3, 10)),   # sheets "3" to "9", inclusive
    (wordlist_fileLONG, range(10, 17)),   # sheets "10" to "16", inclusive
]

# Collect the per-sheet frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop copied every row
# accumulated so far on each iteration.
frames = []
for workbook, lengths in sheet_sources:
    for i in lengths:
        sheety = pd.read_excel(workbook, sheet_name=f"{i}")
        frames.append(pd.DataFrame(sheety, columns=word_columns))

# As with the original append chain, each sheet keeps its own row labels,
# so index values repeat across sheets.
all_words = pd.concat(frames)


all_words


In [None]:
## Count the word into the stats D

def is_vowel(letter):
    """Return True if ``letter`` is a lowercase vowel, ignoring accents.

    The letter is passed through ``unidecode`` first, so "á" counts as a
    vowel.  The result must be exactly one of a, e, i, o, u: the previous
    substring test (``in "aeiou"``) also accepted the empty string and
    multi-character runs such as "ae".
    """
    return unidecode.unidecode(letter) in ("a", "e", "i", "o", "u")

def no_accent(letter):
    """Return ``letter`` as transliterated by ``unidecode`` (e.g. "á" -> "a")."""
    plain = unidecode.unidecode(letter)
    return plain

# Sanity checks for is_vowel: plain vowel, accented vowel, consonant.
for sample, expected in (("a", True), ("á", True), ("v", False)):
    assert is_vowel(sample) == expected

# Sanity checks for no_accent: only the accented letter changes.
for sample, expected in (("a", "a"), ("á", "a"), ("v", "v")):
    assert no_accent(sample) == expected
    
def count_word(word, letter_frequency=None, accent_dif=None):
    """Add one word to the letter statistics.

    Increments the global ``word_sum`` and adds one to the count of every
    letter of ``word`` in ``letter_frequency``.

    Args:
        word: The word to count.  Float values are skipped (the notebook
            reports "fake nan" entries coming from the Excel import).
        letter_frequency: Mapping letter -> count to update.  Defaults to the
            global ``letter_frequency`` as it is when the call is made, so
            re-running the cell that resets the counters takes effect
            without redefining this function.
        accent_dif: True to count accented letters separately; False to fold
            accented vowels into the plain vowel (other letters, such as ñ,
            are kept as they are).  Defaults to the global ``accent_dif``,
            also read when the call is made.

    Returns:
        None.

    Raises:
        TypeError: If ``word`` is neither a float nor iterable.  Nothing is
            counted in that case.
    """
    global word_sum
    # The parameters shadow the globals of the same name, so the current
    # global values have to be fetched through globals().
    if letter_frequency is None:
        letter_frequency = globals()["letter_frequency"]
    if accent_dif is None:
        accent_dif = globals()["accent_dif"]

    # Skip float cells (NaN); isinstance also covers float subclasses.
    if isinstance(word, float):
        return None

    # Work out every key before touching any counter, so that a failure here
    # (e.g. a non-iterable word) leaves word_sum and the counts untouched.
    keys = [
        no_accent(letter) if (not accent_dif and is_vowel(letter)) else letter
        for letter in word
    ]

    # Count the word, then each of its letters.
    word_sum += 1
    for key in keys:
        letter_frequency[key] += 1
    return None


In [None]:
### DEBUG: error checking — which words are missing from the new data set.

filename = "../data/ListaTodasPalabras.csv"
# Per-letter counter, number of words read, and the words themselves.
d = defaultdict(lambda: 0)
total = 0
palabras = []

with open(filename) as fh:
    # One word per line: count the word, then each of its letters.
    for raw_line in fh:
        if raw_line == "PALABRAS\n":
            continue  # skip the header line
        total += 1
        palabras.append(raw_line.strip())
        # Letters are counted on the unidecode-transliterated line.
        for letter in unidecode.unidecode(raw_line).strip():
            d[letter] += 1


print("Letter frequency from least to most")
# Rebuild the counter as a dict ordered by ascending count.
s = dict(sorted(d.items(), key=lambda pair: pair[1]))


for item, val in s.items():
    print(f"{item} : {val}")


def is_not_in(word, palabras=palabras):
    """Return ``word`` when it is absent from ``palabras``, otherwise None.

    ``palabras`` defaults to the word list read from the CSV file, bound at
    the moment this function is defined.
    """
    return None if word in palabras else word
    
    
# A word expected in the CSV list, and one that cannot be there.
assert is_not_in("aba") is None
assert is_not_in("somethingrandom") == "somethingrandom"

################ Counting the words of a hand-pasted CSV

print("Total words counted from CSV", total, sep="\n")

# TODO: There are some issues with importing the excel into pandas. Some entries are fake nan.
# Equals 0: both sources have the same number of records, but some are "nan".
len(all_words) - len(palabras)


In [None]:
# DEBUG: run the counter on a single word (WARNING: changes the global stats)

# all_words is a single DataFrame with PALABRA/FRECUENC columns, so
# all_words[3] looks up a column labelled 3 and raises KeyError.  Pick the row
# by position instead; position 15 falls inside the first sheet loaded (the
# 3-letter words), assuming that sheet has more than 15 rows.
p = all_words.PALABRA.iloc[15]
f = all_words.FRECUENC.iloc[15]

print(p)
print(f)

# Show the counters before and after counting the word.
print("Stadisticas antes", word_sum )
print("Stadisticas antes", letter_frequency )
count_word(p)
print("Stadisticas despues", word_sum )
print("Stadisticas despues", letter_frequency )

In [None]:
# Run the counter over every word.
## Builds the frequency table. Run once: a second run counts every word again.

# Words of all_words for which is_not_in() returns a truthy value.
palabras_not_in = []


def build_letter_freq_table():
    """Count every word of ``all_words.PALABRA`` into the global statistics.

    Words for which ``is_not_in`` returns a truthy value are collected in
    ``palabras_not_in``.  A TypeError raised while handling a word is printed
    together with the word and its type, and the loop moves on.
    """
    for entry in all_words.PALABRA:
        try:
            # Add each letter of the word to the stats dictionary.
            count_word(entry)
            missing = is_not_in(entry)
            if missing:
                palabras_not_in.append(entry)
        except TypeError as err:
            for detail in (err, entry, type(entry), "WTF"):
                print(detail)


build_letter_freq_table()
letter_frequency




In [None]:
# DEBUG: boilerplate to run something over all the words

# NOTE(review): this empties the list of missing words; if the count of
# missing words is still needed, rebuild it after running this cell.
palabras_not_in = []

def loop_runner():
    """Print the neighbours of every word that ``is_not_in`` reports missing.

    For each such word, the entries labelled ``idx - 1`` and ``idx + 1`` in
    ``all_words.PALABRA`` are printed.  Collecting the word into
    ``palabras_not_in`` is disabled in this debug variant.
    """
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (label, value) pairs.
    for idx, word in all_words.PALABRA.items():
        try:
            if is_not_in(word):
                try:
                    print("after", all_words.PALABRA.get(idx - 1))
                    print("before", all_words.PALABRA.get(idx + 1))
                except Exception:
                    # Best-effort debug output: a neighbour that cannot be
                    # fetched is skipped.  Narrowed from a bare `except:` so
                    # KeyboardInterrupt still stops the loop.
                    pass
        except TypeError as e:
            print(e)
            print(word)
            print(type(word))
            print("WTF")
    return None

loop_runner()
letter_frequency




In [None]:
# todelete: another way to run something over all the words.
# Trying to get missing words.

# Series.iteritems() was removed in pandas 2.0; items() replaces it.
for idx, word in all_words.PALABRA.items():
    if word == "abejorreo":
        print("YAY")
    # The try/except that used to follow wrapped only a commented-out print,
    # which is a syntax error (a try block needs at least one statement), so
    # the empty wrapper was dropped.  The disabled statement was:
    #     print("after", all_words.PALABRA.get(idx - 1))

# Presumably another word to look for: "anatomopatólogo"

In [None]:
# TMP notes: how to merge two DataFrames.
# `palabras` is a plain list, which has no merge(); wrap it in a DataFrame
# whose column is named like the one in all_words so merge() joins on PALABRA.
df1 = pd.DataFrame({"PALABRA": palabras})
df2 = all_words  # all the words
# indicator=True adds the `_merge` column telling which side each row came from.
merged = df1.merge(df2, indicator=True, how='outer')
# Rows found only in all_words.
merged[merged['_merge'] == 'right_only']

In [None]:
# RUN: show the current statistics, one "label value" line each.
for label, value in (
    ("letter_frequency", letter_frequency),
    ("word_sum", word_sum),
    ("missing words", len(palabras_not_in)),
):
    print(label, value)


In [None]:
# We have the frequency table; next we need phi, i.e. the phi of a word.
# Phi needs the percentage frequency of each letter.

# Add up the counts of all the letters.
tot_letters = sum(letter_frequency.values())  # all the counted letters

print(tot_letters)


In [None]:
# Calculate the percentage of each letter's frequency

# letter_freq_percent = defaultdict(lambda:0) #reset it for testing
for letter, freq in letter_frequency.items():
    letter_freq_percent[letter] = (freq * 100) / tot_letters

print(letter_freq_percent)

# Validate that the percentages add up to 100.
# Floating-point rounding means the sum is rarely exactly 100.0, so compare
# with a small tolerance instead of ==.
check = sum(letter_freq_percent.values())
assert abs(check - 100.0) < 1e-9, f"percentages add up to {check}, expected 100"
    

In [None]:

def calculate_phi(word, freq_percent=None):
    """Return phi: the sum of the percentage frequencies of the letters of ``word``.

    Args:
        word: Iterable of letters (normally a string).
        freq_percent: Mapping letter -> percentage.  Defaults to the global
            ``letter_freq_percent`` table.

    Returns:
        The sum of the percentages; 0 for an empty word.  Letters missing
        from the table add 0.
    """
    table = letter_freq_percent if freq_percent is None else freq_percent
    # .get() instead of []: indexing a defaultdict inserts every unknown
    # letter as a new 0 entry, so a read would silently grow the table.
    # NOTE(review): letters are looked up as-is.  count_word folds accented
    # vowels into the plain vowel when accent_dif is False, so an accented
    # vowel here adds 0 — confirm whether words should be normalised first.
    return sum(table.get(letter, 0) for letter in word)

# Spot checks; the expected figures depend on the frequency table built above.
assert round(calculate_phi("aa"), 4) == round(31.511069684740857, 4)
assert calculate_phi("") == 0
# Print the computed phi next to a reference value for two sample words.
for sample in ("busto", "casa"):
    print(round(calculate_phi(sample), 4), round(23.90546695497452, 4))


In [None]:
# Split each word into the list of its letters.
# NOTE(review): `df` is not defined in the visible part of this notebook —
# confirm where it comes from before running this cell.
splits = df.apply(lambda row: list(row[0].strip()), axis='columns')
splits

## TXT


# WIP Do not read forward



In [None]:
# Number of strokes per letter (plain a-z only).
rasgos = {
    "a": 3, "b": 4, "c": 2, "d": 3, "e": 3, "f": 4, "g": 4,
    "h": 4, "i": 3, "j": 3, "k": 5, "l": 2, "m": 6, "n": 4,
    "o": 3, "p": 4, "q": 3, "r": 3, "s": 3, "t": 4, "u": 4,
    "v": 4, "w": 6, "x": 4, "y": 5, "z": 5,
}

In [None]:
## Trying to calculate Phi
## Sum of all the % Frequency of each letter in the word

def get_phi(letter):
    """Placeholder: every letter is worth 1 for now."""
    return 1


def calc_phi(word=None):
    """Return the phi value of ``word``.

    Phi is meant to be the sum of the frequencies of the letters of the word;
    with the current ``get_phi`` placeholder that is one per letter.
    Returns 0 when ``word`` is None.
    """
    if word is None:
        return 0
    return sum(get_phi(letter) for letter in word)



In [None]:
# Print each result next to the value written beside it in the notebook.
for sample, reference in (("aa", 199), ("", 0)):
    print(calc_phi(sample), reference)
print(calc_phi(), 0)  # no argument: exercises the default
print(calc_phi("abadia"))

In [None]:
## Trying to calculate Stroke sum RHO
## Sum of all the strokes of each letter in a word

def get_rho(letter):
    """Return the stroke count of ``letter`` from the ``rasgos`` table.

    Raises KeyError for a letter that is not in the table.
    """
    strokes = rasgos[letter]
    return strokes


def calc_rho(word=None):
    """Return the Rho value of ``word``: the sum of the strokes of its letters.

    Returns 0 when ``word`` is None.
    """
    if word is None:
        return 0
    return sum(get_rho(letter) for letter in word)



In [None]:
# Print each computed Rho next to its expected value.
for sample, expected in (("aa", 6), ("", 0)):
    print(calc_rho(sample), expected)
print(calc_rho(), 0)  # no argument: exercises the default
print(calc_rho("peluca"), 18)