# **Generating Spanish Proto-lexicon Stimuli**

In [2]:
import pandas as pd 
import re
import numpy as np

# **Step 1 — Dataset Identification**

**Identify Spanish words that are common to SPALEX and Wuggy, and put in a new file together with their freq (from SPALEX).**

In [2]:
spalex = pd.read_csv('spalex.csv') #store spalex data
spalex

Unnamed: 0,spelling,count_total,percent_total,prevalence_total,count_nts,percent_nts,prevalence_nts,count_ntl,percent_ntl,prevalence_ntl,freq,zipf
0,ababa,296,9.121622,-1.308782,157,12.101911,-1.151278,139,5.755396,-1.538375,0.107222,2.030284
1,ababol,285,17.894737,-0.907172,144,22.222222,-0.755415,141,13.475177,-1.087517,0.032492,1.511776
2,abacá,308,12.012987,-1.155570,166,12.048193,-1.153868,142,11.971831,-1.157563,0.042239,1.625714
3,abacería,312,41.025641,-0.224578,168,43.452381,-0.163205,144,38.194444,-0.297284,0.048737,1.687859
4,abacero,347,36.311239,-0.346506,184,43.478261,-0.162554,163,28.220859,-0.569859,0.035741,1.553167
...,...,...,...,...,...,...,...,...,...,...,...,...
44848,zurrón,282,79.787230,0.823519,133,93.984960,1.517669,149,67.114090,0.438338,0.435386,2.638874
44849,zurullo,326,69.325150,0.499593,175,96.000000,1.699633,151,38.410600,-0.291682,0.006498,0.812780
44850,zurumbático,350,22.571430,-0.743937,186,15.053760,-1.019292,164,31.097560,-0.487743,0.003249,0.511750
44851,zutana,271,37.638380,-0.311738,138,39.855070,-0.254473,133,35.338350,-0.372260,0.012997,1.113843


In [3]:
wuggy = pd.read_csv("wuggy.txt", sep=" ", header=None, names=["words"]) #store wuggy data
wuggy

Unnamed: 0,words
0,abad
1,abadejo
2,abadesa
3,abadía
4,abajo
...,...
31485,únicamente
31486,único
31487,útero
31488,útil


In [46]:
#create a new dataframe with common words from spalex and wuggy and their frequency values
common_words = spalex.merge(wuggy, left_on='spelling', right_on='words', how='inner')
common_words.drop(common_words.columns[1:10], axis=1, inplace=True)
common_words.drop(common_words.columns[2:4], axis=1, inplace=True)
common_words

Unnamed: 0,spelling,freq
0,ábaco,3.587065
1,abad,13.048597
2,abadejo,0.172205
3,abadesa,1.689559
4,abadía,7.739482
...,...,...
26053,zurriburri,0.006498
26054,zurrido,0.016246
26055,zurrón,0.435386
26056,zurullo,0.006498


In [5]:
common_words.to_csv("spalex_wuggy_common_words") #saves common words from wuggy and spalex as a csv file

# **Step 2 — English Filtering**

**Filter out Spanish words that are identical to English words (with or without accents).**

In [6]:
english = pd.read_csv("english_lexicon.txt", sep=" ", header=None, names=["words"]) #store english_lexicon data
english

Unnamed: 0,words
0,a
1,aachen
2,aalborg
3,aalborg
4,aardvark
...,...
119349,zulu
119350,zulus
119351,zurich
119352,zygote


In [7]:
#replace spanish accents with non-accented letters for merging purposes since wuggy does not output accents
filtered_spanish = common_words.copy()
replacements = {"á" : "a",
                "é" : "e",
                "í" : "i",
                "ó" : "o",
                "ú" : "u",
                "ü" : "u",
                "ñ" : "n"}
for index in range(0, len(filtered_spanish['spelling'])):
    new_string = ''
    for character in filtered_spanish['spelling'][index]:
        if character in replacements.keys():
            new_character = replacements[character]
            new_string += new_character
        else:
            new_string += character
        filtered_spanish['spelling'][index] = new_string 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_spanish['spelling'][index] = new_string


In [8]:
filtered_spanish['spanish'] = common_words['spelling'] #adds the actual spanish spellings to dataframe
filtered_spanish

Unnamed: 0,spelling,freq,spanish
0,abaco,3.587065,ábaco
1,abad,13.048597,abad
2,abadejo,0.172205,abadejo
3,abadesa,1.689559,abadesa
4,abadia,7.739482,abadía
...,...,...,...
26053,zurriburri,0.006498,zurriburri
26054,zurrido,0.016246,zurrido
26055,zurron,0.435386,zurrón
26056,zurullo,0.006498,zurullo


In [9]:
#creates a dataframe of spanish words not in the english cognate database
spanish_not_in_english = filtered_spanish[~filtered_spanish['spelling'].isin(english['words'])]
spanish_not_in_english.drop(spanish_not_in_english.columns[0], axis=1, inplace=True)
spanish_not_in_english 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spanish_not_in_english.drop(spanish_not_in_english.columns[0], axis=1, inplace=True)


Unnamed: 0,freq,spanish
0,3.587065,ábaco
1,13.048597,abad
2,0.172205,abadejo
3,1.689559,abadesa
4,7.739482,abadía
...,...,...
26053,0.006498,zurriburri
26054,0.016246,zurrido
26055,0.435386,zurrón
26056,0.006498,zurullo


In [10]:
#saves common words from spalex and wuggy that don't have english matches and their frequencies into a csv file
spanish_not_in_english.to_csv("common_spanish_words_not_in_english") 

In [11]:
#exports just the words to a csv file to be plugged in to wuggy
dataframe_for_wuggy = spanish_not_in_english['spanish']
dataframe_for_wuggy.to_csv("dataframe_for_wuggy", index=False, header=False)

# **Step 3 — Wuggy**

**Create 10 candidate nonwords for each word using Wuggy.**

# **Step 4 — Orthographic Length Filtering**

**Filter out nonword candidates with orthographic length mismatches to the word.**

In [12]:
wuggy_output = pd.read_csv("wuggy_output_csv.csv")
wuggy_output

Unnamed: 0,Word,Match
0,abaco,adiso
1,abaco,asiso
2,abaco,apiso
3,abaco,agiso
4,abaco,acasa
...,...,...
227262,zutano,nucaro
227263,zutano,nucaco
227264,zutano,nucato
227265,zutano,nuvaro


In [13]:
#without the ñ replacement
re_add_filtered_spanish = common_words.copy()
replacements = {"á" : "a",
                "é" : "e",
                "í" : "i",
                "ó" : "o",
                "ú" : "u",
                "ü" : "u"}
for index in range(0, len(re_add_filtered_spanish['spelling'])):
    new_string = ''
    for character in re_add_filtered_spanish['spelling'][index]:
        if character in replacements.keys():
            new_character = replacements[character]
            new_string += new_character
        else:
            new_string += character
        re_add_filtered_spanish['spelling'][index] = new_string   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  re_add_filtered_spanish['spelling'][index] = new_string


In [48]:
#adds the actual spanish words to the dataframe
re_add_filtered_spanish['spanish'] = common_words['spelling']
re_add_filtered_spanish

Unnamed: 0,spelling,freq,spanish
0,abaco,3.587065,ábaco
1,abad,13.048597,abad
2,abadejo,0.172205,abadejo
3,abadesa,1.689559,abadesa
4,abadia,7.739482,abadía
...,...,...,...
26053,zurriburri,0.006498,zurriburri
26054,zurrido,0.016246,zurrido
26055,zurron,0.435386,zurrón
26056,zurullo,0.006498,zurullo


In [15]:
new_wuggy_output = wuggy_output.copy()
replacements = {"xx" : "x",
                "Ã¼" : "ü"}
for index in range(0, len(new_wuggy_output['Word'])):
    new_string = ''
    for character in new_wuggy_output['Word'][index]:
        if character in replacements.keys():
            new_character = replacements[character]
            new_string += new_character
        else:
            new_string += character
        new_wuggy_output['Word'][index] = new_string
for index in range(0, len(new_wuggy_output['Match'])):
    new_string = ''
    for character in new_wuggy_output['Match'][index]:
        if character in replacements.keys():
            new_character = replacements[character]
            new_string += new_character
        else:
            new_string += character
        new_wuggy_output['Match'][index] = new_string  

In [32]:
new_wuggy_output

Unnamed: 0,Word,Match
0,abaco,adiso
1,abaco,asiso
2,abaco,apiso
3,abaco,agiso
4,abaco,acasa
...,...,...
227262,zutano,nucaro
227263,zutano,nucaco
227264,zutano,nucato
227265,zutano,nuvaro


In [33]:
#drop duplicates to avoid issues with joining since wuggy does not account for accents
new_wuggy_output = new_wuggy_output.drop_duplicates()
new_wuggy_output

Unnamed: 0,Word,Match
0,abaco,adiso
1,abaco,asiso
2,abaco,apiso
3,abaco,agiso
4,abaco,acasa
...,...,...
227262,zutano,nucaro
227263,zutano,nucaco
227264,zutano,nucato
227265,zutano,nuvaro


In [34]:
# replace "Ã¼" with "ü" and "xx" with "x" to fix wuggy generation errors
new_wuggy_output['Word'] = new_wuggy_output['Word'].apply(lambda x: x.replace("Ã¼", "ü").replace("xx", "x"))
new_wuggy_output['Match'] = new_wuggy_output['Match'].apply(lambda x: x.replace("Ã¼", "ü").replace("xx", "x"))
new_wuggy_output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_wuggy_output['Word'] = new_wuggy_output['Word'].apply(lambda x: x.replace("Ã¼", "ü").replace("xx", "x"))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_wuggy_output['Match'] = new_wuggy_output['Match'].apply(lambda x: x.replace("Ã¼", "ü").replace("xx", "x"))


Unnamed: 0,Word,Match
0,abaco,adiso
1,abaco,asiso
2,abaco,apiso
3,abaco,agiso
4,abaco,acasa
...,...,...
227262,zutano,nucaro
227263,zutano,nucaco
227264,zutano,nucato
227265,zutano,nuvaro


In [24]:
def remove_accents(word):
    """Removes accents from vowels and other special Spanish characters (ü and ñ) 
    from a word. Returns the word without accents in lowercase.
    """
    replacements = {
        "á": "a",
        "é": "e",
        "í": "i",
        "ó": "o",
        "ú": "u",
        "ü": "u"
    }

    word = word.lower()

    for (accent, no_accent) in replacements.items():
        word = word.replace(accent, no_accent)

    return word

In [25]:
spalex = pd.read_csv("spalex.csv")
spalex["spanish"] = spalex["spelling"].apply(remove_accents)
spalex[["spelling", "spanish"]]

Unnamed: 0,spelling,spanish
0,ababa,ababa
1,ababol,ababol
2,abacá,abaca
3,abacería,abaceria
4,abacero,abacero
...,...,...
44848,zurrón,zurron
44849,zurullo,zurullo
44850,zurumbático,zurumbatico
44851,zutana,zutana


In [42]:
merged_data = new_wuggy_output.merge(spalex, left_on='Word', right_on='spanish', how='inner')

In [43]:
merged_data = merged_data[['spelling', 'Match', 'spanish']]
merged_data

Unnamed: 0,spelling,Match,spanish
0,ábaco,adiso,abaco
1,ábaco,asiso,abaco
2,ábaco,apiso,abaco
3,ábaco,agiso,abaco
4,ábaco,acasa,abaco
...,...,...,...
227653,zutano,nucaro,zutano
227654,zutano,nucaco,zutano
227655,zutano,nucato,zutano
227656,zutano,nuvaro,zutano


In [38]:
new_wuggy_output.to_csv("new_wuggy_output.csv")

In [39]:
merged_data.to_csv("merged_data.csv")

In [44]:
merged_data.to_csv("merged_data.csv")

In [45]:
final_wuggy_spanish = merged_data.copy()
for index in range(0, (len(final_wuggy_spanish["Match"]))):
    if len(final_wuggy_spanish["Match"][index]) != len(final_wuggy_spanish["spanish"][index]):
        final_wuggy_spanish = final_wuggy_spanish.drop([index])
final_wuggy_spanish

Unnamed: 0,spelling,Match,spanish
0,ábaco,adiso,abaco
1,ábaco,asiso,abaco
2,ábaco,apiso,abaco
3,ábaco,agiso,abaco
4,ábaco,acasa,abaco
...,...,...,...
227653,zutano,nucaro,zutano
227654,zutano,nucaco,zutano
227655,zutano,nucato,zutano
227656,zutano,nuvaro,zutano


In [46]:
final_wuggy_spanish = final_wuggy_spanish.reset_index(drop=True)

In [47]:
final_wuggy_spanish

Unnamed: 0,spelling,Match,spanish
0,ábaco,adiso,abaco
1,ábaco,asiso,abaco
2,ábaco,apiso,abaco
3,ábaco,agiso,abaco
4,ábaco,acasa,abaco
...,...,...,...
225774,zutano,nucaro,zutano
225775,zutano,nucaco,zutano
225776,zutano,nucato,zutano
225777,zutano,nuvaro,zutano


In [48]:
final_wuggy_spanish.to_csv("result.csv")

# **Step 5 — Wuggy Output Accent Addition**

**Copy across accents on vowels from words to nonword candidates.**

In [49]:
final_wuggy_spanish

Unnamed: 0,spelling,Match,spanish
0,ábaco,adiso,abaco
1,ábaco,asiso,abaco
2,ábaco,apiso,abaco
3,ábaco,agiso,abaco
4,ábaco,acasa,abaco
...,...,...,...
225774,zutano,nucaro,zutano
225775,zutano,nucaco,zutano
225776,zutano,nucato,zutano
225777,zutano,nuvaro,zutano


In [50]:
def get_word_shape(word):
    """Returns the shape of the word, i.e. all consonants and vowels are converted
    into C and V, respectively.
    """
    word = word.lower()
    word = re.sub("[^aáeéiíoóuúü]", "C", word)
    word = re.sub("[aáeéiíoóuúü]", "V", word)
    return word

def get_accent_index(word):
    """Returns a tuple of the first accented character in the word, along with its 
    index. If the word has no accented characters, return the tuple (None, -1).
    """
    match = re.search("[áéíóúü]", word)

    if match is not None:
        return (match.group(0), match.start())
    else:
        return (None, -1)

def add_accent(vowel):
    """Returns a lowercase vowel with an accent."""
    if vowel == "a":
        return "á"
    elif vowel == "e":
        return "é"
    elif vowel == "i":
        return "í"
    elif vowel == "o":
        return "ó"
    elif vowel == "u":
        return "ú"

def copy_accent(real, pseudo):
    """Copies an accent from a word to a nonword. See 'Copying accents' section
    below for more details.
    """
    # get accent character and index
    accent_char = get_accent_index(real)[0]
    accent_idx = get_accent_index(real)[1]

    # CASE: return unchanged psuedoword if no accent
    if accent_idx == -1:
        return pseudo
    
    # get word shapes
    real_shape = get_word_shape(real)
    pseudo_shape = get_word_shape(pseudo)

    # CASE: same vowel index
    if pseudo_shape[accent_idx] == "V":

        # CASE: dotted u case
        if accent_char == "ü" and pseudo[accent_idx] == "u":
            accent = "ü"
        else:
            accent = add_accent(pseudo[accent_idx])

        # add accent in pseudoword
        pseudo_accent = pseudo[:accent_idx] + accent + pseudo[accent_idx + 1:]
        return pseudo_accent
    
    try:
        # find which vowel the accent occurs (i.e. 1st, 2nd, 3rd, ... vowel)
        vowel_idx = [pattern.start(0) for pattern in re.finditer("V", real_shape)].index(accent_idx)

        # corresponding vowel in pseudoword
        replace_idx = [pattern.start(0) for pattern in re.finditer("V", pseudo_shape)][vowel_idx]

        # add accent in pseudoword
        accent = add_accent(pseudo[replace_idx])
        pseudo_accent = pseudo[:replace_idx] + accent + pseudo[replace_idx + 1:]

        return pseudo_accent
    
    except IndexError:
        # CASE: specific to 'interviú' since the code above doesn't copy accents to psuedowords
        if real[-2:] == "iú":
            accent = add_accent(pseudo[-2])
            pseudo_accent = pseudo[:-2] + accent + pseudo[-1]
            return pseudo_accent
        
        # if all else fails, return the original pseudoword
        return pseudo

def copy_accent_df(input_path, acc_col, no_acc_col, output_path):
    """Copies accent from words to nonwords in a dataframe."""
    accent_df = pd.read_csv(input_path)
    accent_df["Match"] = accent_df.apply(lambda x: copy_accent(x[acc_col], x[no_acc_col]), axis=1)

    with open(output_path, "w", newline="") as out_file:
        accent_df.to_csv(out_file, index=False)
    
    return accent_df


In [53]:
copy_accent_df(input_path="result.csv",
               acc_col="spelling",
               no_acc_col="Match",
               output_path="result_accents.csv")

Unnamed: 0.1,Unnamed: 0,spelling,Match,spanish
0,0,ábaco,ádiso,abaco
1,1,ábaco,ásiso,abaco
2,2,ábaco,ápiso,abaco
3,3,ábaco,ágiso,abaco
4,4,ábaco,ácasa,abaco
...,...,...,...,...
225774,225774,zutano,nucaro,zutano
225775,225775,zutano,nucaco,zutano
225776,225776,zutano,nucato,zutano
225777,225777,zutano,nuvaro,zutano


In [56]:
final_wuggy_spanish = pd.read_csv('result_accents.csv')
final_wuggy_spanish = final_wuggy_spanish.drop('Unnamed: 0', axis = 1)
final_wuggy_spanish

Unnamed: 0,spelling,Match,spanish
0,ábaco,ádiso,abaco
1,ábaco,ásiso,abaco
2,ábaco,ápiso,abaco
3,ábaco,ágiso,abaco
4,ábaco,ácasa,abaco
...,...,...,...
225774,zutano,nucaro,zutano
225775,zutano,nucaco,zutano
225776,zutano,nucato,zutano
225777,zutano,nuvaro,zutano


In [57]:
final_wuggy_spanish.to_csv("result_accents.csv")

# **Step 6 — Phonlogical Conversion and Length Filtering**

**Convert words and nonwords to phonological forms and filter out nonwords with phonological length mismatches.**

In [58]:
final_wuggy_spanish['spanish'].to_csv('6-orth-real-words.txt', sep='\t', index=False, header=False)

In [59]:
final_wuggy_spanish['Match'].to_csv('6-orth-pseudowords.txt', sep='\t', index=False, header=False)

In [3]:
#reading real word phonological output from eSpeakNG
phon_real = pd.read_csv("6-phon-real-words.txt", header=None)
phon_real.rename(columns = {0: "phon_spanish"}, inplace = True)
phon_real = phon_real['phon_spanish'].str.replace(' ', '')
phon_real = pd.DataFrame(phon_real)
phon_real

Unnamed: 0,phon_spanish
0,aBako
1,aBako
2,aBako
3,aBako
4,aBako
...,...
225774,sutano
225775,sutano
225776,sutano
225777,sutano


In [4]:
#reading nonword candidate phonological output from eSpeakNG
phon_pseudo = pd.read_csv("6-phon-pseudowords.txt", header = None)
phon_pseudo.rename(columns = {0: "phon_match"}, inplace = True)
phon_pseudo = phon_pseudo['phon_match'].str.replace(' ', '')
phon_pseudo = pd.DataFrame(phon_pseudo)
phon_pseudo

Unnamed: 0,phon_match
0,aDiso
1,asiso
2,apiso
3,axiso
4,akasa
...,...
225774,nukaRo
225775,nukako
225776,nukato
225777,nuBaRo


In [8]:
final_wuggy_spanish = pd.read_csv('result_accents.csv')
final_wuggy_spanish = final_wuggy_spanish.drop("Unnamed: 0", axis = 1)
wsp = final_wuggy_spanish.copy()
wsp["phon_spanish"] = phon_real["phon_spanish"]
wsp["phon_match"] = phon_pseudo["phon_match"]
wsp = wsp.rename(columns={"Match":"match"})
wsp

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match
0,ábaco,ádiso,abaco,aBako,aDiso
1,ábaco,ásiso,abaco,aBako,asiso
2,ábaco,ápiso,abaco,aBako,apiso
3,ábaco,ágiso,abaco,aBako,axiso
4,ábaco,ácasa,abaco,aBako,akasa
...,...,...,...,...,...
225774,zutano,nucaro,zutano,sutano,nukaRo
225775,zutano,nucaco,zutano,sutano,nukako
225776,zutano,nucato,zutano,sutano,nukato
225777,zutano,nuvaro,zutano,sutano,nuBaRo


In [9]:
wsp.to_csv("wsp.csv")

In [10]:
#Step 6 continued
wsp = wsp[wsp["phon_spanish"].str.len() == wsp["phon_match"].str.len()]
wsp

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match
0,ábaco,ádiso,abaco,aBako,aDiso
1,ábaco,ásiso,abaco,aBako,asiso
2,ábaco,ápiso,abaco,aBako,apiso
3,ábaco,ágiso,abaco,aBako,axiso
4,ábaco,ácasa,abaco,aBako,akasa
...,...,...,...,...,...
225774,zutano,nucaro,zutano,sutano,nukaRo
225775,zutano,nucaco,zutano,sutano,nukako
225776,zutano,nucato,zutano,sutano,nukato
225777,zutano,nuvaro,zutano,sutano,nuBaRo


In [11]:
wsp = wsp.reset_index(drop=True)

In [12]:
wsp

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match
0,ábaco,ádiso,abaco,aBako,aDiso
1,ábaco,ásiso,abaco,aBako,asiso
2,ábaco,ápiso,abaco,aBako,apiso
3,ábaco,ágiso,abaco,aBako,axiso
4,ábaco,ácasa,abaco,aBako,akasa
...,...,...,...,...,...
182608,zutano,nucaro,zutano,sutano,nukaRo
182609,zutano,nucaco,zutano,sutano,nukako
182610,zutano,nucato,zutano,sutano,nukato
182611,zutano,nuvaro,zutano,sutano,nuBaRo


# **Step 7 — Phonological Shape Filtering**

**Filter out nonword candidates that don't have the same shape as the word (e.g. shape of velon = CVCVC; based on phonological forms).**

In [13]:
new_string = ""
match_list = []
spanish_list = []
error1 = ""
error2 = ""
index_error = ""
consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'ñ', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z',
              'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'Ñ', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z',]
vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü',
          'A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ü']

for index in range(0, len(wsp["phon_match"])):
    word = (wsp["phon_match"][index])
    if len(word) == 0:
        index_error = (index)
    for letter in word:
        if letter in consonants:
            new_string += "C"
        elif letter in vowels:
            new_string += "V"
        else: 
            error1 += letter
    match_list.append(new_string)
    new_string = ""
    
for index in range(0, len(wsp["phon_spanish"])):
    word = (wsp["phon_spanish"][index])
    if len(word) == 0:
        index_error = index
    for letter in word:
        if letter in consonants:
            new_string += "C"
        elif letter in vowels:
            new_string += "V"
        else: 
            error2 += letter
    spanish_list.append(new_string)
    new_string = ""

In [14]:
wsp["shape_phon_spanish"] = spanish_list
wsp["shape_phon_match"] = match_list
wsp

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match,shape_phon_spanish,shape_phon_match
0,ábaco,ádiso,abaco,aBako,aDiso,VCVCV,VCVCV
1,ábaco,ásiso,abaco,aBako,asiso,VCVCV,VCVCV
2,ábaco,ápiso,abaco,aBako,apiso,VCVCV,VCVCV
3,ábaco,ágiso,abaco,aBako,axiso,VCVCV,VCVCV
4,ábaco,ácasa,abaco,aBako,akasa,VCVCV,VCVCV
...,...,...,...,...,...,...,...
182608,zutano,nucaro,zutano,sutano,nukaRo,CVCVCV,CVCVCV
182609,zutano,nucaco,zutano,sutano,nukako,CVCVCV,CVCVCV
182610,zutano,nucato,zutano,sutano,nukato,CVCVCV,CVCVCV
182611,zutano,nuvaro,zutano,sutano,nuBaRo,CVCVCV,CVCVCV


In [15]:
wsp = wsp[wsp["shape_phon_spanish"] == wsp["shape_phon_match"]]
wsp

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match,shape_phon_spanish,shape_phon_match
0,ábaco,ádiso,abaco,aBako,aDiso,VCVCV,VCVCV
1,ábaco,ásiso,abaco,aBako,asiso,VCVCV,VCVCV
2,ábaco,ápiso,abaco,aBako,apiso,VCVCV,VCVCV
3,ábaco,ágiso,abaco,aBako,axiso,VCVCV,VCVCV
4,ábaco,ácasa,abaco,aBako,akasa,VCVCV,VCVCV
...,...,...,...,...,...,...,...
182608,zutano,nucaro,zutano,sutano,nukaRo,CVCVCV,CVCVCV
182609,zutano,nucaco,zutano,sutano,nukako,CVCVCV,CVCVCV
182610,zutano,nucato,zutano,sutano,nukato,CVCVCV,CVCVCV
182611,zutano,nuvaro,zutano,sutano,nuBaRo,CVCVCV,CVCVCV


In [16]:
wsp.to_csv("wsp.csv")

In [17]:
wsp = wsp.reset_index(drop=True)
wsp

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match,shape_phon_spanish,shape_phon_match
0,ábaco,ádiso,abaco,aBako,aDiso,VCVCV,VCVCV
1,ábaco,ásiso,abaco,aBako,asiso,VCVCV,VCVCV
2,ábaco,ápiso,abaco,aBako,apiso,VCVCV,VCVCV
3,ábaco,ágiso,abaco,aBako,axiso,VCVCV,VCVCV
4,ábaco,ácasa,abaco,aBako,akasa,VCVCV,VCVCV
...,...,...,...,...,...,...,...
167615,zutano,nucaro,zutano,sutano,nukaRo,CVCVCV,CVCVCV
167616,zutano,nucaco,zutano,sutano,nukako,CVCVCV,CVCVCV
167617,zutano,nucato,zutano,sutano,nukato,CVCVCV,CVCVCV
167618,zutano,nuvaro,zutano,sutano,nuBaRo,CVCVCV,CVCVCV


# **Step 8 — Nonword English Filtering**
**Filter out nonwords that are identical to English words (with or without accents).**

In [18]:
english = pd.read_csv('english_lexicon.txt', header = None)
english.columns = ['words']
english

Unnamed: 0,words
0,a
1,aachen
2,aalborg
3,aalborg
4,aardvark
...,...
119349,zulu
119350,zulus
119351,zurich
119352,zygote


In [138]:
def get_word_df(file, col_name="spelling"):
    """Takes a text file with a word on each line and returns the words in a 
    one-column dataframe.
    """
    with open(file) as f:
        words = [line.rstrip("\n") for line in f]

    words_df = pd.DataFrame(words, columns=[col_name])

    return words_df

def get_common_words(spalex_file, wuggy_file, output_path):
    """Reads both SPALEX and Wuggy files and outputs a CSV file with words 
    common to both files, as well as their frequencies.
    """
    spalex = pd.read_csv(spalex_file)
    wuggy = get_word_df(wuggy_file)

    common_words = spalex.merge(wuggy, on="spelling", how="inner")
    common_words = common_words[["spelling", "freq"]]

    with open(output_path, "w", newline="") as out_file:
        common_words.to_csv(out_file, index=False)

    return common_words

def filter_out_english(spanish_file, english_file, span_col, eng_col, output_path):
    """Filters out English words from a dataframe containing Spanish words, with or
    without accents.
    """
    span = pd.read_csv(spanish_file)
    eng = get_word_df(english_file)

    # remove accents from Spanish and English words
    span["no_accent"] = span[span_col].apply(remove_accents)
    eng["no_accent"] = eng[eng_col].apply(remove_accents)

    # left anti-join (matches in Spanish and English are removed)
    filtered_df = span[~span["no_accent"].isin(eng["no_accent"])]
    filtered_df = filtered_df.reset_index(drop=True)
    filtered_df = filtered_df.drop(["no_accent"], axis=1)

    with open(output_path, "w", newline="") as out_file:
        filtered_df.to_csv(out_file, index=False)

    return filtered_df

def remove_accents(word):
    """Removes accents from vowels and other special Spanish characters (ü and ñ) 
    from a word. Returns the word without accents in lowercase.
    """
    replacements = {
        "á": "a",
        "é": "e",
        "í": "i",
        "ó": "o",
        "ú": "u",
        "ü": "u"
    }

    word = word.lower()

    for (accent, no_accent) in replacements.items():
        word = word.replace(accent, no_accent)

    return word

def series_to_txt(input_path, col_name, output_path):
    """Converts a Series to a text file, with each row being left-justified."""
    series = pd.read_csv(input_path)[col_name]

    with open(output_path, "w", newline="") as out_file:
        series.to_csv(out_file, index=False, header=False)

In [20]:
# filter out English words from Spanish nonwords
filter_out_english(spanish_file="wsp.csv",
                   english_file="english_lexicon.txt",
                   span_col="match",
                   eng_col="spelling",
                   output_path="non-english-nonwords.csv")

Unnamed: 0.1,Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match,shape_phon_spanish,shape_phon_match
0,0,ábaco,ádiso,abaco,aBako,aDiso,VCVCV,VCVCV
1,1,ábaco,ásiso,abaco,aBako,asiso,VCVCV,VCVCV
2,2,ábaco,ápiso,abaco,aBako,apiso,VCVCV,VCVCV
3,3,ábaco,ágiso,abaco,aBako,axiso,VCVCV,VCVCV
4,4,ábaco,ácasa,abaco,aBako,akasa,VCVCV,VCVCV
...,...,...,...,...,...,...,...,...
166660,182608,zutano,nucaro,zutano,sutano,nukaRo,CVCVCV,CVCVCV
166661,182609,zutano,nucaco,zutano,sutano,nukako,CVCVCV,CVCVCV
166662,182610,zutano,nucato,zutano,sutano,nukato,CVCVCV,CVCVCV
166663,182611,zutano,nuvaro,zutano,sutano,nuBaRo,CVCVCV,CVCVCV


In [22]:
nenw = pd.read_csv('non-english-nonwords.csv')
nenw = nenw.drop('Unnamed: 0', axis = 1)
nenw

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match,shape_phon_spanish,shape_phon_match
0,ábaco,ádiso,abaco,aBako,aDiso,VCVCV,VCVCV
1,ábaco,ásiso,abaco,aBako,asiso,VCVCV,VCVCV
2,ábaco,ápiso,abaco,aBako,apiso,VCVCV,VCVCV
3,ábaco,ágiso,abaco,aBako,axiso,VCVCV,VCVCV
4,ábaco,ácasa,abaco,aBako,akasa,VCVCV,VCVCV
...,...,...,...,...,...,...,...
166660,zutano,nucaro,zutano,sutano,nukaRo,CVCVCV,CVCVCV
166661,zutano,nucaco,zutano,sutano,nukako,CVCVCV,CVCVCV
166662,zutano,nucato,zutano,sutano,nukato,CVCVCV,CVCVCV
166663,zutano,nuvaro,zutano,sutano,nuBaRo,CVCVCV,CVCVCV


In [23]:
wsp_test = nenw.copy()
wsp_test
wsp_test['phon_spanish'] = wsp_test['phon_spanish'].apply(lambda x: ' '.join(list(x)))
wsp_test['phon_match'] = wsp_test['phon_match'].apply(lambda x: ' '.join(list(x)))
wsp_test.to_csv("wsp.csv")

In [24]:
wsp_test

Unnamed: 0,spelling,match,spanish,phon_spanish,phon_match,shape_phon_spanish,shape_phon_match
0,ábaco,ádiso,abaco,a B a k o,a D i s o,VCVCV,VCVCV
1,ábaco,ásiso,abaco,a B a k o,a s i s o,VCVCV,VCVCV
2,ábaco,ápiso,abaco,a B a k o,a p i s o,VCVCV,VCVCV
3,ábaco,ágiso,abaco,a B a k o,a x i s o,VCVCV,VCVCV
4,ábaco,ácasa,abaco,a B a k o,a k a s a,VCVCV,VCVCV
...,...,...,...,...,...,...,...
166660,zutano,nucaro,zutano,s u t a n o,n u k a R o,CVCVCV,CVCVCV
166661,zutano,nucaco,zutano,s u t a n o,n u k a k o,CVCVCV,CVCVCV
166662,zutano,nucato,zutano,s u t a n o,n u k a t o,CVCVCV,CVCVCV
166663,zutano,nuvaro,zutano,s u t a n o,n u B a R o,CVCVCV,CVCVCV


In [25]:
wsp_test["phon_spanish"].to_csv('phon_spanish.txt', sep='\t', index=False, header=False)
wsp_test["phon_match"].to_csv('phon_match.txt', sep='\t', index=False, header=False)

# **Step 9 — Word-Based Phonotactic Scoring**
**Score words and candidate nonwords according to word-based phonotactics. Script `phonotactics/score-unparsed.sh` will give you a *logprob* for every word; *score* is `logprob / (length + 1)`, where *length* is based on phonological form. For training file, use `train/words.txt`.**

In [29]:
# read file with real words
real = wsp_test.loc[:, "spelling"]

# read file with real word scores
scored_real = pd.read_csv("words_spanish.csv").rename(
    columns={"item": "real", "logprob": "logprob_real"}
)

# combine into one dataframe
scored_real_words = pd.concat([scored_real, real], axis=1).iloc[:, [2, 0, 1]]

# calculate score = logprob / (length + 1)
scored_real_words["score_real"] = scored_real_words["logprob_real"] / (scored_real_words["real"].str.len() + 1)

###

# read file with pseudowords
pseudo = wsp_test.loc[:, "match"]

# read file with pseudoword scores
scored_pseudo = pd.read_csv("words_match.csv").rename(
    columns={"item": "pseudo", "logprob": "logprob_pseudo"}
)

# combine into one dataframe
scored_pseudo_words = pd.concat([scored_pseudo, pseudo], axis=1).iloc[:, [2, 0, 1]]

# calculate score = logprob / (length + 1)
scored_pseudo_words["score_pseudo"] = scored_pseudo_words["logprob_pseudo"] / (scored_pseudo_words["pseudo"].str.len() + 1)
scored_pseudo_words

# combine scores for real words and pseudowords in 1 dataframe
srilm_output = pd.concat([scored_real_words, scored_pseudo_words], axis=1)

#absolute value of differences of scores
srilm_output["score_diff"] = abs(srilm_output["score_real"] - srilm_output["score_pseudo"])

#output
srilm_output.to_csv("srilm_output", index=False)
srilm_output

Unnamed: 0,spelling,real,logprob_real,score_real,match,pseudo,logprob_pseudo,score_pseudo,score_diff
0,ábaco,aBako,-5.869267,-0.978211,ádiso,aDiso,-6.433591,-1.072265,0.094054
1,ábaco,aBako,-5.869267,-0.978211,ásiso,asiso,-6.316546,-1.052758,0.074547
2,ábaco,aBako,-5.869267,-0.978211,ápiso,apiso,-5.903620,-0.983937,0.005726
3,ábaco,aBako,-5.869267,-0.978211,ágiso,axiso,-6.554572,-1.092429,0.114218
4,ábaco,aBako,-5.869267,-0.978211,ácasa,akasa,-5.711278,-0.951880,0.026331
...,...,...,...,...,...,...,...,...,...
166660,zutano,sutano,-7.307310,-1.043901,nucaro,nukaRo,-7.979513,-1.139930,0.096029
166661,zutano,sutano,-7.307310,-1.043901,nucaco,nukako,-9.415088,-1.345013,0.301111
166662,zutano,sutano,-7.307310,-1.043901,nucato,nukato,-8.159762,-1.165680,0.121779
166663,zutano,sutano,-7.307310,-1.043901,nuvaro,nuBaRo,-8.505691,-1.215099,0.171197


# **Step 10 — Word-Based Phonotactic Score Filtering**

**Filter candidate nonwords according to word-based phonotactic score mismatches; filter out if the absolute difference in scores is > 0.2.**

In [30]:
filtered_srilm = srilm_output[srilm_output["score_diff"] <= 0.2]
filtered_srilm.to_csv('filtered_srilm')
filtered_srilm

Unnamed: 0,spelling,real,logprob_real,score_real,match,pseudo,logprob_pseudo,score_pseudo,score_diff
0,ábaco,aBako,-5.869267,-0.978211,ádiso,aDiso,-6.433591,-1.072265,0.094054
1,ábaco,aBako,-5.869267,-0.978211,ásiso,asiso,-6.316546,-1.052758,0.074547
2,ábaco,aBako,-5.869267,-0.978211,ápiso,apiso,-5.903620,-0.983937,0.005726
3,ábaco,aBako,-5.869267,-0.978211,ágiso,axiso,-6.554572,-1.092429,0.114218
4,ábaco,aBako,-5.869267,-0.978211,ácasa,akasa,-5.711278,-0.951880,0.026331
...,...,...,...,...,...,...,...,...,...
166658,zutano,sutano,-7.307310,-1.043901,nucino,nusino,-8.017068,-1.145295,0.101394
166659,zutano,sutano,-7.307310,-1.043901,jucino,xusino,-6.665778,-0.952254,0.091647
166660,zutano,sutano,-7.307310,-1.043901,nucaro,nukaRo,-7.979513,-1.139930,0.096029
166662,zutano,sutano,-7.307310,-1.043901,nucato,nukato,-8.159762,-1.165680,0.121779


# **Step 11 — Morfessor**

**Use morfessor in Terminal from `morfessor` dir via: `python morfessor -t train.txt -S segmented.txt --randseed 123 --finish-threshold 0.00005 --progressbar`; convert to a list of unique morphs that occur in any words, and format so that one morph is on each line, with spaces between phonemes (like `phonotactics/train/words.txt`)**

# **Step 12 — Morph-Based Phonotactic Scoring**

**Score candidate words and nonwords according to morph-based phonotactics (`phontactics/score-parsed.sh`); `score = logprob / (length + 1)` just like Step 9.**

In [32]:
def get_unique_morphs(input_path, output_path):
    """Takes in a file of words separated by morphs (outputted by Morfessor) and
    returns a .txt file containing each unique morph on its own line, with spaces
    between phonemes.
    """
    # get unique morphs
    with open(input_path) as in_file:
        next(in_file) # skip first line
        morphs = [line.strip("1 ").strip("\n").split(" + ") for line in in_file]
        unique_morphs = {item for m in morphs for item in m}

    # space between each phonemes
    with open(output_path, "w") as out_file:
        morphs_list = list(unique_morphs)
        morphs_sep = [" ".join(list(morph)) for morph in morphs_list]
        out_file.write("\n".join(morphs_sep))
        
# get unique morphs to use as the training file in the next step
get_unique_morphs(input_path="segmented.txt", 
                  output_path="segmented-unique.txt")

In [32]:
filtered_srilm

Unnamed: 0,spelling,real,logprob_real,score_real,match,pseudo,logprob_pseudo,score_pseudo,score_diff
0,ábaco,aBako,-5.869267,-0.978211,ádiso,aDiso,-6.433591,-1.072265,0.094054
1,ábaco,aBako,-5.869267,-0.978211,ásiso,asiso,-6.316546,-1.052758,0.074547
2,ábaco,aBako,-5.869267,-0.978211,ápiso,apiso,-5.903620,-0.983937,0.005726
3,ábaco,aBako,-5.869267,-0.978211,ágiso,axiso,-6.554572,-1.092429,0.114218
4,ábaco,aBako,-5.869267,-0.978211,ácasa,akasa,-5.711278,-0.951880,0.026331
...,...,...,...,...,...,...,...,...,...
166658,zutano,sutano,-7.307310,-1.043901,nucino,nusino,-8.017068,-1.145295,0.101394
166659,zutano,sutano,-7.307310,-1.043901,jucino,xusino,-6.665778,-0.952254,0.091647
166660,zutano,sutano,-7.307310,-1.043901,nucaro,nukaRo,-7.979513,-1.139930,0.096029
166662,zutano,sutano,-7.307310,-1.043901,nucato,nukato,-8.159762,-1.165680,0.121779


In [239]:
unique_phon_spanish = "+" + filtered_srilm['real'].drop_duplicates() + "+"
unique_phon_spanish = unique_phon_spanish.apply(list).str.join(" ")
unique_phon_spanish.to_csv("unique_phon_spanish.txt", index=False, header=False)
unique_phon_spanish

0             + a B a k o +
10        + a B a D e x o +
17        + a B a D e s a +
26          + a B a D i a +
27            + a B a x o +
                ...        
104310        + s u R e o +
104311          + s u r a +
104313        + s u r o n +
104317      + s u R u j o +
104319      + s u t a n o +
Name: real, Length: 18657, dtype: object

In [241]:
unique_phon_match = "+" + filtered_srilm['pseudo'].drop_duplicates() + "+"
unique_phon_match = unique_phon_match.apply(list).str.join(" ")
unique_phon_match.to_csv("unique_phon_match.txt", index=False, header=False)
unique_phon_match

0           + a D i s o +
1           + a s i s o +
2           + a p i s o +
3           + a x i s o +
4           + a k a s a +
               ...       
104314      + k u j o n +
104317    + n u k u r o +
104318    + n u k u C o +
104320    + x u s i n o +
104323    + n u B a R o +
Name: pseudo, Length: 88297, dtype: object

In [35]:
filtered_srilm

Unnamed: 0,spelling,real,logprob_real,score_real,match,pseudo,logprob_pseudo,score_pseudo,score_diff
0,ábaco,aBako,-5.869267,-0.978211,ádiso,aDiso,-6.433591,-1.072265,0.094054
1,ábaco,aBako,-5.869267,-0.978211,ásiso,asiso,-6.316546,-1.052758,0.074547
2,ábaco,aBako,-5.869267,-0.978211,ápiso,apiso,-5.903620,-0.983937,0.005726
3,ábaco,aBako,-5.869267,-0.978211,ágiso,axiso,-6.554572,-1.092429,0.114218
4,ábaco,aBako,-5.869267,-0.978211,ácasa,akasa,-5.711278,-0.951880,0.026331
...,...,...,...,...,...,...,...,...,...
166658,zutano,sutano,-7.307310,-1.043901,nucino,nusino,-8.017068,-1.145295,0.101394
166659,zutano,sutano,-7.307310,-1.043901,jucino,xusino,-6.665778,-0.952254,0.091647
166660,zutano,sutano,-7.307310,-1.043901,nucaro,nukaRo,-7.979513,-1.139930,0.096029
166662,zutano,sutano,-7.307310,-1.043901,nucato,nukato,-8.159762,-1.165680,0.121779


In [276]:
morph_fs = filtered_srilm.copy()
morph_spanish = pd.read_csv('morph_spanish.csv')
morph_match = pd.read_csv('morph_match.csv')
morph_fs = morph_fs.drop('logprob_real', axis = 1)
morph_fs = morph_fs.drop('logprob_pseudo', axis = 1)
morph_fs = morph_fs.drop('score_real', axis = 1)
morph_fs = morph_fs.drop('score_pseudo', axis = 1)
morph_fs = morph_fs.drop('score_diff', axis = 1)
morph_fs

Unnamed: 0,spelling,real,match,pseudo
0,ábaco,aBako,ádiso,aDiso
1,ábaco,aBako,ásiso,asiso
2,ábaco,aBako,ápiso,apiso
3,ábaco,aBako,ágiso,axiso
4,ábaco,aBako,ácasa,akasa
...,...,...,...,...
104319,zutano,sutano,nucino,nusino
104320,zutano,sutano,jucino,xusino
104321,zutano,sutano,nucaro,nukaRo
104322,zutano,sutano,nucato,nukato


In [246]:
morph_fs = morph_fs.merge(morph_spanish,
                          how = "left", 
                          left_on = "real",
                          right_on = "item").merge(morph_match,
                                                   how = "left",
                                                   left_on = "pseudo",
                                                   right_on = "item")
morph_fs

Unnamed: 0,spelling,real,match,pseudo,item_x,logprob_x,item_y,logprob_y
0,ábaco,aBako,ádiso,aDiso,aBako,-4.934408,aDiso,-5.686987
1,ábaco,aBako,ásiso,asiso,aBako,-4.934408,asiso,-5.467200
2,ábaco,aBako,ápiso,apiso,aBako,-4.934408,apiso,-5.591964
3,ábaco,aBako,ágiso,axiso,aBako,-4.934408,axiso,-6.170054
4,ábaco,aBako,ácasa,akasa,aBako,-4.934408,akasa,-5.000271
...,...,...,...,...,...,...,...,...
104319,zutano,sutano,nucino,nusino,sutano,-6.574475,nusino,-6.763511
104320,zutano,sutano,jucino,xusino,sutano,-6.574475,xusino,-6.105624
104321,zutano,sutano,nucaro,nukaRo,sutano,-6.574475,nukaRo,-7.817013
104322,zutano,sutano,nucato,nukato,sutano,-6.574475,nukato,-7.358397


In [247]:
#new dataframe to do scoring and filtering
step_12_13 = morph_fs.copy()

#calculate score
step_12_13["score_spanish"] = step_12_13["logprob_x"] / (step_12_13["real"].str.len() + 1)
step_12_13["score_match"] = step_12_13["logprob_y"] / (step_12_13["pseudo"].str.len() + 1)
step_12_13 = step_12_13.reindex(columns=['spelling', 'real', 'logprob_x', 'score_spanish',
                                             'match', 'pseudo', 'logprob_y', 'score_match'])
#absolute value of differences of scores
step_12_13["morph_score_diff"] = abs(step_12_13["score_spanish"] - step_12_13["score_match"])
step_12_13.to_csv('step_12_13')
step_12_13

Unnamed: 0,spelling,real,logprob_x,score_spanish,match,pseudo,logprob_y,score_match,morph_score_diff
0,ábaco,aBako,-4.934408,-0.822401,ádiso,aDiso,-5.686987,-0.947831,0.125430
1,ábaco,aBako,-4.934408,-0.822401,ásiso,asiso,-5.467200,-0.911200,0.088799
2,ábaco,aBako,-4.934408,-0.822401,ápiso,apiso,-5.591964,-0.931994,0.109593
3,ábaco,aBako,-4.934408,-0.822401,ágiso,axiso,-6.170054,-1.028342,0.205941
4,ábaco,aBako,-4.934408,-0.822401,ácasa,akasa,-5.000271,-0.833379,0.010977
...,...,...,...,...,...,...,...,...,...
104319,zutano,sutano,-6.574475,-0.939211,nucino,nusino,-6.763511,-0.966216,0.027005
104320,zutano,sutano,-6.574475,-0.939211,jucino,xusino,-6.105624,-0.872232,0.066979
104321,zutano,sutano,-6.574475,-0.939211,nucaro,nukaRo,-7.817013,-1.116716,0.177505
104322,zutano,sutano,-6.574475,-0.939211,nucato,nukato,-7.358397,-1.051200,0.111989


# **Step 13 — Morph-Based Phonotactic Score Filtering**

**Filter candidate nonwords according to morph-based phonotactics, following the same criteria as in Step 10.**

In [248]:
final_filtered_srilm = step_12_13[step_12_13["morph_score_diff"] <= 0.2]
final_filtered_srilm = final_filtered_srilm.drop('logprob_x', axis = 1)
final_filtered_srilm = final_filtered_srilm.drop('logprob_y', axis = 1)
final_filtered_srilm = final_filtered_srilm.reset_index()
final_filtered_srilm = final_filtered_srilm.drop('index', axis = 1)

#add frequencies
final_filtered_srilm = final_filtered_srilm.merge(spalex[['freq', 'spelling']], how = 'left', left_on = 'spelling', right_on = 'spelling')
final_filtered_srilm.to_csv('final_filtered_srilm.csv', index = False)
final_filtered_srilm

Unnamed: 0,spelling,real,score_spanish,match,pseudo,score_match,morph_score_diff,freq
0,ábaco,aBako,-0.822401,ádiso,aDiso,-0.947831,0.125430,3.587065
1,ábaco,aBako,-0.822401,ásiso,asiso,-0.911200,0.088799,3.587065
2,ábaco,aBako,-0.822401,ápiso,apiso,-0.931994,0.109593,3.587065
3,ábaco,aBako,-0.822401,ácasa,akasa,-0.833379,0.010977,3.587065
4,ábaco,aBako,-0.822401,ácala,akala,-0.858517,0.036116,3.587065
...,...,...,...,...,...,...,...,...
101601,zurullo,suRujo,-1.021517,nucucho,nukuCo,-1.081825,0.060309,0.006498
101602,zutano,sutano,-0.939211,nucino,nusino,-0.966216,0.027005,0.123468
101603,zutano,sutano,-0.939211,jucino,xusino,-0.872232,0.066979,0.123468
101604,zutano,sutano,-0.939211,nucaro,nukaRo,-1.116716,0.177505,0.123468


# **Step 14 — Strip Accents**

**Strip accents from written forms of words and nonwords. Add spacing between characters of words and nonwords for future scoring.**

In [249]:
next_steps = final_filtered_srilm.copy()
next_steps['spelling_spaced'] = next_steps['spelling'].apply(remove_accents)
next_steps['match_spaced'] = next_steps['match'].apply(remove_accents)
next_steps['spelling_spaced'] = next_steps['spelling_spaced'].apply(lambda x: ' '.join(list(x)))
next_steps['match_spaced'] = next_steps['match_spaced'].apply(lambda x: ' '.join(list(x)))
next_steps.to_csv('next_steps.csv', index = False)
next_steps

Unnamed: 0,spelling,real,score_spanish,match,pseudo,score_match,morph_score_diff,freq,spelling_spaced,match_spaced
0,ábaco,aBako,-0.822401,ádiso,aDiso,-0.947831,0.125430,3.587065,a b a c o,a d i s o
1,ábaco,aBako,-0.822401,ásiso,asiso,-0.911200,0.088799,3.587065,a b a c o,a s i s o
2,ábaco,aBako,-0.822401,ápiso,apiso,-0.931994,0.109593,3.587065,a b a c o,a p i s o
3,ábaco,aBako,-0.822401,ácasa,akasa,-0.833379,0.010977,3.587065,a b a c o,a c a s a
4,ábaco,aBako,-0.822401,ácala,akala,-0.858517,0.036116,3.587065,a b a c o,a c a l a
...,...,...,...,...,...,...,...,...,...,...
101601,zurullo,suRujo,-1.021517,nucucho,nukuCo,-1.081825,0.060309,0.006498,z u r u l l o,n u c u c h o
101602,zutano,sutano,-0.939211,nucino,nusino,-0.966216,0.027005,0.123468,z u t a n o,n u c i n o
101603,zutano,sutano,-0.939211,jucino,xusino,-0.872232,0.066979,0.123468,z u t a n o,j u c i n o
101604,zutano,sutano,-0.939211,nucaro,nukaRo,-1.116716,0.177505,0.123468,z u t a n o,n u c a r o


# **Step 15 — Training Files**

**From the file `data/subtlex-esp_words-and-counts.txt`, take the first column, remove accents, and save this as a training file: one version with spaces between each character and one version without those spaces (both versions will have one word per line). The version with spaces will be replacing the training file in the phonotactics folder, and the version without will be replacing the training file in the morfessor folder.**

In [250]:
subtlex = pd.read_csv("subtlex-esp_words-and-counts.txt", sep = "\t")
subtlex['word_no_accents'] = subtlex["word"].apply(remove_accents)
subtlex['word_no_accents_spaced'] = subtlex['word_no_accents'].apply(lambda x: ' '.join(list(x)))
subtlex.to_csv("subtlex.csv", index = False)
subtlex['word'].to_csv("15_word.txt", index = False, header = False)
subtlex['phones'].to_csv("15_phones.txt", index = False, header = False)
subtlex['word_no_accents_spaced'].to_csv("new_phonotactics_training.txt", index = False, header = False)
subtlex['word_no_accents'].to_csv("new_morfessor_training.txt", index = False, header = False)
subtlex

Unnamed: 0,word,phones,count,word_no_accents,word_no_accents_spaced
0,a,a,965735.0,a,a
1,aarón,aaRon,1040.0,aaron,a a r o n
2,ábaco,aBako,9.0,abaco,a b a c o
3,abad,aBad,48.0,abad,a b a d
4,abadesa,aBaDesa,12.0,abadesa,a b a d e s a
...,...,...,...,...,...
93772,zurro,suro,4.0,zurro,z u r r o
93773,zurró,suro,2.0,zurro,z u r r o
93774,zurrón,suron,2.0,zurron,z u r r o n
93775,zurullo,suRujo,6.0,zurullo,z u r u l l o


# **Step 16 — More Phonotactic Scoring**

**Repeat all the steps about phonotactic scoring (word-based and morph-based) but use these new training files and new words / nonwords (in written not phonological form) to get the scores. These are orthotactic scores; save them to new columns in your dataframe and filter out word-nonword pairs that have large differences in scores.**

## Word-Based Scoring

In [278]:
#real spanish words
step_16_spanish_phono = pd.read_csv("step_16_spanish_phono.csv")
step_16_spanish_phono.columns = ['spanish_phono_strip', 'logprob_spanish_phono']
master_df_spanish_phono = pd.concat([step_16_spanish_phono, next_steps['spelling']], axis = 1).iloc[:, [2, 0, 1]]
master_df_spanish_phono['score_spanish_phono'] = master_df_spanish_phono['logprob_spanish_phono'] / (master_df_spanish_phono["spanish_phono_strip"].str.len() + 1) 
master_df_spanish_phono

Unnamed: 0,spelling,spanish_phono_strip,logprob_spanish_phono,score_spanish_phono
0,ábaco,abaco,-6.118689,-1.019781
1,ábaco,abaco,-6.118689,-1.019781
2,ábaco,abaco,-6.118689,-1.019781
3,ábaco,abaco,-6.118689,-1.019781
4,ábaco,abaco,-6.118689,-1.019781
...,...,...,...,...
101601,zurullo,zurullo,-10.173600,-1.271700
101602,zutano,zutano,-9.314816,-1.330688
101603,zutano,zutano,-9.314816,-1.330688
101604,zutano,zutano,-9.314816,-1.330688


In [280]:
#pseudo-match spanish words
step_16_match_phono = pd.read_csv("step_16_match_phono.csv")
step_16_match_phono.columns = ['match_phono_strip', 'logprob_match_phono']
master_df_match_phono = pd.concat([step_16_match_phono, next_steps['match']], axis = 1).iloc[:, [2, 0, 1]]
master_df_match_phono['score_match_phono'] = master_df_match_phono['logprob_match_phono'] / (master_df_match_phono["match_phono_strip"].str.len() + 1) 
master_df_match_phono

Unnamed: 0,match,match_phono_strip,logprob_match_phono,score_match_phono
0,ádiso,adiso,-6.378505,-1.063084
1,ásiso,asiso,-6.908798,-1.151466
2,ápiso,apiso,-6.241125,-1.040188
3,ácasa,acasa,-6.110122,-1.018354
4,ácala,acala,-5.241534,-0.873589
...,...,...,...,...
101601,nucucho,nucucho,-9.765124,-1.220641
101602,nucino,nucino,-8.245290,-1.177899
101603,jucino,jucino,-9.070348,-1.295764
101604,nucaro,nucaro,-8.258586,-1.179798


In [281]:
#combine spanish and pseudo-match words data
merged_phono = pd.concat([master_df_spanish_phono, master_df_match_phono], axis=1)

#score difference calculation and filtering out scores > |0.2|
merged_phono['phono_score_difference'] = merged_phono["score_spanish_phono"] - merged_phono['score_match_phono']
merged_phono = merged_phono[(merged_phono['phono_score_difference'] <= 0.2) & (merged_phono['phono_score_difference'] >= -0.2)]
merged_phono['phono_score_difference'] = np.abs(merged_phono['phono_score_difference'])
#save and show data
merged_phono.to_csv("merged_phono.csv", index=False)
merged_phono

Unnamed: 0,spelling,spanish_phono_strip,logprob_spanish_phono,score_spanish_phono,match,match_phono_strip,logprob_match_phono,score_match_phono,phono_score_difference
0,ábaco,abaco,-6.118689,-1.019781,ádiso,adiso,-6.378505,-1.063084,0.043303
1,ábaco,abaco,-6.118689,-1.019781,ásiso,asiso,-6.908798,-1.151466,0.131685
2,ábaco,abaco,-6.118689,-1.019781,ápiso,apiso,-6.241125,-1.040188,0.020406
3,ábaco,abaco,-6.118689,-1.019781,ácasa,acasa,-6.110122,-1.018354,0.001428
4,ábaco,abaco,-6.118689,-1.019781,ácala,acala,-5.241534,-0.873589,0.146192
...,...,...,...,...,...,...,...,...,...
101601,zurullo,zurullo,-10.173600,-1.271700,nucucho,nucucho,-9.765124,-1.220641,0.051060
101602,zutano,zutano,-9.314816,-1.330688,nucino,nucino,-8.245290,-1.177899,0.152789
101603,zutano,zutano,-9.314816,-1.330688,jucino,jucino,-9.070348,-1.295764,0.034924
101604,zutano,zutano,-9.314816,-1.330688,nucaro,nucaro,-8.258586,-1.179798,0.150890


## Morph-Based Scoring

In [282]:
#get unique morphs for training data
get_unique_morphs(input_path="step_16_segmented.txt", 
                  output_path="step_16_segmented_unique.txt")

In [283]:
#real spanish words unique morphs
unique_spanish = "+" + merged_phono["spanish_phono_strip"].drop_duplicates() + "+"
unique_spanish = unique_spanish.apply(list).str.join(" ")
unique_spanish.to_csv("step_16_real_spanish_morphs.txt", index=False, header=False)
unique_spanish

0             + a b a c o +
9         + a b a d e j o +
17        + a b a d e s a +
25          + a b a d i a +
26            + a b a j o +
                ...        
101589        + z u r d a +
101591        + z u r d o +
101593        + z u r e o +
101600    + z u r u l l o +
101602      + z u t a n o +
Name: spanish_phono_strip, Length: 17617, dtype: object

In [284]:
#pseudo-match words unique morphs
unique_match = "+" + merged_phono["match_phono_strip"].drop_duplicates() + "+"
unique_match = unique_match.apply(list).str.join(" ")
unique_match.to_csv("step_16_match_spanish_morphs.txt", index=False, header=False)
unique_match

0             + a d i s o +
1             + a s i s o +
2             + a p i s o +
3             + a c a s a +
4             + a c a l a +
                ...        
101591        + z u s c o +
101600    + n u c u r r o +
101601    + n u c u c h o +
101603      + j u c i n o +
101604      + n u c a r o +
Name: match_phono_strip, Length: 79202, dtype: object

In [285]:
step_16_spanish_morph = pd.read_csv("step_16_spanish_morph.csv")
step_16_spanish_morph

Unnamed: 0,item,logprob
0,abaco,-5.359334
1,abadejo,-7.873946
2,abadesa,-6.870398
3,abadia,-6.288518
4,abajo,-5.309474
...,...,...
17612,zurda,-6.649288
17613,zurdo,-6.696099
17614,zureo,-6.863608
17615,zurullo,-9.408513


In [286]:
step_16_match_morph = pd.read_csv("step_16_match_morph.csv")
step_16_match_morph

Unnamed: 0,item,logprob
0,adiso,-5.612483
1,asiso,-5.832738
2,apiso,-5.648479
3,acasa,-5.241584
4,acala,-5.091817
...,...,...
79197,zusco,-6.939099
79198,nucurro,-8.285841
79199,nucucho,-8.859136
79200,jucino,-7.345304


In [287]:
important_merged_phono = merged_phono.loc[:, ["spelling", "spanish_phono_strip", "match", "match_phono_strip"]]

#merge word-based and morphh-based data
database = important_merged_phono.merge(
    step_16_spanish_morph, 
    how="left", 
    left_on="spanish_phono_strip", 
    right_on="item"
).merge(
    step_16_match_morph,
    how="left",
    left_on="match_phono_strip",
    right_on="item"
)

database.columns = ["spanish", "spanish_strip", "match", "match_strip", "item_x", "logprob_spanish", "item_y", "logprob_match"]
database = database.drop("item_x", axis = 1)
database = database.drop("item_y", axis = 1)

#score calculations and filtering
database['score_spanish_morph'] = database['logprob_spanish'] / (database['spanish_strip'].str.len() + 1)
database['score_match_morph'] = database['logprob_match'] / (database['match'].str.len() + 1)
database['score_diff_morph'] = np.abs(database['score_spanish_morph'] - database['score_match_morph'])
database = database[database['score_diff_morph'] <= 0.2]

database = database.iloc[:, [0, 1, 4, 6, 2, 3, 5, 7, 8]]

database.to_csv("database.csv", index=False)
database

Unnamed: 0,spanish,spanish_strip,logprob_spanish,score_spanish_morph,match,match_strip,logprob_match,score_match_morph,score_diff_morph
0,ábaco,abaco,-5.359334,-0.893222,ádiso,adiso,-5.612483,-0.935414,0.042192
1,ábaco,abaco,-5.359334,-0.893222,ásiso,asiso,-5.832738,-0.972123,0.078901
2,ábaco,abaco,-5.359334,-0.893222,ápiso,apiso,-5.648479,-0.941413,0.048191
3,ábaco,abaco,-5.359334,-0.893222,ácasa,acasa,-5.241584,-0.873597,0.019625
4,ábaco,abaco,-5.359334,-0.893222,ácala,acala,-5.091817,-0.848636,0.044586
...,...,...,...,...,...,...,...,...,...
87149,zurullo,zurullo,-9.408513,-1.176064,nucucho,nucucho,-8.859136,-1.107392,0.068672
87150,zutano,zutano,-7.743720,-1.106246,nucino,nucino,-7.085373,-1.012196,0.094050
87151,zutano,zutano,-7.743720,-1.106246,jucino,jucino,-7.345304,-1.049329,0.056917
87152,zutano,zutano,-7.743720,-1.106246,nucaro,nucaro,-7.735436,-1.105062,0.001183


In [288]:
frequencies = spalex.loc[:, ['spelling', 'freq']]
database_numbers = database.loc[:, ["spanish", "match", "score_spanish_morph", "score_match_morph", "score_diff_morph"]].merge(
    frequencies, how = "left", left_on = "spanish", right_on = "spelling")
database_numbers = database_numbers.drop("spelling", axis = 1)
database_numbers = database_numbers.rename(columns={'spanish' : 'spelling'})
database_numbers.to_csv("database_numbers.csv", index=False)
database_numbers

Unnamed: 0,spelling,match,score_spanish_morph,score_match_morph,score_diff_morph,freq
0,ábaco,ádiso,-0.893222,-0.935414,0.042192,3.587065
1,ábaco,ásiso,-0.893222,-0.972123,0.078901,3.587065
2,ábaco,ápiso,-0.893222,-0.941413,0.048191,3.587065
3,ábaco,ácasa,-0.893222,-0.873597,0.019625,3.587065
4,ábaco,ácala,-0.893222,-0.848636,0.044586,3.587065
...,...,...,...,...,...,...
86105,zurullo,nucucho,-1.176064,-1.107392,0.068672,0.006498
86106,zutano,nucino,-1.106246,-1.012196,0.094050,0.123468
86107,zutano,jucino,-1.106246,-1.049329,0.056917,0.123468
86108,zutano,nucaro,-1.106246,-1.105062,0.001183,0.123468


# Step 17 — Categorize Words Into Bins

**There will be 12 bins. Each bin will be for words of length 5, 6, 7, and 8 intersected with the frequency per million — <10, 10-100, and >100 — extracted from ``spalex`` data.**

In [289]:
phono_bins = database_numbers.copy()
phono_bins = phono_bins.merge(filtered_srilm, on=['spelling', 'match'], how = "left")
phono_bins = phono_bins.drop(['logprob_real', 'logprob_pseudo', 'score_spanish_morph', 'score_match_morph', 'score_diff_morph'], axis = 1)
phono_bins = phono_bins.loc[:, ["spelling", "match", "freq", "real", "pseudo", "score_real", "score_pseudo", "score_diff"]]
phono_bins['frequency_bin'] = np.where(phono_bins['freq'] < 10, "<10",
                              np.where(phono_bins['freq'] > 100, ">100",
                              "10-100"))
phono_bins['length'] = phono_bins["real"].str.len()

#filter out words containing less than 5 letters or more than 8 letters
phono_bins = phono_bins[~(phono_bins['length'] < 5) & ~(phono_bins['length'] > 8)]

phono_bins = phono_bins.reset_index()
phono_bins = phono_bins.drop("index", axis = 1)
phono_bins.to_csv("phono_bins.csv", index=False)
phono_bins

Unnamed: 0,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,frequency_bin,length
0,ábaco,ádiso,3.587065,aBako,aDiso,-0.978211,-1.072265,0.094054,<10,5
1,ábaco,ásiso,3.587065,aBako,asiso,-0.978211,-1.052758,0.074547,<10,5
2,ábaco,ápiso,3.587065,aBako,apiso,-0.978211,-0.983937,0.005725,<10,5
3,ábaco,ácasa,3.587065,aBako,akasa,-0.978211,-0.951880,0.026331,<10,5
4,ábaco,ácala,3.587065,aBako,akala,-0.978211,-0.848025,0.130186,<10,5
...,...,...,...,...,...,...,...,...,...,...
50012,zurullo,nucucho,0.006498,suRujo,nukuCo,-1.149236,-1.201352,0.052116,<10,6
50013,zutano,nucino,0.123468,sutano,nusino,-1.043901,-1.145295,0.101394,<10,6
50014,zutano,jucino,0.123468,sutano,xusino,-1.043901,-0.952254,0.091647,<10,6
50015,zutano,nucaro,0.123468,sutano,nukaRo,-1.043901,-1.139930,0.096029,<10,6


In [290]:
#display totals in contingency table
phono_bins_contingency_table = pd.crosstab(phono_bins['frequency_bin'],
                                           phono_bins['length']).iloc[[1, 0, 2], :]
phono_bins_contingency_table

length,5,6,7,8
frequency_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,5387,10561,13384,13515
10-100,1184,1593,1729,1647
>100,307,311,249,150


## Step 18 — Score Difference Sorting

**Within each bin, sort by word-nonword phonotactic score difference from smallest to largest. We want to group together all nonwords that came from the same word, so use the smallest difference between a word and its nonwords to do the sorting. For each word, identify the 3 nonwords that have smallest absolute score difference from the word, and then make a single row in your dataframe which has the word, those 3 nonwords, and the properties (a row will have: ``word``, ``length``, ``freq``, ``word-based phonotactic score`` of word, ``morph-based phonotactic score`` of word, ``word-based orthotactic score`` of word, ``morph-based orthotactic score of word``, and then for each of the 3 nonwords that are closest matched for word-based phonotactic score, have the nonword and its 4 associated scores) of each of the word / nonwords, where the columns for nonwords are sorted such that nonword1 is the one with the smallest score difference.**

In [32]:
phono_bins = phono_bins.sort_values(['spelling', 'score_diff'])
phono_bins['smallest_score_diff'] = phono_bins.groupby("spelling")['score_diff'].transform("min")
phono_bins

Unnamed: 0,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,frequency_bin,length,smallest_score_diff
10,abadejo,amasema,0.172205,aBaDexo,amasema,-1.200831,-1.119467,0.081364,<10,7,0.081364
9,abadejo,amasepa,0.172205,aBaDexo,amasepa,-1.200831,-1.110703,0.090128,<10,7,0.081364
11,abadejo,amaseva,0.172205,aBaDexo,amaseBa,-1.200831,-1.062042,0.138789,<10,7,0.081364
13,abadejo,amaseba,0.172205,aBaDexo,amaseBa,-1.200831,-1.062042,0.138789,<10,7,0.081364
12,abadejo,amasega,0.172205,aBaDexo,amaseGa,-1.200831,-1.034901,0.165930,<10,7,0.081364
...,...,...,...,...,...,...,...,...,...,...,...
48550,útero,úsena,3.210163,uteRo,usena,-1.137436,-1.158379,0.020943,<10,5,0.020943
48547,útero,úvena,3.210163,uteRo,uBena,-1.137436,-1.172119,0.034683,<10,5,0.020943
48548,útero,úrena,3.210163,uteRo,uRena,-1.137436,-1.193654,0.056218,<10,5,0.020943
48551,útero,úmena,3.210163,uteRo,umena,-1.137436,-1.071215,0.066221,<10,5,0.020943


In [47]:
phono_bins_copy = phono_bins.groupby(['frequency_bin', 'length']).apply(
                  lambda x: x.sort_values(by="smallest_score_diff"))
phono_bins_copy = phono_bins_copy.drop(['frequency_bin', 'length'], axis = 1).reset_index().drop("level_2", axis = 1)
phono_bins_copy

Unnamed: 0,frequency_bin,length,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,smallest_score_diff
0,10-100,5,árabe,álije,39.356987,aRaBe,alixe,-1.079542,-0.967563,0.111979,0.000387
1,10-100,5,árabe,áliñe,39.356987,aRaBe,aliYe,-1.079542,-1.140924,0.061382,0.000387
2,10-100,5,árabe,áciñe,39.356987,aRaBe,asiYe,-1.079542,-1.267486,0.187944,0.000387
3,10-100,5,árabe,ácipe,39.356987,aRaBe,asipe,-1.079542,-1.263933,0.184391,0.000387
4,10-100,5,árabe,ácife,39.356987,aRaBe,asife,-1.079542,-1.257337,0.177795,0.000387
...,...,...,...,...,...,...,...,...,...,...,...
50012,>100,8,presente,tremante,149.428532,pResente,tRemante,-0.604469,-0.782188,0.177719,0.175083
50013,>100,8,presente,trebante,149.428532,pResente,tReBante,-0.604469,-0.779552,0.175083,0.175083
50014,>100,8,izquierda,isguienda,116.823285,iskjeRDa,isGjenda,-1.080117,-1.255786,0.175669,0.175669
50015,>100,8,izquierda,isguierta,116.823285,iskjeRDa,isGjeRta,-1.080117,-1.274886,0.194769,0.175669


In [72]:
phono_bins_copy2 = phono_bins.groupby(['frequency_bin', 'length']).apply(
                  lambda x: x.sort_values(by="smallest_score_diff"))
phono_bins_copy2 = phono_bins_copy2.drop(['frequency_bin', 'length'], axis = 1).reset_index().drop("level_2", axis = 1)
phono_bins_copy2

Unnamed: 0,frequency_bin,length,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,smallest_score_diff
0,10-100,5,árabe,álije,39.356987,aRaBe,alixe,-1.079542,-0.967563,0.111979,0.000387
1,10-100,5,árabe,áliñe,39.356987,aRaBe,aliYe,-1.079542,-1.140924,0.061382,0.000387
2,10-100,5,árabe,áciñe,39.356987,aRaBe,asiYe,-1.079542,-1.267486,0.187944,0.000387
3,10-100,5,árabe,ácipe,39.356987,aRaBe,asipe,-1.079542,-1.263933,0.184391,0.000387
4,10-100,5,árabe,ácife,39.356987,aRaBe,asife,-1.079542,-1.257337,0.177795,0.000387
...,...,...,...,...,...,...,...,...,...,...,...
50012,>100,8,presente,tremante,149.428532,pResente,tRemante,-0.604469,-0.782188,0.177719,0.175083
50013,>100,8,presente,trebante,149.428532,pResente,tReBante,-0.604469,-0.779552,0.175083,0.175083
50014,>100,8,izquierda,isguienda,116.823285,iskjeRDa,isGjenda,-1.080117,-1.255786,0.175669,0.175669
50015,>100,8,izquierda,isguierta,116.823285,iskjeRDa,isGjeRta,-1.080117,-1.274886,0.194769,0.175669


In [73]:
phono_bins_copy2 = phono_bins_copy2[phono_bins_copy2['match'].isin(subtlex_copy['word'])]
phono_bins_copy2 = phono_bins_copy2.reset_index()
phono_bins_copy2.drop("index", axis = 1, inplace = True)
phono_bins_copy2

Unnamed: 0,frequency_bin,length,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,smallest_score_diff
0,10-100,5,notar,vetar,15.706404,notaR,betaR,-0.971992,-0.970225,0.001767,0.001767
1,10-100,5,coste,morte,37.306771,koste,moRte,-0.771991,-0.792317,0.020326,0.004633
2,10-100,5,acaso,avisa,46.998994,akaso,aBisa,-0.952178,-0.877996,0.074182,0.005612
3,10-100,5,busca,luzca,75.562295,buska,luska,-0.910984,-0.957531,0.046547,0.006398
4,10-100,5,fraga,clama,12.479995,fRaGa,klama,-1.082067,-1.045282,0.036784,0.006783
...,...,...,...,...,...,...,...,...,...,...,...
332,>100,5,fuera,viera,300.501136,fweRa,bjeRa,-0.916129,-0.826112,0.090017,0.090017
333,>100,6,medida,segada,170.632503,meDiDa,seGaDa,-0.853793,-0.797864,0.055929,0.001012
334,>100,6,apenas,abetos,116.462629,apenas,aBetos,-0.879684,-0.937857,0.058172,0.037780
335,>100,6,fuerte,vierte,163.055479,fweRte,bjeRte,-0.855577,-0.778420,0.077157,0.077157


In [56]:
#NEW STUFF
subtlex_copy = pd.read_csv("subtlex-esp_words-and-counts.txt", sep = "\t")
subtlex_copy

Unnamed: 0,word,phones,count
0,a,a,965735.0
1,aarón,aaRon,1040.0
2,ábaco,aBako,9.0
3,abad,aBad,48.0
4,abadesa,aBaDesa,12.0
...,...,...,...
93772,zurro,suro,4.0
93773,zurró,suro,2.0
93774,zurrón,suron,2.0
93775,zurullo,suRujo,6.0


In [48]:
#NEW STUFF
phono_bins_copy = phono_bins_copy[~phono_bins_copy['match'].isin(subtlex_copy['word'])]
phono_bins_copy = phono_bins_copy.reset_index()
phono_bins_copy.drop("index", axis = 1, inplace = True)
phono_bins_copy

Unnamed: 0,frequency_bin,length,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,smallest_score_diff
0,10-100,5,árabe,álije,39.356987,aRaBe,alixe,-1.079542,-0.967563,0.111979,0.000387
1,10-100,5,árabe,áliñe,39.356987,aRaBe,aliYe,-1.079542,-1.140924,0.061382,0.000387
2,10-100,5,árabe,áciñe,39.356987,aRaBe,asiYe,-1.079542,-1.267486,0.187944,0.000387
3,10-100,5,árabe,ácipe,39.356987,aRaBe,asipe,-1.079542,-1.263933,0.184391,0.000387
4,10-100,5,árabe,ácife,39.356987,aRaBe,asife,-1.079542,-1.257337,0.177795,0.000387
...,...,...,...,...,...,...,...,...,...,...,...
49675,>100,8,presente,tremante,149.428532,pResente,tRemante,-0.604469,-0.782188,0.177719,0.175083
49676,>100,8,presente,trebante,149.428532,pResente,tReBante,-0.604469,-0.779552,0.175083,0.175083
49677,>100,8,izquierda,isguienda,116.823285,iskjeRDa,isGjenda,-1.080117,-1.255786,0.175669,0.175669
49678,>100,8,izquierda,isguierta,116.823285,iskjeRDa,isGjeRta,-1.080117,-1.274886,0.194769,0.175669


In [59]:
phono_bins_copy.to_csv("phono_bins_copy.csv", index=False)

In [49]:
smallest3 = phono_bins_copy.groupby("spelling")["score_diff"].nsmallest(3).index
smallest3

MultiIndex([( 'abadejo', 34022),
            ( 'abadejo', 34021),
            ( 'abadejo', 34019),
            ( 'abadesa', 28918),
            ( 'abadesa', 28920),
            ( 'abadesa', 28916),
            (  'abadía', 21623),
            (   'abajo',   910),
            ('abalorio', 47164),
            ('abalorio', 47163),
            ...
            (  'órgano',  2630),
            (   'óvalo', 11186),
            (   'óvalo', 11188),
            (   'óvalo', 11187),
            (   'óvulo',  7070),
            (   'óvulo',  7071),
            (   'óvulo',  7072),
            (   'útero',  9032),
            (   'útero',  9033),
            (   'útero',  9034)],
           names=['spelling', None], length=27045)

In [50]:
sorted_index = [index for (word, index) in smallest3.tolist()]
closest_pairs = phono_bins_copy.iloc[sorted_index, :]
closest_pairs.to_csv("closest_pairs.csv", index = False)
closest_pairs

Unnamed: 0,frequency_bin,length,spelling,match,freq,real,pseudo,score_real,score_pseudo,score_diff,smallest_score_diff
34022,<10,7,abadejo,amasema,0.172205,aBaDexo,amasema,-1.200831,-1.119467,0.081364,0.081364
34021,<10,7,abadejo,amasepa,0.172205,aBaDexo,amasepa,-1.200831,-1.110703,0.090128,0.081364
34019,<10,7,abadejo,amaseba,0.172205,aBaDexo,amaseBa,-1.200831,-1.062042,0.138789,0.081364
28918,<10,7,abadesa,amamelo,1.689559,aBaDesa,amamelo,-0.922396,-0.904641,0.017756,0.017756
28920,<10,7,abadesa,amaselo,1.689559,aBaDesa,amaselo,-0.922396,-0.885298,0.037098,0.017756
...,...,...,...,...,...,...,...,...,...,...,...
7071,<10,5,óvulo,óduza,1.575839,oBulo,oDusa,-1.064110,-1.059284,0.004826,0.004826
7072,<10,5,óvulo,ócusa,1.575839,oBulo,okusa,-1.064110,-1.074300,0.010190,0.004826
9032,<10,5,útero,úsena,3.210163,uteRo,usena,-1.137436,-1.158379,0.020943,0.020943
9033,<10,5,útero,úvena,3.210163,uteRo,uBena,-1.137436,-1.172119,0.034683,0.020943


In [51]:
unique_spanish_words = closest_pairs[['spelling', 'length', 'frequency_bin']].drop_duplicates()
unique_spanish_words = unique_spanish_words.reset_index().drop("index", axis = 1)
unique_spanish_words

Unnamed: 0,spelling,length,frequency_bin
0,abadejo,7,<10
1,abadesa,7,<10
2,abadía,6,<10
3,abajo,5,10-100
4,abalorio,8,<10
...,...,...,...
10252,órdago,6,<10
10253,órgano,6,10-100
10254,óvalo,5,<10
10255,óvulo,5,<10


In [52]:
spanish_match_pairs = closest_pairs.groupby("spelling")['match'].apply(list)
spanish_match_pairs = unique_spanish_words.merge(spanish_match_pairs, on = "spelling", how = "left")
spanish_match_pairs[['nonword1', 'nonword2', 'nonword3']] = spanish_match_pairs['match'].apply(pd.Series)
spanish_match_pairs = spanish_match_pairs.drop('match', axis = 1)
spanish_match_pairs = spanish_match_pairs.replace(np.nan, None)
spanish_match_pairs

Unnamed: 0,spelling,length,frequency_bin,nonword1,nonword2,nonword3
0,abadejo,7,<10,amasema,amasepa,amaseba
1,abadesa,7,<10,amamelo,amaselo,amalevo
2,abadía,6,<10,amirío,,
3,abajo,5,10-100,avizo,,
4,abalorio,8,<10,amageria,amagenio,amasocia
...,...,...,...,...,...,...
10252,órdago,6,<10,órnimo,ónvamo,órbimo
10253,órgano,6,10-100,ónfino,ónsino,
10254,óvalo,5,<10,ócaza,ócasa,ócila
10255,óvulo,5,<10,ódusa,óduza,ócusa


## Abbreviation Definitions
**``wbps`` = Word-Based Phonotactic Score**

**``mbps`` = Morph-Based Phonotactic Score**

**``wbos`` = Word-Based Orthotactic Score**

**``mbos`` = Morph-Based Orthotactic Score**

In [53]:
filtered_srilm = pd.read_csv("filtered_srilm")
filtered_srilm.drop("Unnamed: 0", axis = 1, inplace = True)
final_filtered_srilm = pd.read_csv("final_filtered_srilm.csv")
merged_phono = pd.read_csv("merged_phono.csv")
database_numbers = pd.read_csv("database_numbers.csv")

In [66]:
phono_word = filtered_srilm[['spelling', 'match', 'score_real', 'score_pseudo']]
phono_morph = final_filtered_srilm[['spelling', 'real', 'pseudo', 'match', 'score_spanish', 'score_match']]
ortho_word = merged_phono[['spelling', 'match', 'score_spanish_phono', 'score_match_phono']]
ortho_morph = database_numbers[['spelling', 'match', 'score_spanish_morph', 'score_match_morph']]

#put all the data together
all_scores = phono_word.merge(
    phono_morph,
    left_on=["spelling", "match"],
    right_on=["spelling", "match"],
    how="left"
).rename(
    {"score_match": "pseudo_mbps",
     "score_pseudo": "pseudo_wbps",
     "score_real": "real_wbps",
     "score_spanish": "real_mbps"}, 
     axis=1
).merge(
    ortho_word,
    left_on=["spelling", "match"],
    right_on=["spelling", "match"],
    how="left"
).rename(
    {"score_spanish_phono": "real_wbos",
     "score_match_phono": "pseudo_wbos"},
     axis=1
).merge(
    ortho_morph,
    left_on=["spelling", "match"],
    right_on=["spelling", "match"],
    how="left"
).rename(
    {"score_spanish_morph": "real_mbos",
     "score_match_morph": "pseudo_mbos"},
     axis=1
).loc[
    :,
    ["spelling", "real", "real_wbps", "real_mbps", "real_wbos", "real_mbos",
     "match", "pseudo", "pseudo_wbps", "pseudo_mbps", "pseudo_wbos", "pseudo_mbos"]
].dropna().reset_index().drop("index", axis = 1) 

all_scores

Unnamed: 0,spelling,real,real_wbps,real_mbps,real_wbos,real_mbos,match,pseudo,pseudo_wbps,pseudo_mbps,pseudo_wbos,pseudo_mbos
0,ábaco,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ádiso,aDiso,-1.072265,-0.947831,-1.063084,-0.935414
1,ábaco,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ásiso,asiso,-1.052758,-0.911200,-1.151466,-0.972123
2,ábaco,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ápiso,apiso,-0.983937,-0.931994,-1.040188,-0.941413
3,ábaco,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ácasa,akasa,-0.951880,-0.833379,-1.018354,-0.873597
4,ábaco,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ácala,akala,-0.848025,-0.858517,-0.873589,-0.848636
...,...,...,...,...,...,...,...,...,...,...,...,...
86105,zurullo,suRujo,-1.149236,-1.021517,-1.271700,-1.176064,nucucho,nukuCo,-1.201352,-1.081825,-1.220641,-1.107392
86106,zutano,sutano,-1.043901,-0.939211,-1.330688,-1.106246,nucino,nusino,-1.145295,-0.966216,-1.177899,-1.012196
86107,zutano,sutano,-1.043901,-0.939211,-1.330688,-1.106246,jucino,xusino,-0.952254,-0.872232,-1.295764,-1.049329
86108,zutano,sutano,-1.043901,-0.939211,-1.330688,-1.106246,nucaro,nukaRo,-1.139930,-1.116716,-1.179798,-1.105062


In [16]:
def shape(dataframe, label):
    '''
    The function takes in two inputs (a dataframe and a column name)
    and returns a list of word shapes, based on vowels and consonants,
    for each word.
    '''
    
    new_string = ""
    match_list = []
    spanish_list = []
    error1 = ""
    error2 = ""
    index_error = ""
    consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'ñ', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'y', 'z',
                  'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'Ñ', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z',]
    vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'ü',
              'A', 'E', 'I', 'O', 'U', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ü']

    for index in range(0, len(dataframe[label])):
        word = (dataframe[label][index])
        if len(word) == 0:
            index_error = (index)
        for letter in word:
            if letter in consonants:
                new_string += "C"
            elif letter in vowels:
                new_string += "V"
            else: 
                error1 += letter
        match_list.append(new_string)
        new_string = ""
    
    return match_list

In [67]:
all_scores.insert(1, "word_shape", shape(all_scores, "spelling"))
all_scores

Unnamed: 0,spelling,word_shape,real,real_wbps,real_mbps,real_wbos,real_mbos,match,pseudo,pseudo_wbps,pseudo_mbps,pseudo_wbos,pseudo_mbos
0,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ádiso,aDiso,-1.072265,-0.947831,-1.063084,-0.935414
1,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ásiso,asiso,-1.052758,-0.911200,-1.151466,-0.972123
2,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ápiso,apiso,-0.983937,-0.931994,-1.040188,-0.941413
3,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ácasa,akasa,-0.951880,-0.833379,-1.018354,-0.873597
4,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ácala,akala,-0.848025,-0.858517,-0.873589,-0.848636
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86105,zurullo,CVCVCCV,suRujo,-1.149236,-1.021517,-1.271700,-1.176064,nucucho,nukuCo,-1.201352,-1.081825,-1.220641,-1.107392
86106,zutano,CVCVCV,sutano,-1.043901,-0.939211,-1.330688,-1.106246,nucino,nusino,-1.145295,-0.966216,-1.177899,-1.012196
86107,zutano,CVCVCV,sutano,-1.043901,-0.939211,-1.330688,-1.106246,jucino,xusino,-0.952254,-0.872232,-1.295764,-1.049329
86108,zutano,CVCVCV,sutano,-1.043901,-0.939211,-1.330688,-1.106246,nucaro,nukaRo,-1.139930,-1.116716,-1.179798,-1.105062


In [68]:
all_scores = all_scores[~all_scores['match'].isin(subtlex_copy['word'])]
all_scores = all_scores.reset_index()
all_scores.drop("index", axis = 1, inplace = True)
all_scores.to_csv("all_scores.csv", index=False)
all_scores

Unnamed: 0,spelling,word_shape,real,real_wbps,real_mbps,real_wbos,real_mbos,match,pseudo,pseudo_wbps,pseudo_mbps,pseudo_wbos,pseudo_mbos
0,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ádiso,aDiso,-1.072265,-0.947831,-1.063084,-0.935414
1,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ásiso,asiso,-1.052758,-0.911200,-1.151466,-0.972123
2,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ápiso,apiso,-0.983937,-0.931994,-1.040188,-0.941413
3,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ácasa,akasa,-0.951880,-0.833379,-1.018354,-0.873597
4,ábaco,VCVCV,aBako,-0.978211,-0.822401,-1.019781,-0.893222,ácala,akala,-0.848025,-0.858517,-0.873589,-0.848636
...,...,...,...,...,...,...,...,...,...,...,...,...,...
85556,zurullo,CVCVCCV,suRujo,-1.149236,-1.021517,-1.271700,-1.176064,nucucho,nukuCo,-1.201352,-1.081825,-1.220641,-1.107392
85557,zutano,CVCVCV,sutano,-1.043901,-0.939211,-1.330688,-1.106246,nucino,nusino,-1.145295,-0.966216,-1.177899,-1.012196
85558,zutano,CVCVCV,sutano,-1.043901,-0.939211,-1.330688,-1.106246,jucino,xusino,-0.952254,-0.872232,-1.295764,-1.049329
85559,zutano,CVCVCV,sutano,-1.043901,-0.939211,-1.330688,-1.106246,nucaro,nukaRo,-1.139930,-1.116716,-1.179798,-1.105062


In [69]:
#merge with nonword 1
final = spanish_match_pairs.merge(all_scores, left_on=['spelling', 'nonword1'],
                                  right_on=['spelling', 'match'],
                                  how = "left"
                                 ).rename(
                                    {"pseudo": "nonword1_phono",
                                     "pseudo_wbps": "nonword1_wbps",
                                     "pseudo_mbps": "nonword1_mbps",
                                     "pseudo_wbos": "nonword1_wbos",
                                     "pseudo_mbos": "nonword1_mbos"},
                                     axis=1).loc[:, 
                                                ["spelling", "word_shape", "real", "length", "frequency_bin", "real_wbps", "real_mbps", "real_wbos", "real_mbos",
                                                 "nonword1", "nonword1_phono", "nonword1_wbps", "nonword1_mbps", "nonword1_wbos", "nonword1_mbos",
                                                 "nonword2", "nonword3"]]

freq_bin = final["frequency_bin"].to_list()

#merge with nonword 2
final = final.merge(all_scores, left_on=['spelling', 'nonword2'],
                                  right_on=['spelling', 'match'],
                                  how = "left"
                                 ).rename(
                                    {"spelling": "spanish",
                                     "pseudo": "nonword2_phono",
                                     "pseudo_wbps": "nonword2_wbps",
                                     "pseudo_mbps": "nonword2_mbps",
                                     "pseudo_wbos": "nonword2_wbos",
                                     "pseudo_mbos": "nonword2_mbos",
                                     "word_shape_x": "word_shape",
                                     "real_x": "real",
                                     "real_wbps_x": "real_wbps",
                                     "real_mbps_x": "real_mbps",
                                     "real_wbos_x": "real_wbos",
                                     "real_mbos_x": "real_mbos"},
                                     axis=1).loc[:, 
                                                ["spanish", "word_shape", "real", "length", "frequency_bin", "real_wbps", "real_mbps", "real_wbos", "real_mbos",
                                                 "nonword1", "nonword1_phono", "nonword1_wbps", "nonword1_mbps", "nonword1_wbos", "nonword1_mbos",
                                                 "nonword2", "nonword2_phono", "nonword2_wbps", "nonword2_mbps", "nonword2_wbos", "nonword2_mbos", "nonword3"]]

final = final.drop(final.columns[4], axis = 1)
final.insert(4, "frequency_bin", freq_bin)
pd.set_option('display.max_columns', None)
final

#merge with nonword 3
final = final.merge(all_scores, left_on=['spanish', 'nonword3'],
                                  right_on=['spelling', 'match'],
                                  how = "left"
                                 ).rename(
                                    {"spanish": "word",
                                     "pseudo": "nonword3_phono",
                                     "pseudo_wbps": "nonword3_wbps",
                                     "pseudo_mbps": "nonword3_mbps",
                                     "pseudo_wbos": "nonword3_wbos",
                                     "pseudo_mbos": "nonword3_mbos",
                                     "word_shape_x": "word_shape",
                                     "real_x": "real",
                                     "real_wbps_x": "real_wbps",
                                     "real_mbps_x": "real_mbps",
                                     "real_wbos_x": "real_wbos",
                                     "real_mbos_x": "real_mbos"},
                                     axis=1).loc[:, 
                                                ["word", "word_shape", "real", "length", "frequency_bin", "real_wbps", "real_mbps", "real_wbos", "real_mbos",
                                                 "nonword1", "nonword1_phono", "nonword1_wbps", "nonword1_mbps", "nonword1_wbos", "nonword1_mbos",
                                                 "nonword2", "nonword2_phono", "nonword2_wbps", "nonword2_mbps", "nonword2_wbos", "nonword2_mbos", 
                                                 "nonword3", "nonword3_phono", "nonword3_wbps", "nonword3_mbps", "nonword3_wbos", "nonword3_mbos"]]

final = final.rename(columns={"real" : "word_phono"})
final.to_csv("final.csv", index=False)
final

Unnamed: 0,word,word_shape,word_phono,length,frequency_bin,real_wbps,real_mbps,real_wbos,real_mbos,nonword1,nonword1_phono,nonword1_wbps,nonword1_mbps,nonword1_wbos,nonword1_mbos,nonword2,nonword2_phono,nonword2_wbps,nonword2_mbps,nonword2_wbos,nonword2_mbos,nonword3,nonword3_phono,nonword3_wbps,nonword3_mbps,nonword3_wbos,nonword3_mbos
0,abadejo,VCVCVCV,aBaDexo,7,<10,-1.200831,-1.023833,-1.032983,-0.984243,amasema,amasema,-1.119467,-0.898795,-1.146725,-0.925743,amasepa,amasepa,-1.110703,-0.925374,-1.181544,-0.981292,amaseba,amaseBa,-1.062042,-0.953674,-1.222683,-1.050147
1,abadesa,VCVCVCV,aBaDesa,7,<10,-0.922396,-0.867021,-0.958502,-0.858800,amamelo,amamelo,-0.904641,-1.004391,-0.909654,-0.980816,amaselo,amaselo,-0.885298,-0.935893,-0.919350,-0.973865,amalevo,amaleBo,-0.968598,-0.920839,-0.958778,-0.901468
2,abadía,VCVCVV,aBaDia,6,<10,-0.931503,-0.928901,-0.954953,-0.898360,amirío,amiRio,-1.074031,-1.019637,-1.000347,-0.939000,,,,,,,,,,,,
3,abajo,VCVCV,aBaxo,5,10-100,-0.897299,-0.899979,-0.855057,-0.884912,avizo,aBiso,-0.936634,-0.849850,-1.020445,-1.024364,,,,,,,,,,,,
4,abalorio,VCVCVCVV,aBaloRjo,8,<10,-0.890321,-0.849770,-0.948355,-0.905642,amageria,amaxeRja,-0.966344,-0.877356,-0.952869,-0.912715,amagenio,amaxenjo,-1.016219,-0.895868,-1.124918,-0.943758,amasocia,amasosja,-1.046141,-0.888239,-1.029229,-0.885489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10252,órdago,VCCVCV,oRDaGo,6,<10,-1.174232,-0.990296,-1.155454,-1.000434,órnimo,oRnimo,-1.149226,-0.987881,-1.154254,-1.009236,ónvamo,ombamo,-1.143998,-1.033408,-1.282773,-1.159105,órbimo,oRBimo,-1.223367,-1.086692,-1.272033,-1.080307
10253,órgano,VCCVCV,oRGano,6,10-100,-1.011449,-0.906425,-1.036424,-0.955281,ónfino,omfino,-1.102622,-0.960591,-1.212969,-1.061907,ónsino,onsino,-1.115095,-0.950551,-1.206797,-1.030665,,,,,,
10254,óvalo,VCVCV,oBalo,5,<10,-0.918871,-1.005842,-1.062990,-1.065607,ócaza,okasa,-1.031922,-0.931022,-1.109884,-1.043426,ócasa,okasa,-1.031922,-0.931022,-1.091128,-0.978277,ócila,osila,-1.117521,-1.001348,-1.025568,-0.991242
10255,óvulo,VCVCV,oBulo,5,<10,-1.064110,-1.001544,-1.202203,-1.094305,ódusa,oDusa,-1.059284,-0.960116,-1.271949,-1.173230,óduza,oDusa,-1.059284,-0.960116,-1.155978,-1.035303,ócusa,okusa,-1.074300,-1.032888,-1.090018,-1.072040


In [65]:
#contingency table of bins
final_contingency_table = pd.crosstab(final.frequency_bin, final.length).iloc[[1, 0, 2], :]
final_contingency_table

length,5,6,7,8
frequency_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<10,1220,2185,2652,2644
10-100,270,334,367,355
>100,70,67,58,35
