In [53]:
import csv

# Part-of-speech markers as they appear inside the Spanish field (leading space
# and parentheses included), paired with a readable label. Insertion order is
# kept because read_spanish_csv stops at the first marker that matches.
vocab_types = dict(
    [
        (" (v)", "verb"),
        (" (prep)", "preposition"),
        (" (conj)", "conjunction"),
        (" (adv)", "adverb"),
        (" (nf)", "noun (feminine)"),
        (" (nm)", "noun (masculine)"),
        (" (nm/f)", "noun (masculine/feminine)"),
        (" (num)", "number"),
        (" (pron)", "pronoun"),
        (" (adj)", "adjective"),
        (" (nc)", "noun (common)"),
        (" (nf el)", "noun (feminine) (el)"),
        (" (nmf)", "noun (masculine/feminine)"),
        (" (interj)", "interjection"),
        (" (n)", "noun"),
        (" (adj, adv)", "adjective, adverb"),
        (" (adj, pron)", "adjective, pronoun"),
        (" (adj pron)", "adjective, pronoun"),
        (" (art)", "article"),
    ]
)

# Substring substitutions applied to the Spanish field, in this order:
# HTML entities/tags are dropped or turned into "; " separators, colons become
# "; ", and a double space is collapsed to a single one.
spanish_replacements = dict(
    [
        ("&nbsp;", " "),
        (": ", "; "),
        (" :", "; "),
        ("<br />", "; "),
        ("<br>", "; "),
        ("  ", " "),
        ("<div>", ""),
        ("</div>", ""),
    ]
)

# Markup removed from the English field: every listed tag maps to the empty string.
english_replacements = dict.fromkeys(("<div>", "</div>"), "")


def _apply_replacements(text, replacements):
    """Return ``text`` after applying each ``old -> new`` pair in mapping order."""
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text


def read_spanish_csv(filename):
    """Load vocabulary entries from a CSV file, cleaned and sorted by frequency.

    The first row is treated as a header and skipped; blank rows are ignored.
    For every other row, column 0 is the Spanish text, column 2 the English
    text and column 4 is parsed as the integer frequency.

    The Spanish text is passed through ``spanish_replacements``, must contain
    one of the markers in ``vocab_types`` (which is then removed), has an
    unmatched ``[`` or ``]`` dropped, and is stripped of surrounding whitespace
    and one trailing ``;``. The English text is passed through
    ``english_replacements``.

    Args:
        filename: Path of the CSV file. It is decoded as UTF-8.

    Returns:
        A list of dicts with the keys ``"spanish"``, ``"english"`` and
        ``"frequency"``, sorted by ascending frequency.

    Raises:
        AssertionError: If the Spanish text contains no marker from
            ``vocab_types``.
        ValueError: If column 4 cannot be parsed as an integer.
        IndexError: If a non-empty row has fewer than five columns.
    """
    entries = []
    # The vocabulary contains accented characters, so decode explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    with open(filename, newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to parse.
                continue
            spanish = row[0]
            english = row[2]
            frequency = int(row[4])

            spanish = _apply_replacements(spanish, spanish_replacements)

            # The first matching marker is only validated and stripped; the
            # label it maps to is not stored in the result.
            marker = next((key for key in vocab_types if key in spanish), None)
            assert marker is not None, f"vocab type not found for {spanish}"
            spanish = spanish.replace(marker, "")

            # Drop a bracket only when its partner is missing.
            if ("[" in spanish) != ("]" in spanish):
                spanish = spanish.replace("[", "").replace("]", "")

            spanish = spanish.strip()
            if spanish.endswith(";"):
                # rstrip again: removing the ';' can expose a space before it.
                spanish = spanish[:-1].rstrip()

            # clean up english
            english = _apply_replacements(english, english_replacements)

            entries.append(
                {"spanish": spanish, "english": english, "frequency": frequency}
            )
    return sorted(entries, key=lambda entry: entry["frequency"])


# Parse the CSV into a frequency-sorted list of cleaned entries.
vocab = read_spanish_csv("spanish.csv")

# Echo every entry, one dict per line, to eyeball the result of the cleaning.
for v in vocab:
    print(v)

{'spanish': 'el, la', 'english': 'the', 'frequency': 1}
{'spanish': 'de de', 'english': 'of, from', 'frequency': 2}
{'spanish': 'que', 'english': 'that, which', 'frequency': 3}
{'spanish': 'y', 'english': 'and', 'frequency': 4}
{'spanish': 'a', 'english': 'to, at', 'frequency': 5}
{'spanish': 'en', 'english': 'in, on', 'frequency': 6}
{'spanish': 'un', 'english': 'a, an', 'frequency': 7}
{'spanish': 'ser', 'english': 'to be (norm) (v)', 'frequency': 8}
{'spanish': 'se', 'english': 'oneself, self', 'frequency': 9}
{'spanish': 'no', 'english': 'no', 'frequency': 10}
{'spanish': 'haber', 'english': 'to have (aux v)', 'frequency': 11}
{'spanish': 'por', 'english': 'by, for, through', 'frequency': 12}
{'spanish': 'con', 'english': 'with', 'frequency': 13}
{'spanish': 'su', 'english': 'his/her/their/your (formal)', 'frequency': 14}
{'spanish': 'para', 'english': 'for, to, in order to', 'frequency': 15}
{'spanish': 'como', 'english': 'like, as', 'frequency': 16}
{'spanish': 'estar', 'english'

In [8]:
# Read the raw CSV into a DataFrame for inspection.
data = pd.read_csv("spanish.csv")

# Show the column index, then the first five rows.
print(data.columns, data.head(5), sep="\n")

Index(['Spanish', 'Picture', 'English', 'Audio', 'Ranking', 'tags', 'deck'], dtype='object')
          Spanish                                   Picture  \
0  absoluto (adj)  <img src="paste-4562608183050241.jpg" />   
1    actual (adj)   <img src="paste-152454159138817.jpg" />   
2  adecuado (adj)   <img src="paste-139895674765313.jpg" />   
3    alemán (adj)   <img src="paste-657787126284289.jpg" />   
4    amplio (adj)  <img src="paste-1060178317279233.jpg" />   

              English                                              Audio  \
0            absolute  [sound:yandex-ed786d1b-be3786b7-8c793710-a9852...   
1       current (adj)  [sound:yandex-ef6e5c52-bbef6262-893468bd-45560...   
2  adequate, suitable  [sound:yandex-e1555468-2330294c-3eda397c-11c5a...   
3              German  [sound:yandex-d5e1e0a3-b3933244-e0c7bbe8-32a8e...   
4  wide, ample, broad  [sound:yandex-4f2c947a-053c11e8-252e58c4-ac7a8...   

   Ranking        tags                                   deck  
0      