In [1]:
from torch import nn
import pandas as pd

## Read the data from a csv file

We can see that there are 517,176 examples.

In [2]:
# Load the raw Wiktionary export; the trailing expression displays (rows, columns).
raw_csv_path = "wiktionary_raw.csv"
df = pd.read_csv(raw_csv_path)
df.shape

(517176, 3)

## Lowering the text.

All nouns are lowercase except for proper nouns across our language set (except for German).
Here comes our first decision about cleaning our data. 
- Do we keep possible duplicates that differ only in capitalization?
- Does capitalization matter for German if we are only looking at nouns?
- Do we attempt to remove proper nouns (which tend to be capitalized) by checking them against a POS tagger?

For the time being, we will lowercase all nouns.

In [3]:
# Normalise case so that entries differing only in capitalisation compare equal.
lowercased_nouns = df["noun"].str.lower()
df["noun"] = lowercased_nouns

Duplicates will now be removed; this is done across all languages, genders and nouns.

Now we can see our total has dropped down to 378,830 which is a ~ 27% loss

In [4]:
# Remove duplicates: rows are compared on every column and the first
# occurrence of each distinct row is kept.
df_no_dups = df.drop_duplicates(subset=None, keep="first")
df_no_dups.shape

(378830, 3)

## Split df into smaller dfs for each language

In [5]:
# Distinct language codes, in order of first appearance in the de-duplicated frame.
languages = pd.unique(df_no_dups["lang"])
languages

array(['fr', 'de', 'pl', 'es'], dtype=object)

In [6]:
# Build one sub-frame per language, in the same order as `languages`.
dataframes = []
for lang in languages:
    is_lang = df_no_dups["lang"] == lang
    dataframes.append(df_no_dups[is_lang])

### Display distribution of each language dataset
We can see that French has the fewest at about 66K nouns, and German the most with about 118K.


In [7]:
# Report how many rows each per-language frame holds.
for lang, lang_frame in zip(languages, dataframes):
    print(f"{lang} dataframe has {lang_frame.shape[0]} nouns")

fr dataframe has 66479 nouns
de dataframe has 117755 nouns
pl dataframe has 97949 nouns
es dataframe has 96647 nouns


### Display distribution by gender for each language

In [8]:
# Stack the per-language frames back into a single frame.
new_df = pd.concat(dataframes)

# Count rows per (gender, lang) pair, then pivot the languages into columns.
gender_lang_counts = new_df.groupby(["gender", "lang"]).size()
gender_lang_counts.unstack()

lang,de,es,fr,pl
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
feminine,52551.0,40366.0,27974.0,38193.0
masculine,24507.0,50604.0,34107.0,42794.0
neuter,32584.0,,,16836.0


The smallest gender class is neuter for Polish, with 16,836 nouns.

In [9]:
# Smallest (gender, lang) group size. Pairs that never occur show up as NaN
# in the pivoted table and are skipped by min().
pair_counts = new_df.groupby(["gender", "lang"]).size()
grouped = pair_counts.unstack()
per_language_minimum = grouped.min(axis=0)
lowest_value = int(per_language_minimum.min())
lowest_value

16836

## French Data

In [13]:
# Select the French frame by its language code rather than by position, so the
# lookup does not depend on the order in which languages appear in the data.
frames_by_lang = dict(zip(languages, dataframes))
fr_df = frames_by_lang["fr"]
fr_df

Unnamed: 0,noun,gender,lang
0,ak-47,,fr
1,akua-ba,,fr
2,alvéole,,fr
3,ambage,,fr
4,ambages,,fr
...,...,...,...
517171,zythophile,masculine,fr
517172,zythum,masculine,fr
517173,zyzel,masculine,fr
517174,zāy,masculine,fr


In [17]:
# Keep only nouns that contain no hyphen, space, period or ASCII digit.
# The pattern is a raw-string character class, which matches the same
# characters as the original alternation without relying on the invalid
# "\." string escape. na=False treats a missing noun as "no match", so the
# boolean mask can always be negated.
has_unwanted_char = fr_df["noun"].str.contains(r"[- .0-9]", regex=True, na=False)
filtered = fr_df[~has_unwanted_char]
filtered

Unnamed: 0,noun,gender,lang
2,alvéole,,fr
3,ambage,,fr
4,ambages,,fr
5,ammocètes,,fr
6,aphrodite,,fr
...,...,...,...
517171,zythophile,masculine,fr
517172,zythum,masculine,fr
517173,zyzel,masculine,fr
517174,zāy,masculine,fr


In [18]:
# Discard every row that still has a missing value in any column.
# NOTE(review): this rebinds `df`, the name the first cells use for the raw
# frame; re-running those cells afterwards would operate on this frame instead.
df = filtered.dropna(axis=0, how="any")
df

Unnamed: 0,noun,gender,lang
85,abandonnataire,feminine,fr
86,abélite,feminine,fr
87,abkhaze,feminine,fr
88,ablon,feminine,fr
89,abolitioniste,feminine,fr
...,...,...,...
517171,zythophile,masculine,fr
517172,zythum,masculine,fr
517173,zyzel,masculine,fr
517174,zāy,masculine,fr


In [19]:
# Distinct gender labels remaining in the cleaned frame.
gender_column = df["gender"]
labels = gender_column.unique()
labels

array(['feminine', 'masculine'], dtype=object)

In [34]:
# Map each noun to its gender. A noun that appears under more than one gender
# keeps only the gender from its last row, because dict keys are unique.
d = dict(zip(df["noun"], df["gender"]))

# Write one "noun, gender" line per entry. The encoding is set explicitly
# because the nouns contain non-ASCII characters (e.g. "alvéole", "zāy") and
# the platform default encoding may not be able to represent them.
with open("french_clean.txt", "w", encoding="utf-8") as f:
    for noun, gender in d.items():
        f.write(f"{noun}, {gender}\n")