LSTM
==

In [1]:
import sys
import torch

import pandas as pd
sys.path.append('../../')

from src.models.datagenerator import DataGenerator
from src.models.lstm import GenderLSTM
from src.models.visualization import plot_metrics, plot_prediction_curve, view_plateau, compare_accuracies
from src.models.metrics import baseline_accuracy, statistical_check
from src.models.utils import save_probabilities


In [2]:
filename = '../../data/nlexique.csv'

# Source column -> label used throughout this notebook.
# The 'trasncription' spelling is reproduced as-is: later cells select the column by this exact label.
column_labels = {'lexeme': 'orthography', 'sg': 'phonetic trasncription', 'gen': 'gender'}

# Keep only the three columns of interest, then relabel them
df = pd.read_csv(filename)[list(column_labels)].rename(columns=column_labels)
df.head(3)

Unnamed: 0,orthography,phonetic trasncription,gender
0,à-côté,akOte,m
1,à-coup,aku,m
2,à-peu-près,apØpʁɛ,m


In [3]:
# Three-class datasets (the 'b' class is kept); rows with missing values are dropped
orthography_data_3gen = df.loc[:, ['orthography', 'gender']].dropna()
phonetic_data_3gen = df.loc[:, ['phonetic trasncription', 'gender']].dropna()

In [4]:
# Two-class datasets: rows labelled 'b' are filtered out before dropping missing values
not_b_class = df['gender'] != 'b'
orthography_data_2gen = df.loc[not_b_class, ['orthography', 'gender']].dropna()
phonetic_data_2gen = df.loc[not_b_class, ['phonetic trasncription', 'gender']].dropna()

Model
==

Train the model by feeding it words in reverse order (since we want to start from the end of the word) and the corresponding genders. The model will learn to predict the gender based on the characters seen so far.

The model takes preprocessed words as input:
- tokenized into characters
- each character mapped to a unique integer id (and likewise each gender)
- sequences padded so they all have the same length

### Hyperparameters


In [5]:
# Compute device: the GPU when CUDA is available, otherwise the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fixed seed so torch-driven randomness repeats across "Restart & Run All".
# NOTE(review): this seeds torch only; if DataGenerator draws from `random` or numpy,
# those need their own seed -- TODO confirm.
SEED = 42
torch.manual_seed(SEED)

# Passed to the GenderLSTM constructor
embedding_dim = 128
hidden_size = 256

# Passed to train_model
batch_size = 64
n_epochs = 20
lr = 0.001

### Orthographic Form (3 classes)

In [6]:
# reverse_nouns/df are passed explicitly, matching the other three DataGenerator setups in
# this notebook and the reversed-word training described in the Model section.
or3_data_generator = DataGenerator(orthography_data_3gen, reverse_nouns=True, df=True)
or3_model = GenderLSTM(or3_data_generator, embedding_dim, hidden_size, device=DEVICE)

# NOTE(review): the recorded run of this cell stopped at epoch 0 with a cpu/cuda device
# mismatch; presumably the batches are not moved to `device` inside src.models -- verify there.
or3_train, or3_valid = or3_model.train_model(
    or3_data_generator, n_epochs, batch_size, lr,
    model_path='../saved_models/orthography_3gen.pth',
)

  from .autonotebook import tqdm as notebook_tqdm


Epoch: 0


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# Run the trained orthographic 3-class model over its data generator and tabulate the output
or3_predictions = or3_model.predict(or3_data_generator, batch_size)
or3_predictions_df = pd.DataFrame(data=or3_predictions)
or3_predictions_df

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# Plot accuracy and loss for the training and validation sets (orthographic, 3 classes)
plot_metrics(
    or3_train['accuracy'],
    or3_valid['accuracy'],
    or3_train['loss'],
    or3_valid['loss'],
)

NameError: name 'or3_train' is not defined

In [None]:
# Baseline accuracy for the orthographic 3-class data, compared against the last
# recorded validation accuracy of the trained model
baseline_acc = baseline_accuracy(or3_data_generator, verbose=True)
or3_last_valid_accuracy = or3_valid['accuracy'][-1]
compare_accuracies(baseline_acc, or3_last_valid_accuracy)

In [None]:
# Sample one noun at every 50th index label (0, 50, ..., 450).
# Words annotated in the original notebook, in order: 'à-côté', 'ablution', 'abside',
# 'accessoire', 'accu', 'acquiescement', 'addendum', 'admonition', 'aéroglisseur', 'affiliée'
or3_nouns = orthography_data_3gen['orthography']
(word_0, word_1, word_2, word_3, word_4,
 word_5, word_6, word_7, word_8, word_9) = (or3_nouns[label] for label in range(0, 500, 50))

# Only the last sampled word is inspected here
print(f'word: {word_9}')
view_plateau(word_9, '../saved_models/orthography_3gen.pth', reverse=True)

In [None]:
# Saved orthographic 3-class model -> probabilities CSV
save_probabilities(
    '../saved_models/orthography_3gen.pth',
    '../results/orthography_3gen.csv',
)

In [None]:
# Built from the values in the Hyperparameters cell so the repeated-run settings cannot
# drift from the single-run settings used above.
hyperparameters = {
    'embed_dim': embedding_dim,
    'hidden_size': hidden_size,
    'batch_size': batch_size,
    'n_epochs': n_epochs,
    'lr': lr,
}

# Averaging results over 10 runs
or3_t, or3_v = statistical_check(
    orthography_data_3gen, hyperparameters, runs=10,
    reverse_nouns=True, df=True, device=DEVICE,
)

### Phonetic Form (3 classes)

In [None]:
# Phonetic transcriptions, three gender classes: build the generator, the model, then train
ph3_data_generator = DataGenerator(phonetic_data_3gen, reverse_nouns=True, df=True)
ph3_model = GenderLSTM(ph3_data_generator, embedding_dim, hidden_size, device=DEVICE)
ph3_train, ph3_valid = ph3_model.train_model(
    ph3_data_generator, n_epochs, batch_size, lr,
    model_path='../saved_models/phonetic_3gen.pth',
)

In [None]:
# Run the trained phonetic 3-class model over its data generator and tabulate the output
ph3_predictions = ph3_model.predict(ph3_data_generator, batch_size)
ph3_predictions_df = pd.DataFrame(data=ph3_predictions)
ph3_predictions_df

In [None]:
# Plot accuracy and loss for the training and validation sets (phonetic, 3 classes)
plot_metrics(
    ph3_train['accuracy'],
    ph3_valid['accuracy'],
    ph3_train['loss'],
    ph3_valid['loss'],
)

In [None]:
# Baseline accuracy for the phonetic 3-class data, compared against the last
# recorded validation accuracy of the trained model
baseline_acc = baseline_accuracy(ph3_data_generator, verbose=True)
ph3_last_valid_accuracy = ph3_valid['accuracy'][-1]
compare_accuracies(baseline_acc, ph3_last_valid_accuracy)

In [None]:
# Averaging results over 10 runs.
# reverse_nouns/df are passed explicitly so the repeated runs use the same settings as
# ph3_data_generator and as the orthographic 3-class statistical_check call.
ph3_t, ph3_v = statistical_check(
    phonetic_data_3gen, hyperparameters, runs=10,
    reverse_nouns=True, df=True, device=DEVICE,
)

In [None]:
# Index label 450 of the phonetic data (annotated 'affiliée' in the original notebook)
ph3_nouns = phonetic_data_3gen['phonetic trasncription']
word_9 = ph3_nouns[450]

print(f'word: {word_9}')
view_plateau(word_9, '../saved_models/phonetic_3gen.pth', reverse=True)

In [None]:
# Saved phonetic 3-class model -> probabilities CSV
save_probabilities(
    '../saved_models/phonetic_3gen.pth',
    '../results/phonetic_3gen.csv',
)

### Orthographic Form (binary)

In [None]:
# Orthographic forms, two gender classes: build the generator, the model, then train
or2_data_generator = DataGenerator(orthography_data_2gen, reverse_nouns=True, df=True)
or2_model = GenderLSTM(or2_data_generator, embedding_dim, hidden_size, device=DEVICE)
or2_train, or2_valid = or2_model.train_model(
    or2_data_generator, n_epochs, batch_size, lr,
    model_path='../saved_models/orthography_2gen.pth',
)

In [None]:
# Run the trained orthographic binary model over its data generator and tabulate the output
or2_predictions = or2_model.predict(or2_data_generator, batch_size)
or2_predictions_df = pd.DataFrame(data=or2_predictions)
or2_predictions_df

In [None]:
# Plot accuracy and loss for the training and validation sets (orthographic, binary)
plot_metrics(
    or2_train['accuracy'],
    or2_valid['accuracy'],
    or2_train['loss'],
    or2_valid['loss'],
)

In [None]:
# Baseline accuracy for the orthographic binary data, compared against the last
# recorded validation accuracy of the trained model
baseline_acc = baseline_accuracy(or2_data_generator, verbose=True)
or2_last_valid_accuracy = or2_valid['accuracy'][-1]
compare_accuracies(baseline_acc, or2_last_valid_accuracy)

In [None]:
# Averaging results over 10 runs.
# reverse_nouns/df are passed explicitly so the repeated runs use the same settings as
# or2_data_generator and as the orthographic 3-class statistical_check call.
or2_t, or2_v = statistical_check(
    orthography_data_2gen, hyperparameters, runs=10,
    reverse_nouns=True, df=True, device=DEVICE,
)

### Phonetic Form (binary)

In [None]:
# Phonetic transcriptions, two gender classes: build the generator, the model, then train
ph2_data_generator = DataGenerator(phonetic_data_2gen, reverse_nouns=True, df=True)
ph2_model = GenderLSTM(ph2_data_generator, embedding_dim, hidden_size, device=DEVICE)
ph2_train, ph2_valid = ph2_model.train_model(
    ph2_data_generator, n_epochs, batch_size, lr,
    model_path='../saved_models/phonetic_2gen.pth',
)

In [None]:
# Run the trained phonetic binary model over its data generator and tabulate the output
ph2_predictions = ph2_model.predict(ph2_data_generator, batch_size)
ph2_predictions_df = pd.DataFrame(data=ph2_predictions)
ph2_predictions_df

In [None]:
# Plot accuracy and loss for the training and validation sets (phonetic, binary)
plot_metrics(
    ph2_train['accuracy'],
    ph2_valid['accuracy'],
    ph2_train['loss'],
    ph2_valid['loss'],
)

In [None]:
# Baseline accuracy for the phonetic binary data, compared against the last
# recorded validation accuracy of the trained model
baseline_acc = baseline_accuracy(ph2_data_generator, verbose=True)
ph2_last_valid_accuracy = ph2_valid['accuracy'][-1]
compare_accuracies(baseline_acc, ph2_last_valid_accuracy)

In [None]:
# Averaging results over 10 runs.
# reverse_nouns/df are passed explicitly so the repeated runs use the same settings as
# ph2_data_generator and as the orthographic 3-class statistical_check call.
ph2_t, ph2_v = statistical_check(
    phonetic_data_2gen, hyperparameters, runs=10,
    reverse_nouns=True, df=True, device=DEVICE,
)

In [None]:
def _avg_accuracy_pct(stats):
    """Format the 'avg_accuracy' entry of a statistical_check result as a percentage string."""
    return f"{(stats['avg_accuracy'].item() * 100):.2f}%"


# (training stats, validation stats) for each input form / class setup
accuracy_runs = {
    'Orthographic Form (M/F/B)': (or3_t, or3_v),
    'Phonetic Form (M/F/B)': (ph3_t, ph3_v),
    'Orthographic Form (M/F)': (or2_t, or2_v),
    'Phonetic Form (M/F)': (ph2_t, ph2_v),
}

# One column per setup; row 0 holds the training figure, row 1 the validation figure
data = {
    form: [_avg_accuracy_pct(train_stats), _avg_accuracy_pct(valid_stats)]
    for form, (train_stats, valid_stats) in accuracy_runs.items()
}

accuracy_results = pd.DataFrame(data).rename(index={0: 'Training Accuracy', 1: 'Validation Accuracy'})
accuracy_results

In [None]:
# Inspect the training-side statistics returned by statistical_check for the orthographic 3-class data
or3_t

In [None]:
# 'avg_plateau_beg' from the 3-class repeated runs, training vs. validation
plateau_key = 'avg_plateau_beg'
plateau_data = {
    'Orthographic Form': [or3_t[plateau_key], or3_v[plateau_key]],
    'Phonetic Form': [ph3_t[plateau_key], ph3_v[plateau_key]],
}

row_labels = {0: 'Training Set', 1: 'Validation Set'}
plateau_results = pd.DataFrame(plateau_data).rename(index=row_labels)
plateau_results