In [12]:
"""Use a traditional neural network setup to classify plural class."""

from re import sub
import numpy as np
import pandas as pd 

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import one_hot

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
"""Helpers that read the data files and convert them into feature and label arrays. Adapted from Brandon's LSTM code."""

def get_strings(data_file):
  """Process a two-column, comma-separated data file into parallel lists.

  The first line of the file is treated as a header and skipped. Blank lines
  are ignored, and rows whose UR or SR field is empty are dropped.

  Args:
    data_file: Path of the file to read.

  Returns:
    A tuple (UR_strings, SR_strings, syll_lengths) of equally long lists:
      UR_strings: first column of each kept row, unchanged.
      SR_strings: last 5 characters of the second column (the plural suffix).
      syll_lengths: number of non-empty, space-separated segments in each UR.

  Raises:
    Exception: If a non-blank row does not split into exactly 2 columns.
  """
  UR_strings, SR_strings, syll_lengths = [], [], []

  # The context manager guarantees the file is closed even when a malformed
  # row raises below.
  with open(data_file) as input_file:
    input_file.readline() # Skip first line

    for line in input_file:
      if not line.strip():
        continue # Tolerate blank/trailing lines instead of reporting them as malformed

      columns = line.rstrip().split(",")
      if len(columns) != 2:
        print(line)
        raise Exception("Training data error! All lines should have 2 columns in TD files!")

      ur, sr = columns
      if sr == "" or ur == "":
        continue

      syll_lengths.append(len([seg for seg in ur.split(" ") if seg != ""]))
      UR_strings.append(ur)
      SR_strings.append(sr[-5:]) # Last 5 characters correspond to plural suffix

  return UR_strings, SR_strings, syll_lengths

def get_arrays(UR_strings, SR_strings, syll_lengths, symbol2feats, suffix2label, override_max_syll=0):
  """Convert UR/SR strings into a padded feature array and a label array.

  Each UR is split on spaces into segments, two known transcription errors are
  repaired ("J" -> "Y", "C" -> "CH"), the segment list is right-padded with the
  "_" symbol up to a common length, and every segment is replaced by its
  feature vector from symbol2feats.

  Args:
    UR_strings: Space-separated segment strings, one per word.
    SR_strings: Strings whose last 5 characters are a key of suffix2label.
    syll_lengths: Number of segments in each UR (as returned by get_strings);
      determines how much padding each word receives.
    symbol2feats: Maps a segment symbol to its feature vector. Must contain
      "_" whenever any word needs padding.
    suffix2label: Maps a 5-character suffix string to its class label.
    override_max_syll: If non-zero, pad every word to this many segments
      instead of the longest word in syll_lengths. Must not be smaller than
      the longest word.

  Returns:
    (X, Y): X is np.array of the per-word lists of feature vectors (3-D,
    words x segments x features, when all feature vectors have equal length);
    Y is a 1-D np.array of labels.

  Raises:
    AssertionError: If override_max_syll is smaller than the longest word.
    KeyError: If a segment or suffix is missing from its lookup table.
  """
  longest = max(syll_lengths)
  if override_max_syll:
    # Equal is fine: the longest word then simply receives no padding.
    assert override_max_syll >= longest, "override_max_syll is shorter than the longest word"
    max_len = override_max_syll
  else:
    max_len = longest

  # Fix some errors in data files. Done per segment rather than with a
  # space-delimited regex, which missed word-initial segments and every
  # second one of two adjacent segments.
  segment_fixes = {"J": "Y", "C": "CH"}

  X_list = []
  Y_list = []
  for word_index, syll_length in enumerate(syll_lengths):
    segments = [segment_fixes.get(seg, seg) for seg in UR_strings[word_index].split(" ") if seg != ""]
    segments.extend(["_"] * (max_len - syll_length)) # Singular form + padding
    this_sr = SR_strings[word_index][-5:] # Suffix as string

    X_list.append([symbol2feats[seg] for seg in segments])
    Y_list.append(suffix2label[this_sr])

  X = np.array(X_list)
  Y = np.array(Y_list)

  return X, Y


In [3]:
"""Implements a pooling function to pool together phonetic feature vectors from different segments into a representative embedding vector."""

# Pooling function 

def pool_average(X):
    """Collapse the segment axis (axis 1) of X by taking the mean.

    For the training data in this notebook X has shape (n, 5, 19), giving one
    averaged feature vector per word.
    """
    word_means = np.mean(X, 1)
    return word_means

def pool_sum(X):
    """Collapse the segment axis (axis 1) of X by summing the feature vectors."""
    word_totals = np.sum(X, 1)
    return word_totals

def pool_last(X):
    """Keep only the feature vector at the final position of axis 1 for every word."""
    return X[:, -1, :]

def pool_concat(X):
    """Flatten each word by joining its per-segment feature vectors head-to-tail.

    A word made of 5 segments with 19 features each becomes one vector of
    length 5 * 19 = 95.
    """
    flattened_words = []
    for segment_vectors in X:
        flattened_words.append(np.concatenate(segment_vectors, axis=0))
    return np.array(flattened_words)


In [4]:
"""Defines FFN model"""

# Define the FFN model
class SimpleFFN(nn.Module):
    """Feed-forward network with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two linear layers and the activation between them.

        Args:
            input_dim: Size of each input vector.
            hidden_dim: Number of hidden units.
            output_dim: Number of output scores per input.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw output scores for x; no sigmoid/softmax is applied here."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


In [5]:
"""Load training data and fit the feed-forward network (FFN) model"""

FEATURES_FILE = "feats.txt"
TRAINING_DATA = "./Equal_FrequencyCondition/equalFreq_train.txt"
POOLING_FUNC = pool_concat # TODO change pooling function
SEED = 42 # Shared by the train/validation split and the PyTorch RNG

# Seed PyTorch so that weight initialisation and minibatch shuffling are
# reproducible on a fresh Restart & Run All.
torch.manual_seed(SEED)

#inputs
# Tab-separated feature table: a header row of feature names, then one row per
# segment symbol with "+", "-" or "0" in every feature column.
value_codes = {"-": -1.0, "+": 1.0, "0": 0.0}
with open(FEATURES_FILE, "r") as feat_file:
    feat_names = feat_file.readline().rstrip().split("\t")[1:] # Skip first space
    symbol2feats = {'_': [0.0 for f in feat_names]} # "_" is the all-zero padding symbol

    for line in feat_file:
        if not line.strip():
            continue # A blank line would otherwise register an empty-string symbol
        columns = line.rstrip().split("\t")
        seg = columns[0]
        values = [value_codes[v] for v in columns[1:]]
        symbol2feats[seg] = values

#outputs
suffix2label = {
    "Y IY0": 0, #yee
    "W AH0": 1, #wuh
    "L EY0": 2 #lay
}

URs, SRs, Ls = get_strings(TRAINING_DATA)
X, y = get_arrays(URs, SRs, Ls, symbol2feats, suffix2label)

print(X.shape)
X = POOLING_FUNC(X)
print(X.shape)
# Split the dataset into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)


# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
# One-hot float targets, as required by BCEWithLogitsLoss below.
y_train = one_hot(torch.tensor(y_train).to(torch.int64), num_classes=3).to(torch.float32)


# Prepare data loaders
dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialize the model, loss function, and optimizer
input_dim = X_train.shape[-1] # TODO make variable

print(input_dim)
hidden_dim = 100
output_dim = 3

model = SimpleFFN(input_dim, hidden_dim, output_dim)
# NOTE(review): the three suffix classes are mutually exclusive, so
# nn.CrossEntropyLoss on integer labels would be the conventional choice;
# BCE-with-logits on one-hot targets is kept so results stay comparable.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
num_epochs = 10

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The value reported is the loss of the final minibatch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


(1188, 5, 19)
(1188, 95)
95


  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/10], Loss: 0.0372
Epoch [2/10], Loss: 0.0307
Epoch [3/10], Loss: 0.0025
Epoch [4/10], Loss: 0.0008
Epoch [5/10], Loss: 0.0011
Epoch [6/10], Loss: 0.0004
Epoch [7/10], Loss: 0.0003
Epoch [8/10], Loss: 0.0005
Epoch [9/10], Loss: 0.0004
Epoch [10/10], Loss: 0.0004


In [6]:
# Define the inference function
def inference(model, inputs):
    """Return the index of the highest-scoring class for each row of inputs.

    The model is switched to evaluation mode (and left in it), and the forward
    pass runs with gradient tracking disabled.
    """
    model.eval()
    with torch.no_grad():
        # Sigmoid turns the raw scores into per-class probabilities; argmax
        # over dim 1 then selects one class index per row.
        probabilities = torch.sigmoid(model(inputs))
        return probabilities.argmax(dim=1)

# torch.as_tensor accepts both the NumPy array produced by the split and an
# existing tensor, so re-running this cell does not trigger PyTorch's
# copy-construct warning the way torch.tensor(X_valid) would.
X_valid = torch.as_tensor(X_valid, dtype=torch.float32)

# Perform inference
y_pred = inference(model, X_valid)
class_report = classification_report(y_valid, y_pred)
print(class_report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        84
           1       1.00      1.00      1.00        84
           2       1.00      1.00      1.00        70

    accuracy                           1.00       238
   macro avg       1.00      1.00      1.00       238
weighted avg       1.00      1.00      1.00       238



In [7]:
"""Evaluate test set"""
TEST_DATA = "./Equal_FrequencyCondition/equalFreq_test.txt"

# Read the test items and encode them with the same symbol and suffix lookup tables.
test_SGs, test_PLs, test_Ls = get_strings(TEST_DATA)
X_test, y_test = get_arrays(test_SGs, test_PLs, test_Ls, symbol2feats, suffix2label)

# Pool the per-segment feature vectors, then convert to a float32 tensor.
X_test = torch.tensor(POOLING_FUNC(X_test), dtype=torch.float32)

# One-hot float encoding of the gold labels (not consumed by the report below).
y_test_one_hot = one_hot(torch.tensor(y_test).long(), num_classes=3).float()

y_pred = inference(model, X_test)

# Per-class precision/recall/F1 against the integer gold labels.
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00        96
           2       1.00      1.00      1.00        96

    accuracy                           1.00       288
   macro avg       1.00      1.00      1.00       288
weighted avg       1.00      1.00      1.00       288



In [8]:
"""Evaluate test mutant set"""
TEST_DATA = "./Equal_FrequencyCondition/equalFreq_test_Mutants.txt"

# Read the mutant items and encode them with the same symbol and suffix lookup tables.
test_SGs, test_PLs, test_Ls = get_strings(TEST_DATA)
X_test, y_test = get_arrays(test_SGs, test_PLs, test_Ls, symbol2feats, suffix2label)

# Pool the per-segment feature vectors, then convert to a float32 tensor.
X_test = torch.tensor(POOLING_FUNC(X_test), dtype=torch.float32)

# One-hot float encoding of the gold labels (not consumed by the report below).
y_test_one_hot = one_hot(torch.tensor(y_test).long(), num_classes=3).float()

y_pred = inference(model, X_test)

# Per-class precision/recall/F1 against the integer gold labels.
class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       0.97      1.00      0.98        30
           2       1.00      0.97      0.98        30

    accuracy                           0.99        90
   macro avg       0.99      0.99      0.99        90
weighted avg       0.99      0.99      0.99        90



In [14]:
"""Evaluate test liquids set"""
TEST_DATA = "./Equal_FrequencyCondition/equalFreq_test_L.txt"

# Read the liquids items and encode them with the same symbol and suffix lookup tables.
test_SGs, test_PLs, test_Ls = get_strings(TEST_DATA)
X_test, y_test = get_arrays(test_SGs, test_PLs, test_Ls, symbol2feats, suffix2label)

# Pool the per-segment feature vectors, then convert to a float32 tensor.
X_test = torch.tensor(POOLING_FUNC(X_test), dtype=torch.float32)

# One-hot float encoding of the gold labels (not consumed by the score below).
y_test_one_hot = one_hot(torch.tensor(y_test).long(), num_classes=3).float()

y_pred = inference(model, X_test)

# Only plain accuracy is reported here: this set contains a single gold class,
# so the per-class classification report is not used.
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9270833333333334


In [29]:
"""Evaluate test new templates set"""
TEST_DATA = "./Equal_FrequencyCondition/equalFreq_testNewTemplates.txt"

test_SGs, test_PLs, test_Ls = get_strings(TEST_DATA)
X_test, y_test = get_arrays(test_SGs, test_PLs, test_Ls, symbol2feats, suffix2label)

X_test = POOLING_FUNC(X_test) # TODO Pool feature vectors

# Need to pad X_test: the pooled vectors of this set can be narrower than the
# input width the model was trained with, so each row is right-padded with
# zeros up to input_dim.
# NOTE(review): trailing zeros match the all-zero "_" padding symbol only when
# POOLING_FUNC is pool_concat -- confirm before switching pooling functions.
padding_length = input_dim - X_test.shape[-1]
if padding_length < 0:
    # Previously a negative width silently produced unpadded rows and the
    # failure only surfaced as a shape mismatch inside the model.
    raise ValueError(
        f"Pooled test vectors have width {X_test.shape[-1]}, "
        f"which exceeds the model input width {input_dim}"
    )
X_test = np.pad(X_test, ((0, 0), (0, padding_length)), mode="constant", constant_values=0.0)

X_test = torch.tensor(X_test, dtype=torch.float32)
# One-hot float encoding of the gold labels (not consumed by the report below).
y_test_one_hot = one_hot(torch.tensor(y_test).to(torch.int64), num_classes=3).to(torch.float32)

y_pred = inference(model, X_test)

class_report = classification_report(y_test, y_pred)
print(class_report)

<class 'list'>
torch.Size([120, 95])
              precision    recall  f1-score   support

           0       0.51      0.97      0.67        40
           1       0.95      0.50      0.66        40
           2       0.95      0.53      0.68        40

    accuracy                           0.67       120
   macro avg       0.80      0.67      0.67       120
weighted avg       0.80      0.67      0.67       120

