<a href="https://colab.research.google.com/github/Vince7778/real-vs-fake-words/blob/main/real_vs_fake_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import os
import random
import numpy as np
import json
import csv
import time

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# options
# Every path below points into the Google Drive mounted at /content/drive/.

# CMU pronouncing dictionary, read by clean_dictionary()
dict_path = '/content/drive/MyDrive/CS 229 Project/data/cmudict-0.7b'
# Word list read by get_training_data() and load_popular_words(); the word is
# taken from the first tab-separated column of each line
popular_path = '/content/drive/MyDrive/CS 229 Project/data/words_by_popularity.txt'
# NOTE(review): not referenced anywhere else in the visible notebook
symbols_path = '/content/drive/MyDrive/CS 229 Project/data/cmudict-0.7b.symbols'
# Directory for weights/history; the save/load helpers build file names by plain
# string concatenation, so the trailing slash is required
model_save_path = '/content/drive/MyDrive/CS 229 Project/model-save-50000-noemph-popular/'
# CSV whose first column holds the pseudowords (see load_pseudowords)
pseudoword_path = '/content/drive/MyDrive/CS 229 Project/data/pseudowords.csv'
# JSON files written by run_predictions() for each prediction set
pseudoword_output = '/content/drive/MyDrive/CS 229 Project/pseudoword-output.json'
dict_output = '/content/drive/MyDrive/CS 229 Project/clean-dict-output.json'
popular_words_output = '/content/drive/MyDrive/CS 229 Project/popular-word-output.json'

N = 50000 # training sample count
n_epochs = 40

# whether saved model should be used
# (True: weights are loaded from model_save_path and training is skipped)
load_model_from_file = True

# whether emphasis should be ignored
# (True: a trailing digit is stripped from each phoneme in clean_dictionary)
ignore_emphasis = True

# use top n most common english words as training data
# (False: get_training_data draws a random sample from the dictionary instead)
use_popularity = True

# which dictionaries to predict on (pseudoword, real, popular)
# e.g. ["pseudoword", "popular"]; an empty list skips the prediction step
prediction_todos = []

In [None]:
# Words outside [min_length, max_length] characters are skipped both when the
# dictionary is cleaned and when predictions are run.
max_length = 15
min_length = 3

# Regex matching any character that is not an uppercase ASCII letter; a word
# containing a match is rejected.
bad_chars = "[^A-Z]"
# Marker tokens placed around every phoneme sequence.
start_seq = "START"
end_seq = "END"

In [None]:
def clean_dictionary():
    """Parse the pronouncing dictionary at ``dict_path`` into ``{word: phonemes}``.

    Each kept entry maps an uppercase word to its phoneme list wrapped in
    ``start_seq`` / ``end_seq``. Entries are dropped when they are comments
    (``;;;``), alternate pronunciations (``WORD(1)``), contain characters
    matched by ``bad_chars``, end in a period, or fall outside
    ``[min_length, max_length]``. Only the first pronunciation seen for a word
    is kept. When ``ignore_emphasis`` is set, a trailing digit is removed from
    every phoneme.

    Blank lines and lines without the two-space word/pronunciation separator
    are skipped instead of aborting the whole parse.

    Returns:
        dict mapping word (str) to list of phoneme tokens (list of str).
    """

    def alternate_spelling(word):
        # Alternate pronunciations look like WORD(1); the length guard keeps
        # the negative indexing safe for very short tokens.
        return len(word) >= 3 and word[-1] == ')' and word[-3] == '(' and word[-2].isdigit()

    def skip(word):
        return bool(
            not word
            or not word[0].isalpha()
            or word[-1] == '.'
            or re.search(bad_chars, word)
            or len(word) > max_length
            or len(word) < min_length
        )

    clean_dict = {}

    with open(dict_path, encoding="ISO-8859-1") as raw_dict:
        for line in raw_dict:

            # Skip commented lines
            if line.startswith(';;;'):
                continue

            entry = line.strip()
            if not entry:
                continue

            # Split only on the first double space so a stray double space
            # inside the pronunciation cannot break the two-way unpack.
            parts = entry.split('  ', 1)
            if len(parts) != 2:
                continue
            word, phonetic = parts

            if alternate_spelling(word) or skip(word):
                continue

            # Keep the first pronunciation only
            if word in clean_dict:
                continue

            # split() (no argument) never yields empty tokens
            phonemes = [start_seq] + phonetic.split() + [end_seq]
            if ignore_emphasis:
                phonemes = [p[:-1] if p[-1].isnumeric() else p for p in phonemes]
            clean_dict[word] = phonemes

    return clean_dict

# Words to Phonemes Model

In [None]:
clean_dict = clean_dictionary()
print(len(clean_dict))

# take n training samples
def get_training_data(N):
  """Select up to ``N`` (word, phonemes) pairs from ``clean_dict``.

  With ``use_popularity`` set, words are taken in file order from
  ``popular_path`` (first tab-separated column, upper-cased) as long as they
  exist in ``clean_dict``. Otherwise a uniform random sample is drawn.

  If fewer than ``N`` words are available, a message is printed and the
  smaller sample is returned; this now also holds for the random branch, which
  previously raised ``ValueError`` from ``random.sample``.

  Returns:
      (keys view, values view) of the sampled dict, in matching order.
  """
  sample_dict = dict()
  if use_popularity:
    if N > 0:
      with open(popular_path) as popular_file:
        for line in popular_file:
          # strip() removes the newline when a line has no tab-separated column
          word = line.split("\t")[0].strip().upper()
          if word in clean_dict:
            sample_dict[word] = clean_dict[word]
            if len(sample_dict) >= N:
              break
  else:
    # Clamp so an oversized (or negative) request cannot make random.sample raise
    sample_size = max(0, min(N, len(clean_dict)))
    sample_dict = dict(random.sample(list(clean_dict.items()), sample_size))
  if len(sample_dict) < N:
    print("Only found " + str(len(sample_dict)) + " words")
  return sample_dict.keys(), sample_dict.values()

input_words, input_phonetics = get_training_data(N)

# returns a list of the alphabet characters
def get_alphabet(input_list):
  """Return the sorted list of distinct symbols found in the sequences of ``input_list``.

  Each element of ``input_list`` is iterated, so a string contributes its
  characters and a list of tokens contributes its tokens.
  """
  return sorted({symbol for sequence in input_list for symbol in sequence})

# Sorted vocabularies of input letters and output phoneme tokens
characters_alphabet = get_alphabet(input_words)
phonetics_alphabet = get_alphabet(input_phonetics)

num_characters, num_phonetics = len(characters_alphabet), len(phonetics_alphabet)

word_max_len = max(len(word) for word in input_words)
# Every phoneme list carries both START and END; the decoder input drops the
# last token and the decoder target drops the first, so one step fewer is needed.
phonetics_max_len = max(len(phon) for phon in input_phonetics) - 1

# symbol -> position in the corresponding alphabet (used for one-hot encoding)
characters_index = {char: i for i, char in enumerate(characters_alphabet)}
phonetics_index = {phon: i for i, phon in enumerate(phonetics_alphabet)}

116047


In [None]:
# One-hot training tensors, indexed [sample, time step, symbol].
# Steps beyond the end of a word / phoneme sequence stay all-zero.
encoder_input_tensor = np.zeros((len(input_words), word_max_len, num_characters), dtype='float32')
decoder_input_tensor = np.zeros((len(input_words), phonetics_max_len, num_phonetics), dtype='float32')
decoder_target_tensor = np.zeros((len(input_words), phonetics_max_len, num_phonetics), dtype='float32')

for i, (input_word, input_phonetic) in enumerate(zip(input_words, input_phonetics)):
  for t, char in enumerate(input_word):
    encoder_input_tensor[i, t, characters_index[char]] = 1.
  for t, phon in enumerate(input_phonetic):
    # Decoder input: every token except the last one
    if t < len(input_phonetic) - 1:
      decoder_input_tensor[i, t, phonetics_index[phon]] = 1.
    # Decoder target: every token except the first, shifted one step earlier,
    # so the target at step t-1 is the token the input shows at step t
    if t > 0:
      decoder_target_tensor[i, t-1, phonetics_index[phon]] = 1.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

# Number of units in both the encoder and the decoder LSTM
latent_dim = 256

# Encoder: reads the one-hot letter sequence; its per-step output is discarded
# and only the final hidden/cell states are kept.
encoder_inputs =Input(shape=(None, num_characters))
encoder = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and returns an output for every step;
# [0] selects the output sequence and drops the returned states.
decoder_inputs = Input(shape=(None, num_phonetics))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
# Softmax over the phoneme alphabet at every decoder step
decoder_dense = Dense(num_phonetics, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (letters, shifted phonemes) -> per-step phoneme distribution
seq2seq = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
def save_model(models):
  """Save the weights of each model in ``models`` under ``model_save_path``.

  The k-th model (0-based) is written to ``model_save_path + "weights-k"``;
  the directory is created first if it does not exist.
  """
  directory_missing = not os.path.exists(model_save_path)
  if directory_missing:
    os.makedirs(model_save_path)
  for position, model in enumerate(models):
    model.save_weights(f"{model_save_path}weights-{position}")

def save_history(history):
  """Write ``history.history`` as JSON to ``model_save_path + "history.json"``.

  Args:
      history: object exposing a JSON-serializable ``history`` attribute (the
          value returned by ``model.fit`` in this notebook).
  """
  import json
  # exist_ok avoids the check-then-create race of a separate os.path.exists test
  os.makedirs(model_save_path, exist_ok=True)
  # The context manager guarantees the file is flushed and closed; the previous
  # version handed an anonymous open() to json.dump and never closed it.
  with open(model_save_path + "history.json", "w") as history_file:
    json.dump(history.history, history_file)

def load_model(models):
  """Restore weights for each model in ``models`` from ``model_save_path``.

  The k-th model (0-based) reads ``model_save_path + "weights-k"``, mirroring
  the naming used by ``save_model``; the list order must therefore match.
  """
  for position, model in enumerate(models):
    model.load_weights(f"{model_save_path}weights-{position}")



In [None]:
seq2seq.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Inference-time encoder: letters -> [hidden state, cell state]
encoder_model = Model(encoder_inputs, encoder_states)

# Inference-time decoder: reuses the trained decoder_lstm / decoder_dense layers
# but takes its initial states as explicit inputs, so it can be run one step at
# a time. Note that decoder_outputs, state_h and state_c are rebound here.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Either train and persist weights + history, or restore previously saved
# weights. The same three-model order is used for saving and loading.
if not load_model_from_file:
  history = seq2seq.fit([encoder_input_tensor, decoder_input_tensor], decoder_target_tensor, batch_size=64, epochs=n_epochs, validation_split=0.2)
  save_model([seq2seq, encoder_model, decoder_model])
  save_history(history)
else:
  load_model([seq2seq, encoder_model, decoder_model])



# Words to Phonemes Prediction

In [None]:
def decode_sequence(input_word):
    """Greedily decode the phoneme sequence predicted for ``input_word``.

    The word is upper-cased, one-hot encoded and zero-padded to
    ``word_max_len`` (the same encoding used for the training inputs), run
    through ``encoder_model``, and then ``decoder_model`` is stepped one token
    at a time starting from ``start_seq``. Decoding stops when ``end_seq`` is
    predicted or when ``phonetics_max_len`` phonemes have been produced.

    The length limit counts phonemes. The previous version compared the
    character length of the space-joined output string against
    ``phonetics_max_len``, which cut long pronunciations short.

    Args:
        input_word: word to transcribe (any case).

    Returns:
        Space-separated phoneme string without the start/end markers; empty
        if the first predicted token is ``end_seq``.

    Raises:
        KeyError: if the word contains a character missing from ``characters_index``.
        IndexError: if the word is longer than ``word_max_len``.
    """
    input_word = input_word.upper()
    input_seq = np.zeros((1, word_max_len, num_characters))
    for t, char in enumerate(input_word):
        input_seq[0, t, characters_index[char]] = 1.

    states_value = encoder_model(input_seq)

    # Seed the decoder with the start marker
    target_seq = np.zeros((1, 1, num_phonetics))
    target_seq[0, 0, phonetics_index[start_seq]] = 1.

    decoded_phonemes = []
    while True:
        output_tokens, h, c = decoder_model([target_seq] + states_value)

        # Greedy choice: most probable token at the last (only) time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = phonetics_alphabet[sampled_token_index]

        # Exit condition: either find the stop token
        # or hit the maximum number of phonemes.
        if sampled_char == end_seq:
            break
        decoded_phonemes.append(sampled_char)
        if len(decoded_phonemes) >= phonetics_max_len:
            break

        # Feed the chosen token back in as the next decoder input (length 1).
        target_seq = np.zeros((1, 1, num_phonetics))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the recurrent state forward
        states_value = [h, c]

    return " ".join(decoded_phonemes)


In [None]:
# try on random words
random_tries = 10
# Sort first so the sampled population has a fixed order
sampled_words = random.sample(sorted(clean_dict.items()), random_tries)
for word, actual in sampled_words:
  print(word)
  res = decode_sequence(word)
  # actual[1:-1] drops the START / END markers around the reference phonemes
  reference = ' '.join(actual[1:-1])
  print(f"Prediction for {word} is '{res}', actual is '{reference}'")

VIDOR
Prediction for VIDOR is 'V IH D', actual is 'V IY D AO R'
ATTRACTIVENESS
Prediction for ATTRACTIVENESS is 'AH T R AE K T IH N', actual is 'AH T R AE K T IH V N AH S'
MENCKEN
Prediction for MENCKEN is 'M EH N K AH', actual is 'M EH NG K AH N'
CREVELING
Prediction for CREVELING is 'K R IH V AH L L', actual is 'K R EH V AH L IH NG'
HEDWIG
Prediction for HEDWIG is 'HH EH D IH', actual is 'HH EH D W IH G'
FORMULATE
Prediction for FORMULATE is 'F AO R M Y AH L L', actual is 'F AO R M Y AH L EY T'
WOLINSKI
Prediction for WOLINSKI is 'W UH L IH S S', actual is 'V AH L IH N S K IY'
ANGE
Prediction for ANGE is 'AE N', actual is 'EY N JH'
PROLEUKIN
Prediction for PROLEUKIN is 'P R OW L AH K AH', actual is 'P R OW L UW K IH N'
CHARMAINE
Prediction for CHARMAINE is 'CH AA R M AH', actual is 'SH AA R M EY N'


In [None]:
import csv
import json
import time

def save_to_file(path, output_dict):
  """Serialize ``output_dict`` to JSON and write it to ``path``, replacing any existing file.

  Serialization happens before the file is opened, so a value that cannot be
  encoded raises without touching the file on disk.
  """
  serialized = json.dumps(output_dict)

  with open(path, "w") as handle:
    handle.write(serialized)

def load_pseudowords():
  """Return the upper-cased first column of every non-empty row in ``pseudoword_path``.

  Returns:
      list of str, in file order.
  """
  # newline='' is the opening mode the csv module documents for reader input
  with open(pseudoword_path, newline='') as pseudoword_file:
    reader = csv.reader(pseudoword_file)
    # csv.reader yields [] for blank lines; indexing row[0] on those would raise
    return [row[0].upper() for row in reader if row]

def load_popular_words(limit=30000):
  """Return the first ``limit`` words of ``popular_path``, upper-cased.

  The word is the first tab-separated column of each line.

  Args:
      limit: maximum number of lines to read; ``None`` reads the whole file.
          Defaults to 30000, the value that used to be hard-coded.

  Returns:
      list of str, in file order.
  """
  output = []
  with open(popular_path) as popular_file:
    for line in popular_file:
      # Stop reading as soon as enough words are collected
      if limit is not None and len(output) >= limit:
        break
      # strip() removes the newline when a line has no tab-separated column
      output.append(line.split("\t")[0].strip().upper())
  return output

def run_predictions(output_path, words, limit=None):
  """Decode phonemes for ``words`` in random order and save them as JSON.

  Words outside ``[min_length, max_length]`` or containing characters matched
  by ``bad_chars`` are skipped. Results are checkpointed to ``output_path``
  every 500 decoded words and once more at the end, so the final partial batch
  is saved too (previously anything after the last multiple of 500 was lost).

  Args:
      output_path: JSON file to write ``{word: [phoneme, ...]}`` to.
      words: iterable of candidate words; it is not modified.
      limit: stop after this many decoded words (``None`` = no limit).

  Returns:
      dict mapping each decoded word to its phoneme list.
  """
  # Shuffle a private copy instead of reordering the caller's list
  words = list(words)
  random.shuffle(words)
  start_time = time.time()
  output_dict = {}
  word_count = 0
  for word in words:
    # Checked before decoding so limit=0 really produces nothing
    if limit is not None and word_count >= limit:
      break
    if len(word) > max_length or len(word) < min_length:
      continue
    if re.search(bad_chars, word):
      continue
    # split(' ') is kept so the stored format stays the same as before
    # (an empty decode is stored as [''])
    res = decode_sequence(word).split(' ')
    output_dict[word] = res
    word_count += 1
    if word_count % 500 == 0:
      elapsed_time = time.time() - start_time
      print(f"Done {word_count} in {elapsed_time} seconds")
      save_to_file(output_path, output_dict)
  # Persist whatever was decoded since the last checkpoint
  if word_count % 500 != 0:
    save_to_file(output_path, output_dict)
  return output_dict

# Run the word -> phoneme model over each data set named in prediction_todos.
# Each call writes its results to the corresponding *_output JSON path.
if "pseudoword" in prediction_todos:
  pseudowords = load_pseudowords()
  run_predictions(pseudoword_output, pseudowords)

# Dictionary words, capped at 13000 decoded entries
if "real" in prediction_todos:
  words = list(clean_dict.keys())
  run_predictions(dict_output, words, 13000)

# Popular words, capped at 13000 decoded entries
if "popular" in prediction_todos:
  popular_words = load_popular_words()
  run_predictions(popular_words_output, popular_words, 13000)


# Phonemes to Real/Fake Model

In [None]:
# options
# NOTE: this cell rebinds model_save_path and load_model_from_file, which were
# first set in the options cell at the top. Re-running earlier cells after this
# one will make save_model/load_model use the values below.

# Phoneme predictions for pseudowords (same file as pseudoword_output above)
pseudoword_input = '/content/drive/MyDrive/CS 229 Project/pseudoword-output.json'
# Phoneme predictions used as the "real word" class; this is the popular-word
# output file, not clean-dict-output.json
dict_input = '/content/drive/MyDrive/CS 229 Project/popular-word-output.json'
model_save_path = '/content/drive/MyDrive/CS 229 Project/models-rnn/model-save-2023-11-30-popular/'

# Fixed phoneme-sequence length fed to the classifier; shorter lists are
# right-padded with pad_str
padding_amount = 17
pad_str = "PAD"
# Fractions for the train / validation / test partitions, in that order
train_test_split = [0.8, 0.1, 0.1]

embedding_dim = 32
num_epochs = 10

load_model_from_file = False

In [None]:
def load_json(file):
  """Parse the JSON document stored at path ``file`` and return the resulting object."""
  with open(file) as handle:
    parsed = json.load(handle)
  return parsed

# NOTE: clean_dict is rebound here. From this point on it holds the decoded
# phoneme predictions loaded from dict_input, no longer the dictionary built by
# clean_dictionary(); earlier cells that read clean_dict will see this data if
# they are re-run afterwards.
clean_dict = load_json(dict_input)
pseudoword_dict = load_json(pseudoword_input)

print(len(clean_dict.items()))

def pad_phonemes(p):
  """Right-pad list ``p`` in place with ``pad_str`` until it has ``padding_amount`` items.

  Lists that are already that long (or longer) are left unchanged. Returns None.
  """
  missing = padding_amount - len(p)
  # A non-positive count multiplies to an empty list, so nothing is appended
  p.extend([pad_str] * missing)

def pad_dict(d):
  """Apply ``pad_phonemes`` in place to every value of dict ``d``."""
  for phoneme_list in d.values():
    pad_phonemes(phoneme_list)

pad_dict(clean_dict)
pad_dict(pseudoword_dict)

13000


In [None]:
# Vocabulary over both classes (padding token included, since the lists were
# padded above) and the token -> integer id lookup used by the classifier
phoneme_alphabet = get_alphabet([*clean_dict.values(), *pseudoword_dict.values()])
print(phoneme_alphabet)
phoneme_indices = {p: i for i, p in enumerate(phoneme_alphabet)}

['', 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'PAD', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']


In [None]:
# create input data

# shuffle order of words
def get_combined_words():
  """Return a shuffled, class-balanced list of label-tagged words.

  Every key of ``pseudoword_dict`` gets a trailing "0" and every key of
  ``clean_dict`` a trailing "1". Both classes are shuffled and the larger one
  is cut down to the size of the smaller before the two are merged and
  shuffled again.
  """
  # appends a 0 to pseudowords and a 1 to real words
  app_pseudo = [f"{word}0" for word in pseudoword_dict]
  app_real = [f"{word}1" for word in clean_dict]
  random.shuffle(app_pseudo)
  random.shuffle(app_real)

  # Keep the same number of examples from each class
  per_class = min(len(app_pseudo), len(app_real))
  combined = app_pseudo[:per_class] + app_real[:per_class]
  random.shuffle(combined)
  return combined

combined_words = get_combined_words()
print(combined_words[:30])
print(len(combined_words))

def phonemes_to_indices(phonemes):
  """Return a NumPy array holding the ``phoneme_indices`` id of each token in ``phonemes``."""
  ids = [phoneme_indices[token] for token in phonemes]
  return np.asarray(ids)

def create_training_data(words):
  """Build the (features, labels) arrays for a list of label-tagged words.

  Each entry is a word followed by "1" (real, looked up in ``clean_dict``) or
  another character (looked up in ``pseudoword_dict``). Row i of ``x`` holds
  the phoneme ids of entry i and ``y[i]`` its label as a number.

  Returns:
      x: array of shape (len(words), padding_amount)
      y: array of shape (len(words),)
  """
  x = np.zeros((len(words), padding_amount))
  y = np.zeros((len(words), ))
  for row, tagged in enumerate(words):
    word, label = tagged[:-1], tagged[-1]
    source = clean_dict if label == "1" else pseudoword_dict
    x[row, :] = phonemes_to_indices(source[word])
    y[row] = int(label)
  return x, y

def split_data(words):
  """Partition ``words`` into consecutive slices sized by ``train_test_split``.

  The slice boundaries are ``int(n * cumulative_fraction)``; each slice is
  converted with ``create_training_data``.

  Returns:
      (x_out, y_out): two lists with one array per entry of ``train_test_split``.
  """
  n = len(words)
  boundaries = [0]
  for upto in range(1, len(train_test_split) + 1):
    boundaries.append(int(n * sum(train_test_split[:upto])))

  x_out, y_out = [], []
  for start, stop in zip(boundaries, boundaries[1:]):
    x_data, y_data = create_training_data(words[start:stop])
    x_out.append(x_data)
    y_out.append(y_data)
  return x_out, y_out

[x_train, x_valid, x_test], [y_train, y_valid, y_test] = split_data(combined_words)

['REDIRECT1', 'RAZR1', 'CHASUMN0', 'BARONESS1', 'GEARED1', 'BUIRM0', 'INIQUALAP0', 'SCERMAGE0', 'MEDITOMERCIAL0', 'READER1', 'NINETEEN1', 'PORMARTH0', 'RAL1', 'NORMICANT0', 'RELAVION0', 'POTENTIAL1', 'CLIPPING1', 'WHINDAY0', 'HOLAIL0', 'OBVISM0', 'CLASK0', 'DECREST0', 'DECRODY0', 'EXPLORED1', 'FAVENTLY0', 'PERSTERFESE0', 'DRIVE1', 'TONGREASS0', 'CAPEAL0', 'SOTHEBY1']
26000


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Real-vs-fake classifier over phoneme id sequences of length padding_amount.
model = Sequential()

# Learned embedding_dim-sized vector per phoneme id (one id per alphabet entry)
model.add(Embedding(input_dim=len(phoneme_alphabet), output_dim=embedding_dim, input_length=padding_amount))
# Two stacked LSTMs: the first passes its full output sequence on, the second
# keeps only its final output
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=False))
# Single sigmoid unit; labels are 1 for real words and 0 for pseudowords
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 17, 32)            1280      
                                                                 
 lstm_8 (LSTM)               (None, 17, 64)            24832     
                                                                 
 lstm_9 (LSTM)               (None, 64)                33024     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 59201 (231.25 KB)
Trainable params: 59201 (231.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the classifier and persist history + weights, or restore saved weights.
# Both paths use the model_save_path set in the second options cell.
if not load_model_from_file:
  history = model.fit(x_train, y_train, epochs=num_epochs, validation_data=(x_valid, y_valid))
  save_history(history)
  save_model([model])
else:
  load_model([model])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc}")

Test Accuracy: 0.6361538171768188


# Combine models

In [None]:
# convert to phoneme then to real/fake prediction
def run_whole_pipeline(word):
  """Score ``word`` with the phoneme model followed by the real/fake classifier.

  The word is transcribed by ``decode_sequence``, the phoneme list is cut or
  padded to exactly ``padding_amount`` tokens, mapped to ids and passed to the
  current global ``model``.

  Args:
      word: the word to score.

  Returns:
      The classifier's sigmoid output as a Python float (training labels were
      1 for real words and 0 for pseudowords).

  Raises:
      KeyError: if a decoded token is missing from ``phoneme_indices``.
  """
  phonemes = decode_sequence(word).split(" ")
  # The input row holds exactly padding_amount ids; a longer transcription
  # would otherwise fail the row assignment below with a shape error.
  phonemes = phonemes[:padding_amount]
  pad_phonemes(phonemes)
  indices = np.zeros((1, padding_amount))
  indices[0, :] = phonemes_to_indices(phonemes)
  output = model(indices)
  # One sample in, one unit out: index the scalar explicitly instead of
  # converting a length-1 array with float()
  return float(output[0, 0])

In [None]:
run_whole_pipeline("fjkhdkfjhsdf")

0.35639292001724243

# Feeding text directly in
(instead of phonemes)

In [None]:
pad_char = "[" # Z + 1

def get_combined_words():
  """Return a shuffled, class-balanced list of label-tagged, upper-cased words.

  This definition replaces the earlier ``get_combined_words``. Keys whose
  length is outside ``[min_length, max_length]`` are dropped; pseudowords get
  a trailing "0" and real words a trailing "1".
  """
  def usable(word):
    return min_length <= len(word) <= max_length

  # appends a 0 to pseudowords and a 1 to real words
  app_pseudo = [word.upper() + "0" for word in pseudoword_dict if usable(word)]
  app_real = [word.upper() + "1" for word in clean_dict if usable(word)]
  random.shuffle(app_pseudo)
  random.shuffle(app_real)

  # Keep the same number of examples from each class
  per_class = min(len(app_pseudo), len(app_real))
  combined = app_pseudo[:per_class] + app_real[:per_class]
  random.shuffle(combined)
  return combined

combined_words = get_combined_words()

def letters_to_indices(word):
  """Return, for each character of ``word``, its code-point offset from "A" ("A" -> 0, "Z" -> 25, "[" -> 26)."""
  base = ord("A")
  return [ord(letter) - base for letter in word]

def pad_word(word):
  """Return ``word`` right-padded with ``pad_char`` to ``max_length`` characters.

  Words already at or above ``max_length`` are returned unchanged.
  """
  shortfall = max_length - len(word)
  if shortfall > 0:
    # pad_char is a single character, so one copy is needed per missing position
    word = word + pad_char * shortfall
  return word

def create_training_data(words):
  """Build the (features, labels) arrays for label-tagged words, using letters as features.

  This definition replaces the earlier phoneme-based ``create_training_data``.
  The last character of each entry is its label; the rest is padded with
  ``pad_word`` and converted with ``letters_to_indices``.

  Returns:
      x: array of shape (len(words), max_length)
      y: array of shape (len(words),)
  """
  x = np.zeros((len(words), max_length))
  y = np.zeros((len(words), ))
  for row, tagged in enumerate(words):
    text, label = tagged[:-1], tagged[-1]
    x[row, :] = letters_to_indices(pad_word(text))
    y[row] = int(label)
  return x, y

[x_train, x_valid, x_test], [y_train, y_valid, y_test] = split_data(combined_words)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Real-vs-fake classifier over letter id sequences of length max_length.
# NOTE: this rebinds the global `model`, so run_whole_pipeline (which feeds
# phoneme ids to `model`) no longer refers to the phoneme classifier after
# this cell has run.
model = Sequential()

# 27 ids: 0-25 for "A"-"Z" plus 26 for pad_char "[" (see letters_to_indices)
model.add(Embedding(input_dim=27, output_dim=embedding_dim, input_length=max_length))
# Two stacked LSTMs: the first passes its full output sequence on, the second
# keeps only its final output
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=False))
# Single sigmoid unit; labels are 1 for real words and 0 for pseudowords
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 15, 32)            864       
                                                                 
 lstm_10 (LSTM)              (None, 15, 64)            24832     
                                                                 
 lstm_11 (LSTM)              (None, 64)                33024     
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 58785 (229.63 KB)
Trainable params: 58785 (229.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
history = model.fit(x_train, y_train, epochs=num_epochs, validation_data=(x_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc}")

Test Accuracy: 0.7538461685180664
