this notebook is for plotting out graphs and exploring the data

In [1]:
import gc
from tensorflow.keras import backend as K

In [2]:
language = 'ja'
import name_transliteration.filtering as filter
import name_transliteration.cleansing as cleanse
import name_transliteration.model_trainer_and_tester as model_trainer_and_tester

In [3]:
# filter testing and training data
my_filter = filter.Filter(language)
my_filter.filterData("./data/",3)

./data/stream-2021-03-07T03:05:07.831679.gz
./data/stream-2021-01-13T01:21:29.804195.gz
./data/stream-2021-01-13T04:07:46.253913.gz


In [4]:
# instantiate the cleanser
my_cleanser = cleanse.Cleanser()

raw_names = my_filter.getDataFrame()

raw_names["username"] = raw_names["username"].apply(my_cleanser.transformUserName)
raw_names = raw_names[['username','screen_name']]
raw_names.to_csv('raw_names.txt', header=None, index=None, sep='\t', mode='w')

In [5]:

loaded_model = model_trainer_and_tester.ModelTrainerAndTester(
    language=language
)
loaded_model.loadDataParameters()
loaded_model.createDecoderEncoder('model_A')

In [6]:
encoder_input, decoder_input, decoder_output = loaded_model.processData('raw_names.txt')

In [7]:
len(encoder_input)

27357

In [None]:
prediction_list = []

# figure out how many runs it will take
num_runs = int(len(encoder_input) / 1000) + 1

for i in range(num_runs+1):
    # ummm it crashes when it gets to here
    if i == 21:
        break
    if i != 0:
        slice_range_start = (i-1)*1000
        slice_range_finish = i*1000
        prediction = loaded_model.model.predict(
            [encoder_input[slice_range_start:slice_range_finish], 
             decoder_input[slice_range_start:slice_range_finish]])
        K.clear_session()
        _ = gc.collect()
        prediction_list.extend(prediction)
        print("completed prediction iteration: " + str(i) + " of " + str(num_runs))
        


completed prediction iteration: 1 of 28
completed prediction iteration: 2 of 28
completed prediction iteration: 3 of 28
completed prediction iteration: 4 of 28
completed prediction iteration: 5 of 28
completed prediction iteration: 6 of 28
completed prediction iteration: 7 of 28
completed prediction iteration: 8 of 28
completed prediction iteration: 9 of 28
completed prediction iteration: 10 of 28
completed prediction iteration: 11 of 28
completed prediction iteration: 12 of 28
completed prediction iteration: 13 of 28
completed prediction iteration: 14 of 28
completed prediction iteration: 15 of 28
completed prediction iteration: 16 of 28
completed prediction iteration: 17 of 28
completed prediction iteration: 18 of 28
completed prediction iteration: 19 of 28
completed prediction iteration: 20 of 28
completed prediction iteration: 21 of 28


In [None]:
len(prediction_list)

In [None]:
import numpy as np
prob_list = []
for row in prediction_list:
    for time_step in row:
        highest_prob_idx = np.argmax(time_step[:])
        if loaded_model.reverse_target_char_index[highest_prob_idx] == '\n':
            prob = max(time_step[:])
            prob_list.append(prob)
            break

In [None]:
import matplotlib.pyplot as plt
plt.hist(np.array(prob_list), bins='auto')
plt.title('Probability distribution over un-cleansed data')
plt.show()

In [None]:
cleansed_data = my_cleanser.cleanseData(my_filter.getDataFrame(), edit_threshold=0.3)

In [None]:
cleansed_data = cleansed_data[['username','screen_name']]
cleansed_data.to_csv('cleansed_names.txt', header=None, index=None, sep='\t', mode='w')

In [None]:
encoder_input, decoder_input, decoder_output = loaded_model.processData('cleansed_names.txt')

In [None]:
prediction_list = []

# figure out how many runs it will take
num_runs = int(len(encoder_input) / 1000) + 1

for i in range(num_runs+1):
    if i != 0:
        slice_range_start = (i-1)*1000
        slice_range_finish = i*1000
        prediction = loaded_model.model.predict(
            [encoder_input[slice_range_start:slice_range_finish], 
             decoder_input[slice_range_start:slice_range_finish]])
        prediction_list.extend(prediction)
        print("completed prediction iteration: " + str(i) + " of " + str(num_runs))
prob_list = []
for row in prediction_list:
    for time_step in row:
        highest_prob_idx = np.argmax(time_step[:])
        if loaded_model.reverse_target_char_index[highest_prob_idx] == '\n':
            prob = max(time_step[:])
            prob_list.append(prob)
            break
plt.hist(np.array(prob_list), bins='auto')
plt.title('Probability distribution over cleansed data')
plt.show()