In this notebook I am trying to obtain the probability that the model assigns to each of its predictions.

In [1]:
# Language code of the names to process; passed to the filters and the trainer below.
language = 'ja'
# NOTE(review): the alias `filter` shadows Python's built-in filter() for the
# rest of the notebook; later cells rely on this alias.
import name_transliteration.filtering as filter
import name_transliteration.cleansing as cleanse
import name_transliteration.model_trainer_and_tester as model_trainer_and_tester

In [2]:
# Filter the raw stream files under ./data/ for the chosen language. The
# resulting frame is later split into the training and testing data.
# The output below lists the path of each stream file.
# NOTE(review): the filtering criteria live in name_transliteration.filtering and are not visible here.

my_filter = filter.Filter(language)
my_filter.filterData("./data/")

./data/stream-2021-03-07T03:05:07.831679.gz
./data/stream-2021-01-13T01:21:29.804195.gz
./data/stream-2021-01-13T04:07:46.253913.gz
./data/stream-2021-01-12T23:08:30.828340.gz
./data/stream-2021-01-13T03:38:20.383129.gz
./data/stream-2021-03-07T06:30:03.830030.gz
./data/stream-2021-01-13T04:37:35.200990.gz
./data/stream-2021-03-07T03:34:53.811604.gz
./data/stream-2021-04-17T07:23:41.809159.gz
./data/stream-2021-01-12T22:39:33.810384.gz
./data/stream-2021-03-06T03:25:42.946878.gz
./data/stream-2021-01-12T22:09:26.798946.gz
./data/stream-2021-03-07T04:06:04.938654.gz
./data/stream-2021-03-07T01:39:45.126113.gz
./data/stream-2021-03-07T08:30:55.833881.gz
./data/stream-2021-03-07T02:36:22.842559.gz
./data/stream-2021-03-06T01:33:50.975776.gz
./data/stream-2021-03-06T01:59:57.825571.gz
./data/stream-2021-03-07T07:50:03.791977.gz
./data/stream-2021-04-17T04:49:34.818794.gz
./data/stream-2021-03-06T00:38:21.058969.gz
./data/stream-2021-03-07T01:29:37.938029.gz
./data/stream-2021-01-13T00:02:2

In [3]:
# instantiate the cleanser
my_cleanser = cleanse.Cleanser()
# split the filtered frame into train and test portions; note that the
# split data has not been cleansed at this point
my_cleanser.splitTrainTest(my_filter.getDataFrame())
# this does the cleansing of the test datasets
my_cleanser.createTestDataSets()
# this does the cleansing of the training dataset
# NOTE(review): edit_threshold = 0.3 presumably corresponds to the "30" in the
# saved file name train_30_edit_distance_language_cleansed.txt — confirm
my_cleanser.createTrainDataSet(edit_threshold = 0.3)

In [4]:
# Save the cleansed train and test sets; the output below lists each file name
# together with its number of rows.
my_cleanser.saveTestAndTrain()

Saved cleansed names as: 
train_30_edit_distance_language_cleansed.txt 59355 number of rows. 
test1_cleansed.txt 159 number of rows. 
test2_cleansed.txt 218 number of rows. 
test3_cleansed.txt 499 number of rows. 



In [5]:
# Configure a trainer/tester for the selected language with 20 training epochs.
trainer_and_tester = model_trainer_and_tester.ModelTrainerAndTester(
    language=language, 
    epochs=20
)
# Run the whole training process on the named training file.
# NOTE(review): the save step above reported writing
# train_30_edit_distance_language_cleansed.txt, but this call reads
# train_0_edit_distance_language_cleansed.txt. If that file is a leftover from
# an earlier run, a fresh Restart & Run All will not reproduce this result —
# confirm which training file is intended.
trainer_and_tester.runWholeTrainProcess('train_0_edit_distance_language_cleansed.txt')

Number of unique input tokens: 27
Number of unique output tokens: 1516
Max sequence length for inputs: 17
Max sequence length for outputs: 13
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


INFO:tensorflow:Assets written to: ja_model_20/assets


In [6]:
# Evaluate the model saved under "ja_model_20" (the directory named in the
# training output); the output reports test loss and accuracy for the
# 0, 0.1 and 0.25 edit-threshold test sets.
trainer_and_tester.evaluateOnTestData("ja_model_20")


evaluating on test set with 0 edit threshold...
test loss, test acc: [0.4672572910785675, 0.910982072353363]
evaluating on test set with 0.1 edit threshold...
test loss, test acc: [0.7295599579811096, 0.8757939338684082]
evaluating on test set with 0.25 edit threshold...
test loss, test acc: [1.1537402868270874, 0.8282719254493713]


In [7]:
# Spot-check a single romanized name; as the output shows, predict returns a
# (predicted text, probability) pair.
trainer_and_tester.predict("yuzukichi")

('ゆずきち\n', 0.99999344)

In [8]:
# NOTE(review): presumably persists statistics from the training run; the
# implementation is not visible here — confirm what is written and where.
trainer_and_tester.saveTrainingStats()

In [9]:
# create a second filter to get the japanese data again
# NOTE(review): the second argument (5) presumably caps the number of stream
# files read — the output lists exactly five paths; confirm against filterData

my_filter2 = filter.Filter(language)
my_filter2.filterData("./data/",5)

./data/stream-2021-03-07T03:05:07.831679.gz
./data/stream-2021-01-13T01:21:29.804195.gz
./data/stream-2021-01-13T04:07:46.253913.gz
./data/stream-2021-01-12T23:08:30.828340.gz
./data/stream-2021-01-13T03:38:20.383129.gz


In [17]:
import re

def transformUserName(line):
    """Normalize a raw username into lower-case, space-separated words.

    Steps, in order:
      1. remove every run of digits,
      2. replace each underscore with a space,
      3. insert a space at each lower-case -> upper-case boundary (camelCase),
      4. lower-case the result and strip leading/trailing whitespace.

    Consecutive underscores produce consecutive spaces; they are not collapsed.

    Args:
        line: the raw username string.

    Returns:
        The normalized string. It is empty when the username consists only of
        digits and/or underscores.
    """
    # strip numbers
    text = re.sub(r'\d+', '', line)
    # underscores to spaces
    text = re.sub(r'_', ' ', text)
    # add a space between a lower-case letter and the upper-case letter that
    # follows it. Matching only [a-z] on the left (instead of \w) keeps
    # all-caps runs intact: "JAPAN" stays "JAPAN" rather than becoming "J AP AN".
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    return text.lower().strip()

In [20]:
# Collect the model's probability for every username that can be scored.
probability_list = []

# Read the length limit from the trainer instead of hard-coding 17, so the
# check stays in step with whatever model was trained.
max_input_length = trainer_and_tester.max_encoder_seq_length

# Only the "username" column is needed, so iterate over it directly rather
# than through DataFrame.iterrows(), which builds a full Series for every row.
for username in my_filter2.getDataFrame()["username"]:
    clean_username = transformUserName(username)
    # Skip names that are empty after cleansing (e.g. digits only): there is
    # nothing to transliterate, so a probability for them would be meaningless.
    if clean_username and len(clean_username) < max_input_length:
        _, prob = trainer_and_tester.predict(clean_username)
        probability_list.append(prob)

KeyboardInterrupt: 

In [15]:
# Inspect the second filter's frame; the displayed columns are username,
# screen_name and language.
my_filter2.getDataFrame()

Unnamed: 0,username,screen_name,language
0,mi_ano_,ゆ,ja
1,bede908,在りし日のビート,ja
2,ShiN_Lionhart2,ｼﾝこれでも実況者放送日未定,ja
3,shinzan_yaro,職人系ゲーマーしんさんみんなの配偶者,ja
4,yuuuuka08,優華おおきくなぁれ,ja
...,...,...,...
51658,s2i1r2,さー,ja
51659,jimoto__love,やっぱり地元が一番,ja
51660,erusyadaiLucife,ルシフェル,ja
51661,japan_lawson,ローソン公式返信用アカウント,ja


In [21]:
# Display the probabilities collected by the prediction loop (one entry per
# username that was passed to predict).
probability_list

[0.9999958,
 0.9998889,
 0.9892664,
 0.9017037,
 0.45914155,
 0.9998821,
 0.98082525,
 0.9967796,
 0.7935591,
 0.9985879,
 0.7883587,
 0.6823935,
 0.89823395,
 0.9703076,
 0.9984864,
 0.9998609,
 0.99928963,
 0.9849045,
 0.97660214,
 0.78389764,
 0.9991955,
 0.46487087,
 0.84513426,
 0.9957283,
 0.94750154,
 0.9999976,
 0.11456417,
 0.9998779,
 0.9838855,
 0.9352533,
 0.9797563,
 0.3183916,
 0.95756245,
 0.99958235,
 0.99287534,
 0.99999857,
 0.5092954,
 0.97196645,
 0.98719877,
 0.99999535,
 0.44574982,
 0.83502835,
 0.8861339,
 0.9999311,
 1.0,
 0.9872479,
 0.99831367,
 0.9999255,
 0.99999964,
 0.7353923,
 0.9999851,
 0.9981346,
 0.8611401,
 0.9999988,
 0.99890137,
 0.99807584,
 0.99986696,
 0.9971277,
 0.9994116,
 0.8164018,
 0.99996996,
 0.675875,
 0.9923781,
 0.9973028,
 0.06197386,
 0.8627356,
 0.99167717,
 0.9981343,
 0.99880755,
 0.9979013,
 0.40939674,
 0.25753796,
 0.9745238,
 0.15866017,
 0.402665,
 0.57305837,
 0.2769329,
 0.94444686,
 0.9996867,
 0.9988305,
 0.9961934,
 0.

In [22]:
# Check the trainer's maximum encoder (input) sequence length; the training
# log reported "Max sequence length for inputs: 17".
trainer_and_tester.max_encoder_seq_length

17