In [1]:
language = 'ja'
import name_transliteration.filtering as filter
import name_transliteration.cleansing as cleanse
import name_transliteration.model_trainer_and_tester as model_trainer_and_tester

In [2]:
# filter testing and training data
my_filter = filter.Filter(language)
my_filter.filterData("./data/",5)

# split filtered data into two sets
filtered_set_A = my_filter.getDataFrame().iloc[:int(len(my_filter.getDataFrame())/2)]
filtered_set_B = my_filter.getDataFrame().iloc[int(len(my_filter.getDataFrame())/2):]

./data/stream-2021-03-07T03:05:07.831679.gz
./data/stream-2021-01-13T01:21:29.804195.gz
./data/stream-2021-01-13T04:07:46.253913.gz
./data/stream-2021-01-12T23:08:30.828340.gz
./data/stream-2021-01-13T03:38:20.383129.gz


In [3]:
# instantiate the cleanser
my_cleanser = cleanse.Cleanser()

# perform pre-processing on set B and save
filtered_set_B["username"] = filtered_set_B["username"].apply(my_cleanser.transformUserName)
filtered_set_B = filtered_set_B[['username','screen_name']]
filtered_set_B.to_csv('filtered_set_B.txt', header=None, index=None, sep='\t', mode='w')

In [4]:

# the split data is uncleansed btw
my_cleanser.splitTrainTest(filtered_set_A)

# this does the cleansing of the test datasets
my_cleanser.createTestDataSets()
# this does the cleansing of the training dataset
my_cleanser.createTrainDataSet(edit_threshold = 0.3)
# save cleansed test and train files
my_cleanser.saveTestAndTrain()

Saved cleansed names as: 
train_30_edit_distance_language_cleansed.txt 2300 number of rows. 
test1_cleansed.txt 150 number of rows. 
test2_cleansed.txt 215 number of rows. 
test3_cleansed.txt 430 number of rows. 



In [5]:
# train model and save model 1
trainer_and_tester = model_trainer_and_tester.ModelTrainerAndTester(
    language=language, 
    epochs=20
)
trainer_and_tester.determineDimensions(['train_30_edit_distance_language_cleansed.txt', 'test1_cleansed.txt', 'test2_cleansed.txt', 'test3_cleansed.txt', 'filtered_set_B.txt'])
train_encode_input, train_decode_input, train_decode_output = trainer_and_tester.processData('train_10_edit_distance_language_cleansed.txt')
trainer_and_tester.buildModel()
trainer_and_tester.trainModel(language + '_model_'+str(20), train_encode_input, train_decode_input, train_decode_output)
trainer_and_tester.createDecoderEncoder(language + '_model_'+str(20))

Number of unique input tokens: 27
Number of unique output tokens: 2972
Max sequence length for inputs: 22
Max sequence length for outputs: 51
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


INFO:tensorflow:Assets written to: ja_model_20/assets


In [6]:
trainer_and_tester.evaluateOnTestData(language + '_model_'+str(20))


evaluating on test set with 0 edit threshold...
test loss, test acc: [0.3798932731151581, 0.9375163316726685]
evaluating on test set with 0.1 edit threshold...
test loss, test acc: [0.43153053522109985, 0.9306885600090027]
evaluating on test set with 0.25 edit threshold...
test loss, test acc: [0.44968193769454956, 0.928545355796814]


In [7]:
# encoder_input_data, decoder_input_data, _ = trainer_and_tester.processData('filtered_set_B.txt')

In [8]:
def predict_confidence(name, model_class):
    _, conf = model_class.predict(name)
    return conf

In [9]:
filtered_set_B

Unnamed: 0,username,screen_name
25831,oharafumi,ふみ
25832,gy w,半片
25833,d op er,リイ
25834,azumi s goods,アズミ取引垢
25835,andy,あんじー
...,...,...
51658,sir,さー
51659,jimoto love,やっぱり地元が一番
51660,erusyadai lucife,ルシフェル
51661,japan lawson,ローソン公式返信用アカウント


In [10]:
# filtered_set_B["conf"] = filtered_set_B["username"].apply(predict_confidence)
# this didn't have the exact same result as the other notebook
# filtered_set_B.to_pickle("with_confidence_df")

In [11]:
import pandas as pd
filtered_set_B = pd.read_pickle("with_confidence_df")

In [12]:
cleansed_set_B_95 = filtered_set_B.loc[filtered_set_B['conf'] > 0.95]

In [13]:
cleansed_set_B_95_just_names = cleansed_set_B_95[['username_pretty','screen_name']]

In [14]:
cleansed_set_B_95_just_names.to_csv('model_cleansed_names.txt', header=None, index=None, sep='\t', mode='w')

In [15]:
# train model 2 and save model 2
trainer_and_tester2 = model_trainer_and_tester.ModelTrainerAndTester(
    language=language, 
    epochs=20
)
# trainer_and_tester2.determineDimensions(['train_10_edit_distance_language_cleansed.txt', 'test1_cleansed.txt', 'test2_cleansed.txt', 'test3_cleansed.txt', 'filtered_set_B.txt','model_cleansed_names.txt'])
# train_encode_input, train_decode_input, train_decode_output = trainer_and_tester2.processData('model_cleansed_names.txt')
# trainer_and_tester2.buildModel()
# trainer_and_tester2.trainModel(language + '_model_'+str(20)+'_v2', train_encode_input, train_decode_input, train_decode_output)
# trainer_and_tester2.createDecoderEncoder(language + '_model_'+str(20)+'_v2')

trainer_and_tester2.runWholeTrainProcess('model_cleansed_names.txt', 'ja_model_20_v2')

Number of unique input tokens: 27
Number of unique output tokens: 937
Max sequence length for inputs: 19
Max sequence length for outputs: 43
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


INFO:tensorflow:Assets written to: ja_model_20_v2/assets


In [16]:
trainer_and_tester2.evaluateOnTestData(language + '_model_'+str(20)+'_v2')


evaluating on test set with 0 edit threshold...
test loss, test acc: [0.4885435402393341, 0.9235658645629883]
evaluating on test set with 0.1 edit threshold...
test loss, test acc: [0.5423395037651062, 0.91595458984375]
evaluating on test set with 0.25 edit threshold...
test loss, test acc: [0.5509322285652161, 0.9147106409072876]


In [17]:
print(trainer_and_tester.predict('reiwatomo'))
print(trainer_and_tester2.predict('reiwatomo'))

('あくち\n', 0.22617334)
('お\n', 0.0751004)


In [18]:
training_set_A = my_cleanser.training_dataframe

In [22]:
training_set_A

Unnamed: 0,index,username,screen_name,language
0,10,amaimono no,甘いもの,ja
1,33,animejikkyobot,アニメ実況,ja
2,42,tsudayan,つだやん,ja
3,47,ayu ko,鮎子,ja
4,63,mashiroharu,ましろはる,ja
...,...,...,...,...
2295,25766,future,ふつれ,ja
2296,25786,ri o,りお,ja
2297,25791,future,ふつれ,ja
2298,25804,paprikasan,パプリカ,ja


In [24]:
trainer_and_tester2.predict('zigo aku')

('お\n', 0.07910247)

In [21]:
training_set_A["conf"] = training_set_A["username"].apply(predict_confidence, args=(trainer_and_tester2,))

KeyboardInterrupt: 

In [29]:
%%time
for i in range(10):
    trainer_and_tester2.predict('zigo aku')

CPU times: user 1.07 s, sys: 112 ms, total: 1.18 s
Wall time: 1.22 s


In [34]:
%%time
for i in range(20):
    trainer_and_tester2.predict('zigo aku')

CPU times: user 2.08 s, sys: 90.9 ms, total: 2.17 s
Wall time: 2.14 s


In [33]:
%%time
for i in range(50):
    trainer_and_tester2.predict('zigo aku')

CPU times: user 5.1 s, sys: 248 ms, total: 5.35 s
Wall time: 5.26 s


In [35]:
%%time
for i in range(100):
    trainer_and_tester2.predict('zigo aku')

CPU times: user 10.1 s, sys: 377 ms, total: 10.5 s
Wall time: 10.2 s


In [36]:
265298/10/60/60

7.3693888888888885

In [45]:
test_encoder_input, test_decoder_input, test_decoder_output = trainer_and_tester.processData('small_names.txt')

In [46]:
test_encoder_input

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [48]:
prediction = trainer_and_tester.model.predict([test_encoder_input, test_decoder_input])

In [68]:
trainer_and_tester.model

<tensorflow.python.keras.engine.functional.Functional at 0x182e7bf40>

In [101]:
trainer_and_tester.predict("oharafumi")

('あくち\n', 0.16680214)

In [111]:
trainer_and_tester.reverse_target_char_index[0]

' '

In [113]:
prob_list = []
for i in range(len(prediction)):
    char_idx = np.argmax(prediction[i,0,:])
    print(char_idx)
#     j = 1
#     while char_idx != 0:
#         char_idx = np.argmax(prediction[i,j,:])
#         print(char_idx)
    

42
42
42
42
42
42
42


In [79]:
prediction[0,5,0]

0.9899092

In [64]:
prediction

array([[[8.3742070e-06, 1.4098549e-06, 6.5861754e-03, ...,
         1.3929487e-06, 1.3458267e-06, 1.3111600e-06],
        [3.7974369e-04, 8.7453395e-07, 1.7892161e-02, ...,
         9.0219305e-07, 9.1668733e-07, 8.6946778e-07],
        [2.0483108e-03, 5.9531010e-07, 1.1048008e-02, ...,
         6.6449655e-07, 6.2310545e-07, 5.8859285e-07],
        ...,
        [9.9999952e-01, 3.4604161e-13, 2.7603728e-10, ...,
         3.6542796e-13, 3.3970927e-13, 3.3975074e-13],
        [9.9999952e-01, 3.4598550e-13, 2.7601357e-10, ...,
         3.6530045e-13, 3.3962506e-13, 3.3969501e-13],
        [9.9999952e-01, 3.4592481e-13, 2.7598673e-10, ...,
         3.6517507e-13, 3.3954084e-13, 3.3963864e-13]],

       [[2.5005045e-03, 2.6287576e-06, 3.9306772e-03, ...,
         2.6972739e-06, 2.8987240e-06, 2.6490445e-06],
        [2.5983499e-02, 9.0083904e-07, 4.4184611e-03, ...,
         7.9166932e-07, 9.6195356e-07, 8.6236685e-07],
        [1.4645259e-01, 3.5979941e-08, 3.9797407e-04, ...,
         3.750

In [69]:
trainer_and_tester.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 2972)] 0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 290816      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  3306496     input_2[0][0]                    
                                                                 lstm[0][1]                   