In [1]:
# Language code of the names to transliterate. It is passed to Filter and
# ModelTrainerAndTester below and is used as the prefix of saved model names.
language = 'ja'
# NOTE(review): the alias `filter` shadows Python's built-in filter() for the
# rest of the notebook; later cells rely on this alias (filter.Filter).
import name_transliteration.filtering as filter
import name_transliteration.cleansing as cleanse
import name_transliteration.model_trainer_and_tester as model_trainer_and_tester

In [2]:
# Filter the raw stream dumps under ./data/ for the configured language.
# NOTE(review): the second argument (5) matches the number of files listed in
# this cell's output, so it is presumably a file limit -- confirm in Filter.
my_filter = filter.Filter(language)
my_filter.filterData("./data/", 5)

# Split the filtered data into two halves: set A (first half) feeds the
# train/test split, set B (second half) is pre-processed and saved separately.
# The frame is fetched once, and each half is an explicit copy so that a later
# column assignment on a half does not write into a slice of the original
# frame (the situation pandas reports as SettingWithCopyWarning).
filtered_df = my_filter.getDataFrame()
midpoint = len(filtered_df) // 2
filtered_set_A = filtered_df.iloc[:midpoint].copy()
filtered_set_B = filtered_df.iloc[midpoint:].copy()

./data/stream-2021-03-07T03:05:07.831679.gz
./data/stream-2021-01-13T01:21:29.804195.gz
./data/stream-2021-01-13T04:07:46.253913.gz
./data/stream-2021-01-12T23:08:30.828340.gz
./data/stream-2021-01-13T03:38:20.383129.gz


In [3]:
# Instantiate the cleanser; the same instance is reused below for the
# train/test split of set A.
my_cleanser = cleanse.Cleanser()

# Pre-process set B and save it as a headerless, index-free TSV.
# `assign` builds a new frame rather than writing a column into
# `filtered_set_B`, which is an `.iloc` slice of the filtered data; assigning
# into such a slice triggers SettingWithCopyWarning and is not guaranteed to
# behave as intended.
filtered_set_B = filtered_set_B.assign(
    username=filtered_set_B["username"].apply(my_cleanser.transformUserName)
)[['username', 'screen_name']]
# `header` and `index` are documented as booleans, so pass False explicitly.
filtered_set_B.to_csv('filtered_set_B.txt', header=False, index=False, sep='\t', mode='w')

In [4]:

# Split set A into train and test portions. Per the original author's note,
# the data is still uncleansed at this point.
my_cleanser.splitTrainTest(filtered_set_A)

# Cleanse the test datasets.
my_cleanser.createTestDataSets()

# Cleanse the training dataset, using an edit threshold of 0.3.
my_cleanser.createTrainDataSet(edit_threshold=0.3)

# Persist the cleansed test and train files.
my_cleanser.saveTestAndTrain()

Saved cleansed names as: 
train_10_edit_distance_language_cleansed.txt 909 number of rows. 
test1_cleansed.txt 150 number of rows. 
test2_cleansed.txt 215 number of rows. 
test3_cleansed.txt 430 number of rows. 



In [5]:
# Train and save model 1.
# The epoch count, the saved-model name and the training file are each named
# once, so the trainer configuration and the names handed to
# trainModel/createDecoderEncoder cannot drift apart.
model_1_epochs = 20
model_1_name = language + '_model_' + str(model_1_epochs)
model_1_train_file = 'train_10_edit_distance_language_cleansed.txt'

trainer_and_tester = model_trainer_and_tester.ModelTrainerAndTester(
    language=language,
    epochs=model_1_epochs,
)
# Dimensions are determined over the training file, the three test files and
# the saved set B file.
trainer_and_tester.determineDimensions([
    model_1_train_file,
    'test1_cleansed.txt',
    'test2_cleansed.txt',
    'test3_cleansed.txt',
    'filtered_set_B.txt',
])
train_encode_input, train_decode_input, train_decode_output = trainer_and_tester.processData(model_1_train_file)
trainer_and_tester.buildModel()
trainer_and_tester.trainModel(model_1_name, train_encode_input, train_decode_input, train_decode_output)
trainer_and_tester.createDecoderEncoder(model_1_name)

Number of unique input tokens: 27
Number of unique output tokens: 2972
Max sequence length for inputs: 22
Max sequence length for outputs: 51
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


INFO:tensorflow:Assets written to: ja_model_20/assets


In [6]:
# Evaluate model 1 on the test sets; the name matches the model saved by the
# training cell (<language>_model_20).
trainer_and_tester.evaluateOnTestData(f"{language}_model_20")


evaluating on test set with 0 edit threshold...
test loss, test acc: [0.37864211201667786, 0.937385618686676]
evaluating on test set with 0.1 edit threshold...
test loss, test acc: [0.42912402749061584, 0.9306885600090027]
evaluating on test set with 0.25 edit threshold...
test loss, test acc: [0.4492156207561493, 0.9278613924980164]


In [6]:
# encoder_input_data, decoder_input_data, _ = trainer_and_tester.processData('filtered_set_B.txt')

In [47]:
def predict_confidence(name, model_class):
    """Return only the confidence part of a model's prediction for ``name``.

    ``model_class`` is any object whose ``predict(name)`` returns a
    two-element ``(prediction, confidence)`` pair. The prediction itself is
    discarded; a result that is not exactly two elements long raises
    ``ValueError`` from the unpacking.
    """
    prediction_and_confidence = model_class.predict(name)
    _prediction, confidence = prediction_and_confidence
    return confidence

In [17]:
# Inspect set B after pre-processing (cleansed username plus screen_name).
filtered_set_B

Unnamed: 0,username,screen_name
25831,oharafumi,ふみ
25832,gy w,半片
25833,d op er,リイ
25834,azumi s goods,アズミ取引垢
25835,andy,あんじー
...,...,...
51658,sir,さー
51659,jimoto love,やっぱり地元が一番
51660,erusyadai lucife,ルシフェル
51661,japan lawson,ローソン公式返信用アカウント


In [32]:
# filtered_set_B["conf"] = filtered_set_B["username"].apply(predict_confidence)
# this didn't have the exact same result as the other notebook
# filtered_set_B.to_pickle("with_confidence_df")

In [7]:
# NOTE(review): this replaces the in-memory set B with a frame loaded from
# disk. The only code in this notebook that writes "with_confidence_df" is the
# commented-out to_pickle call above, so the file must already exist --
# presumably produced by another notebook; confirm its provenance.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
import pandas as pd
filtered_set_B = pd.read_pickle("with_confidence_df")

In [8]:
# Keep only the rows of set B whose model confidence exceeds 0.95.
is_high_confidence = filtered_set_B['conf'] > 0.95
cleansed_set_B_95 = filtered_set_B.loc[is_high_confidence]

In [10]:
# Reduce the high-confidence rows to the two name columns that get exported.
name_columns = ['username_pretty', 'screen_name']
cleansed_set_B_95_just_names = cleansed_set_B_95[name_columns]

In [11]:
# Export the high-confidence name pairs as a headerless, index-free TSV; this
# file is the training input for model 2.
cleansed_set_B_95_just_names.to_csv(
    'model_cleansed_names.txt',
    sep='\t',
    header=None,
    index=None,
    mode='w',
)

In [15]:
# Train and save model 2 on the model-cleansed names.
# The saved-model name is derived from `language` and the epoch count instead
# of being hard-coded, so it always agrees with the name the evaluation cell
# builds (language + '_model_' + str(20) + '_v2').
model_2_epochs = 20
model_2_name = language + '_model_' + str(model_2_epochs) + '_v2'

trainer_and_tester2 = model_trainer_and_tester.ModelTrainerAndTester(
    language=language,
    epochs=model_2_epochs,
)

# Earlier step-by-step version, kept for reference. NOTE(review): presumably
# superseded by runWholeTrainProcess below -- confirm the two are equivalent,
# in particular which files are used to determine the dimensions.
# trainer_and_tester2.determineDimensions(['train_10_edit_distance_language_cleansed.txt', 'test1_cleansed.txt', 'test2_cleansed.txt', 'test3_cleansed.txt', 'filtered_set_B.txt','model_cleansed_names.txt'])
# train_encode_input, train_decode_input, train_decode_output = trainer_and_tester2.processData('model_cleansed_names.txt')
# trainer_and_tester2.buildModel()
# trainer_and_tester2.trainModel(language + '_model_'+str(20)+'_v2', train_encode_input, train_decode_input, train_decode_output)
# trainer_and_tester2.createDecoderEncoder(language + '_model_'+str(20)+'_v2')

trainer_and_tester2.runWholeTrainProcess('model_cleansed_names.txt', model_2_name)

Number of unique input tokens: 27
Number of unique output tokens: 937
Max sequence length for inputs: 19
Max sequence length for outputs: 43
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


INFO:tensorflow:Assets written to: ja_model_20_v2/assets


In [17]:
# Evaluate model 2 on the test sets, using the saved-model name
# <language>_model_20_v2.
trainer_and_tester2.evaluateOnTestData(f"{language}_model_20_v2")


evaluating on test set with 0 edit threshold...
test loss, test acc: [0.4865431785583496, 0.924341082572937]
evaluating on test set with 0.1 edit threshold...
test loss, test acc: [0.5409690141677856, 0.9169280529022217]
evaluating on test set with 0.25 edit threshold...
test loss, test acc: [0.5494069457054138, 0.9152514934539795]


In [18]:
# Print both models' (prediction, confidence) result for the same romanised
# name, model 1 first and model 2 second.
for model in (trainer_and_tester, trainer_and_tester2):
    print(model.predict('reiwatomo'))

('あくち\n', 0.16104132)
('り\n', 0.075644255)


In [45]:
# Work on a copy of the cleanser's training frame: a later cell adds a "conf"
# column to training_set_A, and without the copy that assignment would also
# modify my_cleanser.training_dataframe.
training_set_A = my_cleanser.training_dataframe.copy()

Unnamed: 0,index,username,screen_name,language
0,42,tsudayan,つだやん,ja
1,63,mashiroharu,ましろはる,ja
2,67,mana,まな,ja
3,123,uisawashigure,憂沢時雨,ja
4,125,nameneko,なめ猫,ja
...,...,...,...,...
904,25670,airakukaria,哀楽狩阿,ja
905,25719,poroporo,ポロポロ,ja
906,25724,hajikko,はじっこ,ja
907,25736,areumi,あれうみ,ja


In [19]:
# Spot-check model 2 on a single romanised name; the cell displays the
# (prediction, confidence) tuple returned by predict.
trainer_and_tester2.predict('yanpa')

('り\n', 0.073119655)

In [None]:
# Score every training username with model 2 and store the confidence.
# `args` must be a tuple of extra positional arguments for the applied
# function; the trailing comma is what makes this a one-element tuple
# (plain parentheses would pass the trainer object itself).
training_set_A["conf"] = training_set_A["username"].apply(predict_confidence, args=(trainer_and_tester2,))

In [13]:
# Debug inspection: displays the repr of the model 2 trainer object.
trainer_and_tester2

<name_transliteration.model_trainer_and_tester.ModelTrainerAndTester at 0x16e2e4880>

In [14]:
# Debug inspection: displays the repr of the model 1 trainer object.
trainer_and_tester

<name_transliteration.model_trainer_and_tester.ModelTrainerAndTester at 0x16ec401c0>