In [1]:
# Target language code; passed to filter.Filter and ModelTrainerAndTester below.
language = 'ja'
# NOTE(review): the alias `filter` shadows Python's builtin filter() for the
# rest of the notebook.
import name_transliteration.filtering as filter
import name_transliteration.cleansing as cleanse
import name_transliteration.model_trainer_and_tester as model_trainer_and_tester

In [2]:
# Filter the raw data under ./data/ for the selected language. The resulting
# frame supplies both the training/testing data and the held-out scoring data.

my_filter = filter.Filter(language)
my_filter.filterData("./data/")

./data/stream-2021-03-07T03:05:07.831679.gz
./data/stream-2021-01-13T01:21:29.804195.gz
./data/stream-2021-01-13T04:07:46.253913.gz
./data/stream-2021-01-12T23:08:30.828340.gz
./data/stream-2021-01-13T03:38:20.383129.gz
./data/stream-2021-03-07T06:30:03.830030.gz
./data/stream-2021-01-13T04:37:35.200990.gz
./data/stream-2021-03-07T03:34:53.811604.gz
./data/stream-2021-04-17T07:23:41.809159.gz
./data/stream-2021-01-12T22:39:33.810384.gz
./data/stream-2021-03-06T03:25:42.946878.gz
./data/stream-2021-01-12T22:09:26.798946.gz
./data/stream-2021-03-07T04:06:04.938654.gz
./data/stream-2021-03-07T01:39:45.126113.gz
./data/stream-2021-03-07T08:30:55.833881.gz
./data/stream-2021-03-07T02:36:22.842559.gz
./data/stream-2021-03-06T01:33:50.975776.gz
./data/stream-2021-03-06T01:59:57.825571.gz
./data/stream-2021-03-07T07:50:03.791977.gz
./data/stream-2021-04-17T04:49:34.818794.gz
./data/stream-2021-03-06T00:38:21.058969.gz
./data/stream-2021-03-07T01:29:37.938029.gz
./data/stream-2021-01-13T00:02:2

In [3]:
# Split the filtered data into two halves: set A feeds the cleanser
# (train/test data), set B is held out and scored with the trained model later.
# The frame is fetched once instead of once per use, and each half is an
# explicit copy so that adding columns to it later is a plain assignment
# rather than a write into a slice (which raises SettingWithCopyWarning).
filtered_df = my_filter.getDataFrame()
midpoint = len(filtered_df) // 2
filtered_set_A = filtered_df.iloc[:midpoint].copy()
filtered_set_B = filtered_df.iloc[midpoint:].copy()

In [4]:
# Inspect the held-out half (set B) before any columns are added to it.
filtered_set_B

Unnamed: 0,username,screen_name,language
265297,telu_game,てる,ja
265298,kyoya000802,おっぱい好きょん,ja
265299,wing_10,うぃんぐ音楽好き,ja
265300,Pmin163T,キラ,ja
265301,PonPonz_uuuuu,ぽんず,ja
...,...,...,...
530590,Ohana0876,まあ,ja
530591,ichinosekopipe,まおいつ組コピペ,ja
530592,mjtrognmgm,あ,ja
530593,cat_It_s_noisy,霹,ja


In [5]:
# Instantiate the cleanser.
my_cleanser = cleanse.Cleanser()

# Split set A into train and test portions. The data is still uncleansed at
# this point; the cleansing itself happens in the two calls below.
my_cleanser.splitTrainTest(filtered_set_A)


# Cleanse the test datasets.
my_cleanser.createTestDataSets()
# Cleanse the training dataset.
# NOTE(review): edit_threshold=0.3 presumably corresponds to the
# "train_30_edit_distance" file name reported when saving -- confirm.
my_cleanser.createTrainDataSet(edit_threshold = 0.3)

In [6]:
# Write the cleansed train/test sets to disk; the cell output lists the file
# names and their row counts.
my_cleanser.saveTestAndTrain()

Saved cleansed names as: 
train_30_edit_distance_language_cleansed.txt 29606 number of rows. 
test1_cleansed.txt 174 number of rows. 
test2_cleansed.txt 234 number of rows. 
test3_cleansed.txt 481 number of rows. 



In [7]:
# Train a model for the selected language on the cleansed training file for
# 20 epochs; the run is given the model name 'test_model'.
trainer_and_tester = model_trainer_and_tester.ModelTrainerAndTester(language=language, epochs=20)
trainer_and_tester.runWholeTrainProcess(
    'train_30_edit_distance_language_cleansed.txt',
    'test_model',
)

Number of unique input tokens: 27
Number of unique output tokens: 2239
Max sequence length for inputs: 22
Max sequence length for outputs: 15
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


INFO:tensorflow:Assets written to: test_model/assets


In [8]:
# Evaluate the saved model; per the cell output this reports loss and accuracy
# on the test sets built with edit thresholds 0, 0.1 and 0.25.
trainer_and_tester.evaluateOnTestData("test_model")


evaluating on test set with 0 edit threshold...
test loss, test acc: [0.39670878648757935, 0.9226053357124329]
evaluating on test set with 0.1 edit threshold...
test loss, test acc: [0.5549078583717346, 0.8968660831451416]
evaluating on test set with 0.25 edit threshold...
test loss, test acc: [0.7256614565849304, 0.8715176582336426]


In [9]:
# Sanity check on a single romanised name; the result is a
# (transliteration, confidence) pair.
trainer_and_tester.predict("yuzukichi")

('ゆずきち\n', 0.9979564)

In [10]:
import re

def transformUserName(line):
    """Normalise a raw username into lower-case, space-separated words.

    Steps, applied in order:
      1. remove every run of digits,
      2. turn underscores into spaces,
      3. insert a space before a capital letter that directly follows a word
         character (so "PonPonz" becomes "Pon Ponz"),
      4. lower-case the result and trim surrounding whitespace.

    Note that step 3 matches any word character before the capital, not only
    lower-case letters, and matches do not overlap: "ABC" becomes "a bc".
    """
    without_digits = re.sub(r'\d+', '', line)
    with_spaces = without_digits.replace('_', ' ')
    split_on_capitals = re.sub(r"(\w)([A-Z])", r"\1 \2", with_spaces)
    return split_on_capitals.lower().strip()

In [11]:
import numpy as np
def predict(name:str):
    """Transliterate ``name`` with the trained model, refusing unusable input.

    The name is one-hot encoded into an array of shape
    ``(1, max_encoder_seq_length, num_encoder_tokens)``, the remaining
    timesteps are filled with the space token, and the array is handed to
    ``trainer_and_tester.decode_sequence``.

    Parameters
    ----------
    name : str
        The name to transliterate.

    Returns
    -------
    tuple
        Whatever ``decode_sequence`` returns for the encoded name, or
        ``("", 0)`` when the name cannot be handled: it is empty, longer than
        ``max_encoder_seq_length``, or contains a character that is not in
        ``input_token_index``.
    """
    # An empty name has nothing to transliterate. Previously it reached the
    # model with timestep 0 left all-zero, because padding started at index 1.
    if not name:
        return ("", 0)

    max_length = trainer_and_tester.max_encoder_seq_length
    # Names longer than the encoder's input window cannot be encoded.
    if len(name) > max_length:
        return ("", 0)

    token_index = trainer_and_tester.input_token_index
    one_hot_vector = np.zeros(
        (1, max_length, trainer_and_tester.num_encoder_tokens), dtype="float32"
    )
    for t, char in enumerate(name):
        # A character never seen during training has no one-hot slot.
        if char not in token_index:
            return ("", 0)
        one_hot_vector[0, t, token_index[char]] = 1.0

    # Every timestep after the last character is padded with the space token.
    one_hot_vector[0, len(name):, token_index[" "]] = 1.0
    return trainer_and_tester.decode_sequence(one_hot_vector[0:1])

In [12]:
def predict_confidence(name):
    """Return only the confidence component of ``predict(name)``.

    ``predict`` yields a two-element result; the first element is discarded
    and the second is returned.
    """
    return predict(name)[1]

In [13]:
# Add the normalised username as a new column. `assign` builds a new frame
# instead of writing into filtered_set_B in place, which avoids pandas'
# SettingWithCopyWarning when filtered_set_B is a slice of the filtered data,
# and keeps the cell safe to re-run.
filtered_set_B = filtered_set_B.assign(
    username_pretty=filtered_set_B["username"].apply(transformUserName)
)

In [None]:
# Score every normalised username with the model's confidence. `assign`
# builds a new frame instead of writing into filtered_set_B in place, which
# avoids pandas' SettingWithCopyWarning when filtered_set_B is a slice of the
# filtered data, and keeps the cell safe to re-run.
filtered_set_B = filtered_set_B.assign(
    conf=filtered_set_B["username_pretty"].apply(predict_confidence)
)

In [None]:
# Inspect set B again, now with the added columns.
filtered_set_B

In [None]:
# Persist the scored frame to disk in pandas' pickle format.
filtered_set_B.to_pickle("with_confidence_df_re_run")

In [None]:
# Show only the rows whose confidence is above 0.95.
high_confidence_mask = filtered_set_B['conf'] > 0.95
filtered_set_B.loc[high_confidence_mask]

In [None]:
# drop_list = []

# for index, row in filtered_set_B.iterrows():
#     clean_username = transformUserName(row["username"])
#     _, prob = trainer_and_tester.predict(clean_username)
#     print(prob)
#     if prob < 0.9:
#         drop_list.append(index)