In [None]:
%load_ext autoreload
%autoreload 2
from fastai import *
from fastai.text import * 
import difflib
from tqdm import tqdm
# Data folder: every CSV this notebook reads or writes through PATH lives here.
PATH = Path('./data/')
import concurrent.futures
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Button, HBox, VBox
import pandas as pd
import numpy as np
import pdb
from LabelMyTextWidget import * 

In [None]:
# df_source: reference corpus the generated quotes are compared against further down.
# df_generated: batch of generated quotes to clean and filter.
# Both are read with the first CSV column as index and are accessed through their `text` column.
df_source = pd.read_csv(PATH/'perf_db.csv', index_col=0)
df_generated = pd.read_csv(PATH/'next-generated-batch-dataset-1.csv', index_col=0)

# Post-processing of generated quotes

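Because the data has been tokenized, the generated text must be post-processed: the `xxmaj` / `xxup` marker tokens are removed, casing is restored, and punctuation is re-attached to the preceding word.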

In [None]:
def process_quotes_refactoring(quote: str) -> str:
    """Rebuild readable text from a space-separated string of tokens.

    Rules applied while walking the tokens left to right:

    * ``xxmaj`` is dropped and the next token is capitalised.
    * ``xxup`` is dropped and the next token is upper-cased.
    * The first emitted token and the token following a ``.`` are capitalised.
    * A lone ``i`` becomes ``I``.
    * Punctuation and contraction suffixes (``n't``, ``'s`` ...) are glued to
      the previous token; no space is inserted right after a ``-``.

    Args:
        quote: tokenized text, tokens separated by single spaces (empty
            tokens produced by repeated spaces are ignored).

    Returns:
        The detokenized string; ``""`` when ``quote`` holds no token.
    """
    # Tokens that attach to the previous one without a separating space.
    ponc = {'?', '.', ',', ';', '\n', ':', '-', '(', ')', "n't", "'re", "'s", "'m", "'d", '!'}

    res_string = ""
    next_maj = True        # the very first emitted token opens a sentence
    next_all_caps = False

    for token in quote.split(' '):
        if token == '':
            continue

        # Marker tokens only arm a flag for the token that follows them.
        if token == 'xxmaj':
            next_maj = True
            continue
        if token == 'xxup':
            next_all_caps = True
            continue

        # Upper-casing wins over capitalising when both flags are armed.
        if next_all_caps:
            w = token.upper()
        elif next_maj or token == 'i':
            w = token.capitalize()
        else:
            w = token

        # A full stop starts a new sentence: capitalise (not upper-case) what follows.
        next_maj = token == '.'
        next_all_caps = False

        # Test the raw token so that an upper-cased suffix ("N'T") is still glued.
        if token not in ponc and res_string and res_string[-1] != '-':
            res_string += " "
        res_string += w

    return res_string

In [None]:
# Sanity check on the first generated quote: raw tokenized text, then the cleaned version.
# Two separate statements: a `print(a), print(b)` tuple would also display `(None, None)`.
s = df_generated.text.iloc[0]
print(s)
print(process_quotes_refactoring(s))

In [None]:
# Post-process every generated quote and build the frame in one go
# (growing a DataFrame one `.loc[i]` row at a time re-allocates on every insert).
# The resulting index is 0..n-1, i.e. row labels equal row positions.
df_new_quotes = pd.DataFrame(
    {'text': [process_quotes_refactoring(q) for q in tqdm(df_generated.text)]}
)

assert len(df_new_quotes) == len(df_generated)

In [None]:
# From here on `df_generated` refers to the post-processed quotes (same object as df_new_quotes, not a copy).
df_generated = df_new_quotes

# Elimination of plagiarism: delete generated quotes too close to the original corpus

In [None]:
def check_quote_similarity(df_to_check, df_source, threshold=0.80, begin_indice=0, end_indice=-1):
    """Find rows of ``df_to_check`` whose text is too close to a text of ``df_source``.

    Each candidate is compared with every source text using
    ``difflib.SequenceMatcher(...).ratio()``; a candidate is flagged when its
    best ratio is strictly greater than ``threshold``. Every flagged candidate
    is also printed together with its closest source text.

    Args:
        df_to_check: frame with a ``text`` column holding the candidates.
        df_source: frame with a ``text`` column holding the reference texts.
        threshold: similarity ratio above which a candidate is flagged.
        begin_indice: first row position to examine (inclusive).
        end_indice: last row position to examine (inclusive). The default
            ``-1``, or any range that is invalid (negative start, end past
            the last row, start after end), selects the whole frame.

    Returns:
        1-D integer ``np.ndarray`` with the row positions (into
        ``df_to_check``) of the flagged candidates, in ascending order.
    """
    n_rows = len(df_to_check)
    # begin == end is a legitimate one-row range and must not trigger the fallback.
    if begin_indice < 0 or end_indice >= n_rows or begin_indice > end_indice:
        begin_indice = 0
        end_indice = n_rows - 1

    # Materialise the reference texts once instead of one `.iloc` lookup per comparison.
    source_texts = df_source.text.tolist()
    to_remove = []
    if not source_texts:
        # Nothing to compare against, hence nothing can be too similar.
        return np.array(to_remove, dtype=int)

    for i in tqdm(range(begin_indice, end_indice + 1)):
        q = df_to_check.text.iloc[i]
        tab_sim = [difflib.SequenceMatcher(None, q, t).ratio() for t in source_texts]
        most_sim = int(np.argmax(tab_sim))

        if tab_sim[most_sim] > threshold:
            print(f'Threshold alert: {q}\n Most sim : {source_texts[most_sim]},\n indice {tab_sim[most_sim]}\n\n\n')
            to_remove.append(i)

    # Integer dtype so the result can be used directly as row positions / labels.
    return np.array(to_remove, dtype=int)

In [None]:
# Frame that goes through the similarity check below (alias of df_generated, not a copy).
df_test = df_generated

The following cell runs the check_quote_similarity function in parallel, using several worker processes (`ProcessPoolExecutor`), each handling a contiguous slice of the rows.

In [None]:
# Split df_test into contiguous, inclusive [begin, end] row ranges and check each
# range in its own worker process.
n_task = 6
n_rows = len(df_test)
# At least one row per chunk: with fewer rows than workers the integer division gives 0,
# which would hand every worker the range (0, -1), i.e. the whole frame.
number_by_worker = max(1, n_rows // n_task)
print(f'Number by worker : {number_by_worker}')

futures = []
res = np.array([])
# Use the explicitly imported module rather than a name pulled in by a star import.
with concurrent.futures.ProcessPoolExecutor(max_workers=n_task) as ex:
    for i in range(n_task):
        begin_indice = number_by_worker * i
        if begin_indice >= n_rows:
            break  # no rows left for the remaining workers

        # The last worker also takes the remainder of the division.
        if i == n_task - 1:
            end_indice = n_rows - 1
        else:
            end_indice = begin_indice + number_by_worker - 1
        print(f'{i}: will launch work from {begin_indice} to {end_indice}')

        futures.append(ex.submit(check_quote_similarity, df_test, df_source,
                                 begin_indice=begin_indice, end_indice=end_indice))

    # Chunks finish in arbitrary order; accumulate whatever comes back.
    for future in concurrent.futures.as_completed(futures):
        return_future = future.result()
        print(return_future)
        res = np.concatenate((res, return_future))
        print(f'Res : {res}')

# Sorted, de-duplicated integer row positions, usable as labels with .loc / .drop.
res = np.unique(res).astype(int)
res

In [None]:
# Show the quotes flagged as too similar. `res` holds row positions; the lookup is
# label-based, which matches here because the frame was built with labels 0..n-1.
df_test.loc[res]

The quotes that are too similar to the original corpus are dropped.

In [None]:
# Keep only the quotes that were not flagged; `drop` returns a new frame, df_test is untouched.
df_q = df_test.drop(index=res)
len(df_q)

In [None]:
# Add a `label` column initialised to -1 for every quote
# (presumably "not labelled yet"; the widget below is given 0 and 1 as label values).
df_q['label'] = -1

# Filtering of generated quotes

The next step of DeepGuru will be to use a second neural network to select the best quotes.
However, I still have work to do to make it effective.
For the time being, the best quotes are tagged manually.
This will soon be automated.

The widget from https://github.com/tchambon/LabelMyTextWidget will be used

In [None]:
# Manual labelling UI over the `text` column of df_q: buttons 'ko' / 'ok' paired with the
# values 0 / 1 and the target column `label`.
# NOTE(review): presumably the widget writes the chosen value into df_q['label'] in place;
# confirm against LabelMyTextWidget. The cells below depend on this manual step.
w = LabelMyTextWidget(df_q, 'text', ['ko', 'ok'], [0, 1], 'label', randomize=False)
w.display()

In [None]:
# (number of quotes with label 0, number of quotes with label 1); rows still at -1 are not counted.
len(df_q[df_q.label==0]), len(df_q[df_q.label==1])

In [None]:
# Quotes to publish: the rows whose label is 1.
is_ok = df_q['label'] == 1
df_publish = df_q.loc[is_ok]

In [None]:
# Persist the selected quotes (index included) under the data folder.
df_publish.to_csv(PATH/'next_batch_to_publish.csv')

In [None]:
# Read the saved batch back through PATH, the same location the file is written to,
# instead of a separately hard-coded 'data/...' string that can drift from PATH.
data_processed = pd.read_csv(PATH/'next_batch_to_publish.csv', index_col=0)

In [None]:
# Final check: the published quotes as a plain Python list.
data_processed['text'].tolist()