In [1]:
import pickle
from functools import reduce
import re
from tqdm import tqdm

import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

In [2]:
# Base directory of the input pickles. Looks like a Google Drive mount inside Colab — adjust when running elsewhere.
# Keep the trailing slash: file names are appended to it directly inside f-strings.
path = '/content/drive/MyDrive/tr_project/'

In [3]:
# Load the two pre-extracted sentence collections.
# `with` guarantees the file handles are closed (the bare open() calls leaked them).
# NOTE(security): pickle.load can execute arbitrary code — only load files you created or trust.
with open(f'{path}silicone_sentences.pkl', 'rb') as f:
  silicone_sentences = pickle.load(f)
with open(f'{path}wiki_sentences.pkl', 'rb') as f:
  wiki_sentences = pickle.load(f)

In [4]:
# Concatenate both corpora, silicone sentences first.
# label_sentences() tests `i < len(silicone_sentences)` to recognise silicone entries, so this order matters there.
all_raw_sentences = silicone_sentences + wiki_sentences

In [5]:
def sentences_summary(sentences):
  """Print the item count and the average / max / min length of a collection.

  The length of an item is `len(item)`, i.e. the number of characters when the
  items are strings. The average is rounded to an integer with `round`.

  Args:
    sentences: sized collection whose items support `len` (e.g. a list of str).

  Returns:
    None. The summary is written to stdout. For an empty collection only the
    count line (`#0`) is printed.
  """
  n = len(sentences)
  n_lens = [len(s) for s in sentences]
  print(f'#{n}')
  if not n_lens:
    # Nothing to aggregate: sum()/n would divide by zero and max()/min() would raise.
    return
  print('Avg:', round(sum(n_lens)/n))
  print('Max:', max(n_lens))
  print('Min:', min(n_lens))

In [6]:
# Length statistics of the raw, not yet cleaned corpus.
sentences_summary(all_raw_sentences)

#328034
Avg: 124
Max: 500
Min: 50


In [7]:
# Peek at one raw sentence; note the spaces before "," and "." that clean_text() removes.
all_raw_sentences[81]

"it's been a week , and i haven't slept that well ."

In [8]:
def clean_text(sentence):
  """Normalize characters, punctuation spacing and quote spacing of one sentence.

  The rules run in a fixed order and later rules rely on earlier ones:
    1. Newlines/tabs become spaces; every character other than letters, digits,
       ' " . , ; : ? ! ( ) and space is replaced by a space.
    2. Tokenized contractions are re-attached: "I ' m" -> "I'm", " n't" -> "n't".
    3. The space before . , ; : ? ! is removed, and a space is inserted after
       such a mark when a letter follows it directly.
    4. Padding inside a quoted single token is removed: '" hi "' -> '"hi"'
       (same for single quotes).
    5. The space before any remaining apostrophe is removed, doubled
       apostrophes are deleted and runs of whitespace are collapsed.

  Args:
    sentence: raw sentence string.

  Returns:
    The cleaned sentence string.
  """
  sentence = re.sub(r"[\n\t]", " ", sentence)

  # Whitelist of characters; anything else (including "-") turns into a space.
  sentence = re.sub(r"[^A-Za-z0-9'\".,;:?!\(\) ]", " ", sentence)

  # Re-attach contractions that the source tokenization split apart.
  # NOTE(review): only the capitalised "I ' m" is handled here; a lower-case
  # "i ' m" is left to the generic " '" rule below — confirm that is intended.
  sentence = sentence.replace("I ' m", "I'm")
  sentence = sentence.replace(" n't", "n't")

  # "word ." -> "word."
  sentence = re.sub(r" ([.,;:?!])", r"\1", sentence)
  # "a,b" -> "a, b". This also splits dotted tokens such as "www.x.com".
  sentence = re.sub(r"([.,;:?!])([A-Za-z])", r"\1 \2", sentence)

  # Raw strings: "\-", "\(" and "\)" are invalid escapes in a normal string literal.
  sentence = re.sub(r'" ([A-Za-z0-9.,;:?!\-\(\)]+) "', r'"\1"', sentence)
  sentence = re.sub(r"' ([A-Za-z0-9.,;:?!\-\(\)]+) '", r"'\1'", sentence)

  sentence = sentence.replace(" '", "'")
  sentence = sentence.replace("''", "")
  sentence = ' '.join(sentence.split())

  return sentence

In [9]:
%%time
# Clean every raw sentence; the result has the same order and length as all_raw_sentences.
all_sentences = [clean_text(sentence) for sentence in all_raw_sentences]

CPU times: user 6.48 s, sys: 57.9 ms, total: 6.54 s
Wall time: 7.03 s


In [10]:
# Spot check: the space before "." and the padding inside the double quotes are removed.
clean_text('Hello world . " hi "')

'Hello world. "hi"'

In [11]:
# Spot check: a space is inserted after punctuation that is directly followed by a letter.
clean_text("This,is wrong.")

'This, is wrong.'

In [12]:
# Spot check: a detached "n't" is glued back onto the preceding word.
clean_text("does n't")

"doesn't"

In [13]:
# Sanity check: print every cleaned sentence that still contains the tokenized form "I ' m".
# The recorded run printed nothing.
for s in all_sentences:
  if "I ' m" in s:
    print(s)

In [14]:
# Compare the same sentence before and after cleaning.
all_raw_sentences[81], all_sentences[81]

("it's been a week , and i haven't slept that well .",
 "it's been a week, and i haven't slept that well.")

In [15]:
def label_sentences(sentences):
  """Collect, per sentence, the character offsets of apostrophes matched by two patterns.

  Pattern 1 (always applied): letters, an apostrophe, then one or two letters,
  e.g. "it's" or "haven't".
  Pattern 2: a space, letters ending in "s", then an apostrophe, e.g. " goats'".
  It is applied only when the sentence index is below
  `len(silicone_sentences)` (a global), or when exactly one apostrophe of the
  sentence was not matched by pattern 1.

  Args:
    sentences: sized sequence of strings; a tqdm progress bar is shown while iterating.

  Returns:
    A list with one sorted list of apostrophe offsets per input sentence.
  """
  def _apostrophe_offsets(pattern, text):
    # Offset of the first apostrophe inside each regex match, in match order.
    return [m.start() + m.group().index("'") for m in re.finditer(pattern, text)]

  all_offsets = []
  progress = tqdm(enumerate(sentences), total=len(sentences))
  for position, sentence in progress:
    offsets = _apostrophe_offsets(r'[A-Za-z]+\'[A-Za-z]{1,2}', sentence)

    # Apostrophes that pattern 1 did not account for.
    unmatched = sentence.count("'") - len(offsets)
    if position < len(silicone_sentences) or unmatched == 1:
      offsets += _apostrophe_offsets(r' [A-Za-z]+s\'', sentence)

    all_offsets.append(sorted(offsets))

  return all_offsets

def filter_unlabeled(sentences, labels):
  """Keep only the sentence/label pairs whose label collection is non-empty.

  Args:
    sentences: iterable of sentences.
    labels: iterable of label collections, paired with `sentences` by position.

  Returns:
    A tuple `(sentences_f, labels_f)` of two new lists in the original order.
  """
  kept = [(sentence, label) for sentence, label in zip(sentences, labels) if len(label) > 0]
  sentences_f = [sentence for sentence, _ in kept]
  labels_f = [label for _, label in kept]
  return sentences_f, labels_f


def find_indices(text, c):
    """Return the positions in `text` whose element is contained in `c`.

    Args:
        text: sequence to scan (a string in this notebook).
        c: container of characters to look for, e.g. the string "'\"".

    Returns:
        Ascending list of matching indices; empty when nothing matches.
    """
    return [position for position, symbol in enumerate(text) if symbol in c]

In [16]:
# Label each cleaned sentence with the character offsets of every apostrophe and double quote.
all_labels = [find_indices(s, "'\"") for s in all_sentences]
# Drop sentences that contain neither character.
# NOTE: this rebinds all_sentences, so from here on its indices no longer line up with all_raw_sentences.
all_sentences, all_labels = filter_unlabeled(all_sentences, all_labels)
len(all_sentences), len(all_labels)

(327325, 327325)

In [17]:
# Example with two labels: each label is the character offset of an apostrophe in the sentence.
all_sentences[151], all_labels[151]

("it's not raining, darling. the bench isn't wet.", [2, 40])

In [18]:
# Example with a single label.
all_sentences[181], all_labels[181]

("i've never been to a restaurant like this before.", [1])

In [19]:
# Print 20 randomly chosen (index, sentence, labels) samples as a manual quality check.
# NOTE(review): no seed is set, so every run shows different samples.
random_i = np.random.randint(0, len(all_sentences), 20)
for i in random_i:
  print(f"{i}:", all_sentences[i], all_labels[i])

278785: John Sainsbury, Baron Sainsbury of Preston Candover, 94, British businessman and politician, Chairman of Sainsbury's (1969 1992) and member of the House of Lords (since 1989). [114]
246117: Craig'133' Jones also known as 133 (or 133 mHz), The Silent One, or by his number 5, (born on February 11, 1973), is an American musician. [5, 9]
124553: Malevich's architectural projects were known after 1922 Arkhitektoniki. [8]
185418: The main reason it is produced is that goats' milk is easier to digest than that of cows or buffalos. [44]
54031: Ahh, come on! Y'know what y'know what, I think I'm just gon na go home and call Kathy. [15, 27, 48]
194697: Norway's authorities made "a language directive (or rule) in 1880, which"was"made more forceful in 1898. [6, 26, 72, 76]
147772: The books were based on Alcott's childhood experiences with her own three sisters. [30]
32040: i've sent my cv to dozens of companies but nobody has got back to me. [1]
205381: She assumed the role because Arthur'

In [20]:
# Raw sentence sample containing two contractions.
all_raw_sentences[57]

"i think it's just a crush . you can't be serious ."

In [21]:
# Raw sentence at index 7940 of the unfiltered list.
all_raw_sentences[7940]

"so when's the next forty-four ? will it be here soon ?"

In [22]:
# NOTE: this is not the cleaned form of all_raw_sentences[7940]. all_sentences was rebound to the
# output of filter_unlabeled(), which dropped entries and shifted indices relative to the raw list.
all_sentences[7940]

"is your website address' www. zhilian. com. cn'?"

In [23]:
# Persist the cleaned, filtered sentences.
# `with` closes the file, which guarantees the data is flushed to disk.
# NOTE(review): the file goes to the current working directory, not to `path` — confirm that is intended.
with open('all_sentences_processed.pkl', 'wb') as f:
  pickle.dump(all_sentences, f)

In [24]:
# Persist the label lists (one list of character offsets per sentence).
# `with` closes the file, which guarantees the data is flushed to disk.
# NOTE(review): the file goes to the current working directory, not to `path` — confirm that is intended.
with open('all_labels_processed.pkl', 'wb') as f:
  pickle.dump(all_labels, f)

In [25]:
# Number of sentences left after filtering.
len(all_sentences)

327325

In [None]:

def reconstruct_string(string: str, one_hot, char) -> str:
    """
    Rebuild a string by writing `char` at every position flagged in `one_hot`.

    For each index i of `one_hot`: if `one_hot[i] == 1` the output gets `char`,
    otherwise it gets `string[i]`. Flagged positions replace the character at
    that index; nothing is inserted or shifted.

    Length handling:
      * `one_hot` shorter than `string`: the output stops at `len(one_hot)`,
        so the tail of `string` is dropped.
      * `one_hot` longer than `string`: unflagged positions past the end of
        `string` contribute nothing (previously an IndexError, e.g. with
        zero-padded vectors); flagged positions still emit `char`.

    Args:
        string: source text to copy unflagged characters from.
        one_hot: sequence of flags; a value equal to 1 marks a position for `char`.
        char: text written at every flagged position.

    Returns:
        The reconstructed string.
    """
    source_len = len(string)
    pieces = []
    for i, flag in enumerate(one_hot):
        if flag == 1:
            pieces.append(char)
        elif i < source_len:
            pieces.append(string[i])
        # else: unflagged position beyond the source string — nothing to copy.
    # Join once instead of growing a string with += inside the loop.
    return "".join(pieces)

# Example call: the vector has 10 entries but "hello world" has 11 characters,
# so the result stops after index 9 and the final "d" is dropped.
reconstruct_string("hello world", [0, 0, 1, 1, 0, 0, 0, 0, 0, 1], "l")

'hello worl'