In [None]:
import pickle
from functools import reduce
import re
from tqdm import tqdm

import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

In [None]:
# Base directory holding the input pickles. It is concatenated directly with file
# names in the loading cell (f'{path}...'), so the trailing slash is required.
# NOTE(review): looks like a mounted Google Drive path on Colab — confirm the drive
# is mounted before running.
path = '/content/drive/MyDrive/tr_project/'

In [None]:
# Load the two raw sentence collections. Context managers make sure the file
# handles are closed as soon as each pickle has been read.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open(f'{path}silicone_sentences.pkl', 'rb') as pkl_file:
  silicone_sentences = pickle.load(pkl_file)
with open(f'{path}wiki_sentences.pkl', 'rb') as pkl_file:
  wiki_sentences = pickle.load(pkl_file)

In [None]:
# Combined raw corpus. The silicone sentences come first on purpose:
# label_sentences treats every index below len(silicone_sentences) specially,
# so this ordering must be preserved.
all_raw_sentences = silicone_sentences + wiki_sentences

In [None]:
def sentences_summary(sentences):
  """Print the number of sentences and their average / max / min length.

  Lengths are `len(s)` of each element (characters, when the elements are
  strings). The average is rounded to the nearest integer.

  Args:
    sentences: sized collection of sized items (e.g. a list of str).

  An empty collection prints only the count line; the statistics are skipped
  because there is nothing to average (this used to raise ZeroDivisionError).
  """
  n = len(sentences)
  print(f'#{n}')
  if n == 0:
    return

  lengths = [len(s) for s in sentences]
  print('Avg:', round(sum(lengths) / n))
  print('Max:', max(lengths))
  print('Min:', min(lengths))

In [None]:
# Size and length statistics of the combined raw corpus, before any cleaning.
sentences_summary(all_raw_sentences)

In [None]:
# Spot-check one raw sentence before cleaning.
all_raw_sentences[81]

In [None]:
def clean_text(sentence):
  """Normalize characters, punctuation spacing and quote spacing of a sentence.

  The steps run in this order (later steps rely on earlier ones):
    1. newlines / tabs become spaces;
    2. every character outside letters, digits, ' " . , ; : ? ! ( ) and space
       becomes a space (this also removes hyphens);
    3. tokenized contractions are re-joined ("I ' m" -> "I'm", " n't" -> "n't");
    4. the space before . , ; : ? ! is removed, and a space is inserted when
       such a mark is directly followed by a letter;
    5. a single word wrapped in spaced quotes (" word " or ' word ') gets the
       inner spaces removed;
    6. a space directly before any apostrophe is removed, and empty '' pairs
       are deleted;
    7. runs of whitespace collapse to one space and the ends are trimmed.

  Args:
    sentence: the raw text (str).

  Returns:
    The cleaned text (str).
  """
  sentence = re.sub("[\n\t]", " ", sentence)
  sentence = re.sub(r"[^A-Za-z0-9'\".,;:?!\(\) ]", " ", sentence)

  # Re-attach contractions that were split by tokenization.
  sentence = re.sub(r"I ' m", "I'm", sentence)
  sentence = re.sub(r" n't", "n't", sentence)

  # Punctuation spacing: "word ." -> "word." and "a,b" -> "a, b".
  sentence = re.sub(r" ([.,;:?!])", r"\1", sentence)
  sentence = re.sub(r"([.,;:?!])([A-Za-z])", r"\1 \2", sentence)

  # Tighten quotes around a single token. Raw strings keep the regex escapes
  # (\-, \(, \)) from being treated as invalid string escapes.
  sentence = re.sub(r'" ([A-Za-z0-9.,;:?!\-\(\)]+) "', r'"\1"', sentence)
  sentence = re.sub(r"' ([A-Za-z0-9.,;:?!\-\(\)]+) '", r"'\1'", sentence)

  # Glue any remaining apostrophe to the preceding token ("it 's" -> "it's").
  # Note this also removes the space before an opening single quote.
  sentence = re.sub(r" '", "'", sentence)
  sentence = re.sub(r"''", "", sentence)

  return ' '.join(sentence.split())

In [None]:
%%time
# Clean the whole corpus; the result stays index-aligned with all_raw_sentences.
all_sentences = list(map(clean_text, all_raw_sentences))

In [None]:
# Sanity check: the space before '.' is dropped and the spaced double quotes are
# tightened -> 'Hello world. "hi"'
clean_text('Hello world . " hi "')

In [None]:
# Sanity check: a space is inserted after a comma glued to a letter
# -> 'This, is wrong.'
clean_text("This,is wrong.")

In [None]:
# Sanity check: a split negative contraction is re-joined -> "doesn't"
clean_text("does n't")

In [None]:
# Print every cleaned sentence that still contains the spaced-out "I ' m";
# any output here means clean_text missed a case.
for s in all_sentences:
  if "I ' m" in s:
    print(s)

In [None]:
# Raw vs cleaned text for the same index (the two lists are still aligned at
# this point because cleaning maps sentences one-to-one).
all_raw_sentences[81], all_sentences[81]

In [None]:
def label_sentences(sentences, n_silicone=None):
  """Return, for each sentence, the sorted positions of its labelled apostrophes.

  Two patterns are searched:
    * contractions / possessives: letters, an apostrophe, then 1-2 letters
      (e.g. "don't", "I'm", "boss's");
    * plural possessives: a space, a word ending in "s", then an apostrophe
      (e.g. " dogs'").

  The plural-possessive pass only runs when the sentence index is below
  `n_silicone`, or when exactly one apostrophe of the sentence is left
  unexplained by the first pass.

  Args:
    sentences: sequence of str.
    n_silicone: number of leading sentences for which the plural-possessive
      pass always runs. Defaults to len(silicone_sentences), read from the
      notebook's global of that name (the original behavior).

  Returns:
    A list with one sorted list of unique character indices per sentence.
    An apostrophe matched by both patterns (e.g. in " boss's") is reported
    once; it used to appear twice.
  """
  if n_silicone is None:
    n_silicone = len(silicone_sentences)

  # Group 1 captures the apostrophe itself so its position is m.start(1).
  contraction = re.compile(r"[A-Za-z]+(')[A-Za-z]{1,2}")
  plural_possessive = re.compile(r" [A-Za-z]+s(')")

  labels = []
  for i, s in tqdm(enumerate(sentences), total=len(sentences)):
    indices = {m.start(1) for m in contraction.finditer(s)}

    if i < n_silicone or (s.count("'") - len(indices) == 1):
      indices.update(m.start(1) for m in plural_possessive.finditer(s))

    labels.append(sorted(indices))

  return labels

def filter_unlabeled(sentences, labels):
  """Keep only the (sentence, label) pairs whose label list is non-empty.

  Args:
    sentences: sequence of sentences.
    labels: sequence of label lists, paired positionally with `sentences`.

  Returns:
    A tuple (kept_sentences, kept_labels) of two new lists in the original
    order.
  """
  kept = [(sent, lab) for sent, lab in zip(sentences, labels) if len(lab) > 0]
  sentences_f = [sent for sent, _ in kept]
  labels_f = [lab for _, lab in kept]
  return sentences_f, labels_f


def find_indices(text, c):
    """Return the positions in `text` whose element is contained in `c`.

    Args:
        text: the sequence to scan (typically a str).
        c: container of accepted elements (typically a str of characters).

    Returns:
        A list of ascending indices; empty if nothing matches.
    """
    return [pos for pos, ch in enumerate(text) if ch in c]

In [None]:
# Label every cleaned sentence with the positions of its apostrophes and
# double quotes.
all_labels = [find_indices(s, "'\"") for s in all_sentences]
# Drop sentences that have no label at all.
# NOTE: this rebinds all_sentences to the filtered list, so from here on its
# indices may no longer line up with all_raw_sentences.
all_sentences, all_labels = filter_unlabeled(all_sentences, all_labels)
len(all_sentences), len(all_labels)

In [None]:
# Inspect one (sentence, label positions) pair.
all_sentences[151], all_labels[151]

In [None]:
# Inspect another (sentence, label positions) pair.
all_sentences[181], all_labels[181]

In [None]:
# Print 20 randomly chosen sentences with their labels for a visual check.
# The draw is unseeded, so the selection changes on every run, and randint
# samples independently, so an index can appear more than once.
random_i = np.random.randint(0, len(all_sentences), 20)
for i in random_i:
  print(f"{i}:", all_sentences[i], all_labels[i])

In [None]:
# Look at one raw (uncleaned) sentence.
all_raw_sentences[57]

In [None]:
# Look at another raw (uncleaned) sentence.
all_raw_sentences[7940]

In [None]:
# NOTE(review): all_sentences was filtered by filter_unlabeled earlier, so index
# 7940 here is not necessarily the cleaned form of all_raw_sentences[7940].
all_sentences[7940]

In [None]:
# Persist the filtered sentences and their label positions. Context managers
# guarantee the files are flushed and closed once each dump finishes.
# NOTE(review): these are written to the current working directory, not under
# `path` — confirm that is the intended location.
with open('all_sentences_processed.pkl', 'wb') as out_file:
  pickle.dump(all_sentences, out_file)
with open('all_labels_processed.pkl', 'wb') as out_file:
  pickle.dump(all_labels, out_file)

In [None]:
# Number of sentences kept after filtering.
len(all_sentences)

In [None]:

def reconstruct_string(string: str, one_hot, char) -> str:
    """
    Rebuild a string by writing `char` at every position flagged in a 0/1 mask.

    For each index i, the output gets `char` when one_hot[i] == 1 and the
    character string[i] otherwise, i.e. flagged positions are replaced, not
    inserted.

    Length mismatches are tolerated:
      * if the mask is shorter than the string, the remaining characters of
        the string are copied through unchanged (previously they were
        silently dropped);
      * if the mask is longer than the string, extra 0 entries are ignored
        (previously an IndexError) and extra 1 entries still append `char`.

    Args:
        string: the text to rebuild.
        one_hot: sequence of 0/1 flags, one per character position.
        char: the character (or string) written at flagged positions.

    Returns:
        The rebuilt string.
    """
    n_flags = len(one_hot)
    n_chars = len(string)

    pieces = []
    for i in range(max(n_flags, n_chars)):
        if i < n_flags and one_hot[i] == 1:
            pieces.append(char)
        elif i < n_chars:
            pieces.append(string[i])
        # else: a 0 flag past the end of the string, nothing to emit

    # join avoids building the result through repeated string concatenation
    return "".join(pieces)

# example call to reconstruct_string function: the mask is one element shorter
# than the string, so the trailing "d" is copied through -> "hello world"
reconstruct_string("hello world", [0, 0, 1, 1, 0, 0, 0, 0, 0, 1], "l")