In [None]:
!pip install transformers
!pip install datasets
!pip install apache_beam
!pip install mwparserfromhell

In [None]:
import pickle

from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize

import torch
from datasets import load_dataset

In [None]:
# Fetch NLTK's 'punkt' resource; sent_tokenize is used further down to split
# article text into sentences.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed version.
nltk.download('punkt')

In [None]:
# Configuration names passed one by one to load_dataset("silicone", <name>).
datasets = [
    "dyda_da",
    "dyda_e",
    "iemocap",
    "maptask",
    "meld_e",
    "meld_s",
    "mrda",
    "oasis",
    "sem",
    "swda",
]

In [None]:
def get_sentences_from_silicone(datasets):
  """Collect every utterance containing a quote character.

  Each name in `datasets` is loaded with load_dataset("silicone", name) and
  all of its splits are scanned in iteration order.

  Args:
    datasets: iterable of configuration names.

  Returns:
    A list with the 'Utterance' value of every row that contains a single
    quote (') or a double quote ("), in the order encountered.
  """
  quoted = []
  for config_name in datasets:
    corpus = load_dataset("silicone", config_name)
    for split_name in corpus:
      utterances = (row['Utterance'] for row in tqdm(corpus[split_name]))
      quoted.extend(u for u in utterances if "'" in u or '"' in u)
  return quoted

In [None]:
# Collect the quote-bearing utterances from every configuration in `datasets`.
sentences = get_sentences_from_silicone(datasets)

In [None]:
# Number of utterances collected.
len(sentences)

In [None]:
# Persist the collected utterances. The context manager guarantees the file
# is flushed and closed (the bare open() previously leaked the handle).
with open('silicone_sentences.pkl', 'wb') as f:
  pickle.dump(sentences, f)

In [None]:
# Load the "wikipedia" dataset for language "simple", dump date 20221120,
# processed with the 'DirectRunner' Beam runner.
wiki = load_dataset(
    "wikipedia",
    date="20221120",
    language="simple",
    beam_runner='DirectRunner',
)

In [None]:
# Display the loaded dataset object.
wiki

In [None]:
# Walks over every record of the 'train' split without doing any work.
# NOTE(review): presumably a smoke test / timing check of iteration; its only
# effect is leaving `x` bound to the last record — confirm it is still needed.
for x in wiki['train']:
  pass

In [None]:
def get_sentences_from_wiki(dataset):
  """Extract every sentence containing a quote character from the articles.

  Each record of dataset['train'] has its 'text' field split into sentences
  with sent_tokenize; sentences containing a single quote (') or a double
  quote (") are kept.

  Args:
    dataset: mapping with a sized, iterable 'train' entry whose records
      expose a 'text' string.

  Returns:
    A flat list of the matching sentences, in article order.
  """
  articles = dataset['train']
  quoted = []
  for article in tqdm(articles, total=len(articles)):
    quoted.extend(
        s for s in sent_tokenize(article['text'])
        if "'" in s or '"' in s
    )
  return quoted

In [None]:
# Extract the quote-bearing sentences from the Wikipedia articles.
# Note: this rebinds `sentences`, which until now held the SILICONE utterances.
sentences = get_sentences_from_wiki(wiki)

In [None]:
# Number of Wikipedia sentences collected.
len(sentences)

In [None]:
# Persist the Wikipedia sentences. The context manager guarantees the file
# is flushed and closed (the bare open() previously leaked the handle).
with open('wiki_sentences.pkl', 'wb') as f:
  pickle.dump(sentences, f)

In [None]:
# Mean length (len() of each item) over `sentences`.
sum(map(len, sentences)) / len(sentences)

In [None]:
# Reload both sentence lists from disk, closing each file after reading.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles that this notebook produced.
with open('silicone_sentences.pkl', 'rb') as f:
  silicone_sentences = pickle.load(f)
with open('wiki_sentences.pkl', 'rb') as f:
  wiki_sentences = pickle.load(f)

In [None]:
# Mean length (len() of each item) over the SILICONE sentences.
sum(map(len, silicone_sentences)) / len(silicone_sentences)

In [None]:
# Longest and shortest SILICONE sentence lengths, as a (max, min) tuple.
max(map(len, silicone_sentences)), min(map(len, silicone_sentences))

In [None]:
def sentences_summary(sentences):
  """Print the count and the average, maximum and minimum length of `sentences`.

  Lengths are measured with len() on each item. The average is rounded to
  two decimals.

  Args:
    sentences: sized collection of items that support len().

  For an empty collection only the count line ('#0') is printed, because the
  average, maximum and minimum are undefined.
  """
  n = len(sentences)
  print(f'#{n}')
  if n == 0:
    # Nothing to aggregate: avoids ZeroDivisionError and max()/min() on an
    # empty sequence.
    return
  lengths = [len(s) for s in sentences]
  print('Avg:', round(sum(lengths) / n, 2))
  print('Max:', max(lengths))
  print('Min:', min(lengths))

In [None]:
# Length statistics for the SILICONE sentences loaded from disk.
sentences_summary(silicone_sentences)

In [None]:
# Length statistics for the Wikipedia sentences loaded from disk.
sentences_summary(wiki_sentences)

In [None]:
def filter_by_len(sentences, min_len=50, max_len=500):
  """Keep the items whose length is within [min_len, max_len], shortest first.

  Args:
    sentences: iterable of items supporting len().
    min_len: smallest length kept (inclusive).
    max_len: largest length kept (inclusive).

  Returns:
    A new list sorted by ascending length; items of equal length keep their
    original relative order.
  """
  in_range = (s for s in sentences if min_len <= len(s) <= max_len)
  return sorted(in_range, key=len)


In [None]:
# Keep SILICONE sentences within the default length bounds (50 to 500), shortest first.
silicone_sentences_f = filter_by_len(silicone_sentences)

In [None]:
# Length statistics after filtering the SILICONE sentences.
sentences_summary(silicone_sentences_f)

In [None]:
# Keep Wikipedia sentences within the default length bounds (50 to 500), shortest first.
wiki_sentences_f = filter_by_len(wiki_sentences)

In [None]:
# Length statistics after filtering the Wikipedia sentences.
sentences_summary(wiki_sentences_f)

In [None]:
# Spot-check a single filtered Wikipedia sentence (position 600 of the
# length-sorted list; raises IndexError if fewer than 601 sentences remain).
wiki_sentences_f[600]

In [None]:
# Preview only the first few entries instead of dumping the entire list into
# the cell output (the list is sorted by length, so these are the shortest).
wiki_sentences_f[:10]

In [None]:
# Persist the filtered lists, closing each file once written.
# NOTE(review): these paths are the same ones the unfiltered lists were
# written to earlier, so the raw pickles are overwritten and a later reload
# yields already-filtered data — confirm that this is intended.
with open('silicone_sentences.pkl', 'wb') as f:
  pickle.dump(silicone_sentences_f, f)
with open('wiki_sentences.pkl', 'wb') as f:
  pickle.dump(wiki_sentences_f, f)