# Steps for Preprocessing Batch Model Data



### 1.   Import the Data




> * Import the batched data that is created in Sentence_Model_Preprocessor.
> * The data should not be labelled; it should only be batched.


### 2.   Break the batches into sentences and label if a summary occurs within a batch.
> * Save a dataframe with one example per batch: the batch and its label (0 if a summary is not contained within the batch, 1 if a summary is contained within the batch).

### 3. Balance and Split Data
> * Use undersampling so that 2/3 of the examples have label 0 and 1/3 have label 1.
> * Split the undersampled data into training and testing sets (80% and 20% respectively).
> * Save the training and testing sets as pickle files.




In [None]:
def save_as_pickle(filename, data):
  '''
  Pickles `data` and writes it to `filename`, replacing any existing file
  '''
  with open(filename, mode='wb') as out_file:
    pickle.dump(data, out_file)

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import time
import pickle

Pulling in batched data that was previously created

In [None]:
# Load the previously batched cases and keep only the first 20000 rows;
# the argument of head() controls how many cases are trained on
batched_data = pd.read_pickle(
    '/content/gdrive/MyDrive/Thesis/Data/case_data_with_512_batches.pickle'
).head(20000)

In [None]:
# Pull out the two columns used for labelling: the batches of each case and the summaries of each case
batches, summaries = batched_data['512_batches'], batched_data['summaries']

Now going to check if a summary occurs in each batch

In [None]:
def get_cos_sim(summ, pred):
  '''
  Calculates the similarity between two sentences

  Both sentences are word-tokenized, English stop words are removed, and the
  cosine similarity of the two binary bag-of-words vectors is returned, i.e.
  |X & Y| / sqrt(|X| * |Y|) where X and Y are the sets of remaining tokens.

  summ, pred: the two sentences (strings) to compare
  returns: a float; 0.0 when either sentence has no tokens left after filtering
  '''
  # A set gives constant-time membership tests instead of scanning the whole stop-word list for every token
  sw = set(stopwords.words('english'))

  # Tokens are not lower-cased before filtering, so the comparison is case-sensitive
  X_set = {w for w in word_tokenize(summ) if w not in sw}
  Y_set = {w for w in word_tokenize(pred) if w not in sw}

  # For 0/1 vectors the dot product is the number of shared tokens
  # and each squared norm is the number of distinct tokens
  shared = len(X_set & Y_set)
  denominator = float((len(X_set) * len(Y_set)) ** 0.5)

  # At least one sentence is empty after filtering: define the similarity as 0
  if denominator == 0:
    return 0.0

  return shared / denominator

def is_sent_in_summ(b, relevant_summ):
  '''
  Labels a batch: returns 1 if any sentence of `b` has a cosine similarity
  of at least 0.7 with any entry of `relevant_summ`, otherwise returns 0
  '''
  # Lazy generator, so the search stops at the first matching pair
  matches = (
      get_cos_sim(sentence, summ) >= 0.7
      for sentence in sent_tokenize(b)
      for summ in relevant_summ
  )
  return 1 if any(matches) else 0

In [None]:
def intermediate_save(all_batches, contains_summ, i):
  '''
  Writes the batches and labels collected so far to their own pickle file, so
  that memory can be freed and a crash does not lose the work already done.

  all_batches: the batch texts collected so far
  contains_summ: the label for each entry of all_batches
  i: value used as the prefix of the output file name

  Returns two new empty lists (to replace the two buffers) and the path of the file written
  '''
  input_df = pd.DataFrame(data={'input_sentences': all_batches, 'contains_summ': contains_summ})

  output_file_name = '/content/gdrive/MyDrive/Thesis/Data/{}_unbalanced_batch_class_data.pickle'.format(i)
  with open(output_file_name, 'wb') as out_file:
    pickle.dump(input_df, out_file)

  print('New File Saved!')

  # The caller rebinds its buffers to these fresh lists
  return [], [], output_file_name

In [None]:
# Labels every batch of every case (1 if it contains a summary sentence, 0 otherwise).
# The buffers are flushed to a new pickle file every 2000 cases, and the list of
# files written so far is re-saved after each flush so a crash loses little work.
start_time = time.time()

all_batches = []      # batch texts waiting to be written to the next file
contains_summ = []    # label for each entry of all_batches
total_sums = 0
total_batches_with_sum = 0
list_of_files = []

# zip pairs each case's batches with its summaries by position
for i, (one_text, summ_for_text) in enumerate(zip(batches, summaries)):

  if i % 2000 == 0 and i != 0:
    print('Done with', i, 'cases.')
    total_batches_with_sum += sum(contains_summ)
    all_batches, contains_summ, output_file_name = intermediate_save(all_batches, contains_summ, i)
    list_of_files.append(output_file_name)
    save_as_pickle('/content/gdrive/MyDrive/Thesis/Data/batch_class_files_list', list_of_files)

  total_sums += len(summ_for_text)

  for b in one_text:

    score = is_sent_in_summ(b, summ_for_text) #Takes in batch and determines if it contains a summary
    contains_summ.append(score)
    all_batches.append(b)


# Flush whatever is left after the last intermediate save
if all_batches:
  # Count the positives of the final chunk too, otherwise the printed total is too low
  total_batches_with_sum += sum(contains_summ)
  # Tag the file with the number of cases processed, like the intermediate saves.
  # Tagging with the last loop index would overwrite the previous file
  # whenever len(batches) % 2000 == 1.
  all_batches, contains_summ, output_file_name = intermediate_save(all_batches, contains_summ, len(batches))
  list_of_files.append(output_file_name)

# Always written, so the list file reflects this run even when nothing was left to flush
save_as_pickle('/content/gdrive/MyDrive/Thesis/Data/batch_class_files_list', list_of_files)

end_time = time.time()
time_taken = end_time - start_time
print('Time Taken', time_taken)

print('Total amount of summaries', total_sums)
print('Total number of batches with summaries', total_batches_with_sum)

Now undersampling and splitting the data

In [None]:
from sklearn.model_selection import train_test_split

def balance_and_split(data_name):
  '''
  Function that balances, splits and saves the data we give to it

  Reads the labelled dataframe pickled at `data_name`, undersamples the
  negative examples (contains_summ == 0) to at most twice the number of
  positive examples (contains_summ == 1), splits the result 80% / 20% into
  training and testing sets and pickles both next to the input file.

  data_name: path of a pickle file written by intermediate_save

  Returns (train_file_name, test_file_name, total_num_examples,
  reduced_neg_examples, num_positive_examples, num_negative_examples)
  '''
  input_suffix = 'unbalanced_batch_class_data.pickle'

  input_df = pd.read_pickle(data_name)

  pos_df = input_df[input_df.contains_summ == 1]
  neg_df = input_df[input_df.contains_summ == 0]

  num_negative_examples = len(neg_df)
  num_positive_examples = len(pos_df)

  # sample() without replacement raises if asked for more rows than exist,
  # so cap the request at the number of negatives available
  sub_neg_df = neg_df.sample(min(num_negative_examples, num_positive_examples * 2))

  reduced_neg_examples = len(sub_neg_df) #Tracking num of negative examples we now have
  balanced_df = pd.concat([pos_df, sub_neg_df], axis=0)

  total_num_examples = num_positive_examples + reduced_neg_examples

  train_bdf, test_bdf = train_test_split(balanced_df, test_size=0.2, random_state=42)

  # Build the output names from the input name. Strip the real suffix rather than
  # a fixed 28 characters, which left a stray 'unbala' in the output file names.
  if data_name.endswith(input_suffix):
    prefix = data_name[:-len(input_suffix)]
  elif data_name.endswith('.pickle'):
    prefix = data_name[:-len('.pickle')] + '_'
  else:
    prefix = data_name + '_'

  train_file_name = prefix + 'batch_training_bdf.pickle'
  test_file_name = prefix + 'batch_testing_bdf.pickle'

  with open(train_file_name, 'wb') as handle:   #Saving as a pickle file
    pickle.dump(train_bdf, handle)

  with open(test_file_name, 'wb') as handle:   #Saving as a pickle file
    pickle.dump(test_bdf, handle)

  return train_file_name, test_file_name, total_num_examples, reduced_neg_examples, num_positive_examples, num_negative_examples

In [None]:
# Balances and splits every saved chunk file, then prints the overall counts
name_of_saved_file_list = '/content/gdrive/MyDrive/Thesis/Data/batch_class_files_list'
saved_file_names = pd.read_pickle(name_of_saved_file_list)

list_of_training_files = []
list_of_testing_files = []
balanced_total_examples = 0
reduced_negative_examples = 0
original_positive_examples = 0
original_negative_examples = 0

for data_name in saved_file_names:
  train_file_name, test_file_name, total_num_examples, reduced_neg_examples, num_positive_examples, num_negative_examples = balance_and_split(data_name)

  list_of_training_files.append(train_file_name)
  list_of_testing_files.append(test_file_name)
  balanced_total_examples += total_num_examples
  reduced_negative_examples += reduced_neg_examples
  original_positive_examples += num_positive_examples
  original_negative_examples += num_negative_examples

original_total_examples = original_negative_examples + original_positive_examples

print('Original Number of negative examples:', original_negative_examples)
print('Original Number of positive examples:', original_positive_examples)
print('Original Total Number of examples:', original_total_examples)

# Guard against dividing by zero when no examples were loaded
if original_total_examples:
  print('Percentage of Examples that were negative:', round(original_negative_examples / original_total_examples, 3))
else:
  print('Percentage of Examples that were negative:', 'n/a')

print('After balancing...')

print('Reduced Number of Negative examples:', reduced_negative_examples)
print('Total Number of examples:', balanced_total_examples)

print('Training Files:', list_of_training_files)
print('Testing Files:', list_of_testing_files)