# Steps for Aggregation Analysis

### 1. Load the data
>* Load the data from the Aggregation Preprocessor file. It includes the original sentence scores, the positionally updated sentence scores, and the batch-model updated sentence scores.

### 2. Run the Golden Comparison Score Analysis
>* For each calculated score, compare the scores of golden sentences to the scores of non-golden sentences

### 3. Run the top-k and top-2k analysis
>* Calculate the top k and 2*k scores for each document (where k is the number of golden sentences in that document).
>* See what percentage of the top k and top 2*k sentences are actually golden sentences

### 4. Create a plot that dives deeper into where golden sentences are located when ordered after aggregation

Load the Data

In [None]:
# Mount Google Drive at /content/gdrive (requires the Google Colab runtime,
# since google.colab is only importable there).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the pickled scores object from the mounted Drive folder
# (presumably a pandas DataFrame with one row per case -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
scored_df = pd.read_pickle('/content/gdrive/MyDrive/Thesis/Data/scored_df.pickle')

Golden Comparison Score

In [None]:
def _mean_or_nan(values):
  '''Return the arithmetic mean of the list values, or NaN when it is empty.'''
  if not values:
    return float('nan')
  return sum(values) / len(values)


def gold_vs_avg(is_summ, pred_outcome):
  '''
  Compare the predicted scores of golden sentences against all other sentences.

  Both arguments are flattened across three levels of nesting (outer iterable
  -> inner iterable -> items) into one linear list each, and are then paired
  up by position. A score that is itself a list contributes only its first
  element.

  Args:
    is_summ: Triple-nested iterable of labels. A label equal to 0 marks a
      normal sentence; any other value marks a golden sentence.
    pred_outcome: Triple-nested iterable of scores laid out like is_summ.

  Returns:
    A tuple (golden_scores, normal_scores) of flat score lists.

  Raises:
    IndexError: if pred_outcome flattens to fewer items than is_summ.

  Side effects:
    Prints the flattened lengths, both averages, their ratio (the golden
    comparison score) and both group sizes, then shows one histogram per
    group. An average of an empty group, and a ratio whose denominator is 0,
    are reported as nan instead of raising ZeroDivisionError.
  '''
  #Linear list of labels, one per sentence
  flat_sums = []
  for case in is_summ:
    for batch in case:
      flat_sums.extend(batch)

  #Linear list of scores, unwrapping scores that are stored as a list
  flat_preds = []
  for case in pred_outcome:
    for batch in case:
      for score in batch:
        flat_preds.append(score[0] if isinstance(score, list) else score)

  #Printed so the two lengths can be checked against each other by eye
  print(len(flat_sums))
  print(len(flat_preds))

  golden_scores = []
  normal_scores = []

  #Indexing (rather than zip) keeps a too-short score list a loud IndexError
  for idx, label in enumerate(flat_sums):
    if label == 0:
      normal_scores.append(flat_preds[idx])
    else:
      golden_scores.append(flat_preds[idx])

  #Compute each average once and reuse it for the ratio
  golden_avg = _mean_or_nan(golden_scores)
  normal_avg = _mean_or_nan(normal_scores)
  if normal_avg == 0:
    comparison_score = float('nan')
  else:
    comparison_score = golden_avg / normal_avg

  print('Average Score for Golden Sentences:', golden_avg)
  print('Average Score for Normal Sentences:', normal_avg)

  print('Golden Comparison Score:',  comparison_score)

  print('Total Number of Golden Sentences:', len(golden_scores))
  print('Total Number of Normal Sentences:', len(normal_scores))

  fig, axs = plt.subplots(1, 2, figsize=(10, 5))

  #Left panel: score distribution of the golden sentences
  axs[0].hist(golden_scores, bins=10, color='blue', alpha=0.5)
  axs[0].set_title('Golden Sentences')

  #Right panel: score distribution of the normal sentences
  axs[1].hist(normal_scores, bins=10, color='green', alpha=0.5)
  axs[1].set_title('Normal Sentences')

  #Axis labels (the y label is only set on the left panel)
  axs[0].set_xlabel('Values')
  axs[0].set_ylabel('Frequency')
  axs[1].set_xlabel('Values')

  plt.show()

  return golden_scores, normal_scores

In [None]:
# Run the golden-vs-normal comparison once per score column.
# The same label column is reused for all three runs.
is_summ = scored_df['batch_labels']

# 'pred_outcomes' column, reported under the heading "Linear"
# (presumably the original, un-updated sentence scores -- TODO confirm).
pred_outcome1 = scored_df['pred_outcomes']
print('Results for Linear ------->')
lin_g_scores, lin_n_scores = gold_vs_avg(is_summ, pred_outcome1)

# 'pos_update' column, reported under the heading "Positional"
# (presumably the positionally updated scores -- TODO confirm).
pred_outcome2 = scored_df['pos_update']
print('Results for Positional ------->')
pos_g_scores, pos_n_scores = gold_vs_avg(is_summ, pred_outcome2)

# 'bin_class_update' column, reported under the heading
# "Batch Binary Classification" (presumably the batch-model updated scores
# -- TODO confirm).
pred_outcome3 = scored_df['bin_class_update']
print('Results for Batch Binary Classification ------->')
class_g_scores, class_n_scores = gold_vs_avg(is_summ, pred_outcome3)

Top-K and Top-2K

In [None]:
def top_k(sent_in_sum, pred_scores, k):
  '''
  Measure how many of one case's k highest-scoring sentences are golden.

  Both inputs are flattened across two levels of nesting, paired up by
  position, ranked from highest to lowest score, and the first k pairs are
  inspected.

  Ranking uses the score alone. Sentences with equal scores keep their
  original relative order, so the label can never influence the ranking
  (sorting the raw (score, label) tuples would place golden sentences first
  among ties and inflate the result).

  Args:
    sent_in_sum: Nested iterable of labels for a single case. A label equal
      to 1 marks a golden sentence.
    pred_scores: Nested iterable of scores laid out like sent_in_sum. A score
      that is itself a list contributes only its first element.
    k: Number of top-ranked sentences to inspect.

  Returns:
    A tuple (fraction, k), where fraction is the number of golden sentences
    among the top k divided by k (a value between 0 and 1, not a percentage
    out of 100). If the case has fewer than k sentences the divisor is
    still k.

  Raises:
    ZeroDivisionError: if k is 0.
  '''

  #Flatten the labels for the sentences within a case
  case_sent_labels = [label for batch in sent_in_sum for label in batch]

  #Flatten the predicted scores, unwrapping scores that are stored as a list
  case_pred_scores = [
      score[0] if isinstance(score, list) else score
      for batch in pred_scores
      for score in batch
  ]

  #Rank (score, label) pairs by score only; sorted() is stable, so ties stay in input order
  ranked = sorted(
      zip(case_pred_scores, case_sent_labels),
      key=lambda pair: pair[0],
      reverse=True,
  )

  #Count the golden sentences among the top k
  num_golden = sum(1 for _, label in ranked[:k] if label == 1)

  percent_golden = num_golden / k

  return percent_golden, k



In [None]:
# Short module-level aliases for the columns used by the cells below.
# Column meanings are inferred from their names and from how later cells use
# them -- TODO confirm against the preprocessor that produced the pickle.
case_data = scored_df['case_text']  # only its length is used below, as the number of cases
all_batch_labels = scored_df['batch_labels']  # nested golden / non-golden labels per case
linear_preds = scored_df['pred_outcomes']  # scores reported as "Linear"
pos_preds = scored_df['pos_update']  # scores reported as "Positional"
batch_model_preds = scored_df['bin_class_update']  # scores reported as "Model"
all_summaries = scored_df['summaries']  # len() of each entry is used as k

In [None]:
def plot_by_k(topk_percent, k_list, title = 'Average Score as Function of K'):
  '''
  Bar-chart the mean golden fraction for every distinct value of k.

  Entry i of topk_percent is grouped under the key k_list[i]. Each group is
  averaged, scaled to a percentage out of 100 and rounded to two decimals.
  Bars appear in the order in which each k is first seen.

  Args:
    topk_percent: Per-case fractions, indexed in parallel with k_list.
    k_list: Per-case k values.
    title: Title shown above the chart.
  '''
  #Bucket the per-case fractions by their k
  grouped = {}
  for idx in range(len(k_list)):
    grouped.setdefault(k_list[idx], []).append(topk_percent[idx])

  #Reduce every bucket to its mean, expressed as a rounded percentage
  averages = {}
  for k_value, fractions in grouped.items():
    mean_fraction = sum(fractions) / len(fractions)
    averages[k_value] = round(100 * mean_fraction, 2)

  #One bar per distinct k
  plt.bar(averages.keys(), averages.values())

  plt.title(title)
  plt.xlabel('K')
  plt.ylabel('% of Top K Sentences that are Golden')

  plt.show()

In [None]:
#Start with Linear Analysis
# Runs top_k on the linear_preds scores for every case, once with k and once
# with 2*k, then plots the results grouped by k and prints the overall means.

linear_topk_percent = [] #Stores, for each case, the fraction (0-1) of top-k sentences that are golden
linear_top2k_percent = [] #Same, for the top 2*k sentences
linear_k = [] #Stores the k number for each case
linear_2k = [] #Stores the 2*k number for each case

for i in range(len(case_data)):

  # k is the length of this case's summary entry
  # (presumably the number of golden sentences -- TODO confirm the entry is a list of sentences)
  k = len(all_summaries[i])

  case_scores = linear_preds[i]
  case_labels = all_batch_labels[i]

  # top_k hands back the k it was given, so k is unchanged by this assignment
  percent_golden, k = top_k(case_labels, case_scores, k)

  two_linear_percent_golden, two_k = top_k(case_labels, case_scores, 2*k)

  linear_topk_percent.append(percent_golden)
  linear_k.append(k)
  linear_top2k_percent.append(two_linear_percent_golden)
  linear_2k.append(two_k)

# The top-2k chart is drawn with plot_by_k's default title
plot_by_k(linear_topk_percent, linear_k, 'Linear Aggregation')
plot_by_k(linear_top2k_percent, linear_2k)

# The printed means are fractions in [0, 1]; they are not scaled to 100
print('Linear Results --->')
print('Average Percent of top-k sentences:', sum(linear_topk_percent) / len(linear_topk_percent))
print('Average Percent of top-2k sentences:', sum(linear_top2k_percent) / len(linear_top2k_percent))

In [None]:
#Next we do Positional Results
# Runs top_k on the pos_preds scores for every case, once with k and once
# with 2*k, then plots the results grouped by k and prints the overall means.

pos_topk_percent = [] #Stores, for each case, the fraction (0-1) of top-k sentences that are golden
pos_top2k_percent = [] #Same, for the top 2*k sentences
pos_k = [] #Stores the k number for each case
pos_2k = [] #Stores the 2*k number for each case

for i in range(len(case_data)):

  # k is the length of this case's summary entry
  # (presumably the number of golden sentences -- TODO confirm the entry is a list of sentences)
  k = len(all_summaries[i])

  case_scores = pos_preds[i]
  case_labels = all_batch_labels[i]

  # top_k hands back the k it was given, so k is unchanged by this assignment
  percent_golden, k = top_k(case_labels, case_scores, k)

  two_pos_percent_golden, two_k = top_k(case_labels, case_scores, k*2)

  pos_topk_percent.append(percent_golden)
  pos_k.append(k)
  pos_top2k_percent.append(two_pos_percent_golden)
  pos_2k.append(two_k)

# Figure title spelling corrected (was 'Poistional Aggregation').
# The top-2k chart is drawn with plot_by_k's default title.
plot_by_k(pos_topk_percent, pos_k, 'Positional Aggregation')
plot_by_k(pos_top2k_percent, pos_2k)

# The printed means are fractions in [0, 1]; they are not scaled to 100
print('Positional Results --->')
print('Average Percent of top-k sentences:', sum(pos_topk_percent) / len(pos_topk_percent))
print('Average Percent of top-2k sentences:', sum(pos_top2k_percent) / len(pos_top2k_percent))

In [None]:
#Next we do Model Results
# Runs top_k on the batch_model_preds scores for every case, once with k and
# once with 2*k, then plots the results grouped by k and prints the overall means.

model_topk_percent = [] #Stores, for each case, the fraction (0-1) of top-k sentences that are golden
model_top2k_percent = [] #Same, for the top 2*k sentences
model_k = [] #Stores the k number for each case
model_2k = [] #Stores the 2*k number for each case

for i in range(len(case_data)):

  # k is the length of this case's summary entry
  # (presumably the number of golden sentences -- TODO confirm the entry is a list of sentences)
  k = len(all_summaries[i])

  case_scores = batch_model_preds[i]
  case_labels = all_batch_labels[i]

  # top_k hands back the k it was given, so k is unchanged by this assignment
  percent_golden, k = top_k(case_labels, case_scores, k)

  two_model_percent_golden, two_k = top_k(case_labels, case_scores, k*2)

  model_topk_percent.append(percent_golden)
  model_k.append(k)
  model_top2k_percent.append(two_model_percent_golden)
  model_2k.append(two_k)

# Group by this cell's own k lists. The previous version passed pos_k / pos_2k,
# which only exist if the positional cell has been run first.
# Figure title spelling corrected (was 'Model Aggreagation').
plot_by_k(model_topk_percent, model_k, title = 'Model Aggregation')
plot_by_k(model_top2k_percent, model_2k)

# The printed means are fractions in [0, 1]; they are not scaled to 100
print('Model Results --->')
print('Average Percent of top-k sentences:', sum(model_topk_percent) / len(model_topk_percent))
print('Average Percent of top-2k sentences:', sum(model_top2k_percent) / len(model_top2k_percent))

Create a plot that dives deeper into where golden sentences are located when ordered after aggregation

In [None]:
def exp(sent_in_sum, pred_scores):
  '''
  Return one case's sentence labels ordered from highest to lowest score.

  Both inputs are flattened across two levels of nesting and paired up by
  position. The pairs are ranked by score alone; sentences with equal scores
  keep their original relative order, so the label never influences the
  ranking.

  Args:
    sent_in_sum: Nested iterable of labels for a single case.
    pred_scores: Nested iterable of scores laid out like sent_in_sum. A score
      that is itself a list contributes only its first element.

  Returns:
    A list of labels; element r is the label of the sentence ranked r-th by
    score (0 = highest score).
  '''

  #Flatten the labels for the sentences within a case
  case_sent_labels = [label for batch in sent_in_sum for label in batch]

  #Flatten the predicted scores, unwrapping scores that are stored as a list
  case_pred_scores = [
      score[0] if isinstance(score, list) else score
      for batch in pred_scores
      for score in batch
  ]

  #Rank (score, label) pairs by score only; sorted() is stable, so ties stay in input order
  ranked = sorted(
      zip(case_pred_scores, case_sent_labels),
      key=lambda pair: pair[0],
      reverse=True,
  )

  #Only the labels are needed, in ranked order
  return [label for _, label in ranked]

In [None]:
# For every case, collect its sentence labels ranked by the batch-model scores.
result = []
for i in range(len(case_data)):
  case_scores = batch_model_preds[i]
  case_labels = all_batch_labels[i]
  list_of_yeses = exp(case_labels, case_scores)
  result.append(list_of_yeses)

import numpy as np
import matplotlib.pyplot as plt

# The longest ranked list sets the width of the matrix built below.
max_length = max(map(len, result))

# Right-pad shorter lists with 0 so every row has max_length entries.
padded_list = [ranked + [0] * (max_length - len(ranked)) for ranked in result]

# Rows are cases, columns are rank positions.
arr = np.array(padded_list)

# Per rank position, count the cases whose label at that rank equals 1.
hist = (arr == 1).sum(axis=0)

# Keep at most the first 100 rank positions.
hist = hist[:100]

# Bar chart of the golden-sentence count per rank position.
plt.bar(range(len(hist)), hist)
plt.xlim(0, len(hist)-1)
plt.xticks(range(0, len(hist), 20))
plt.title('Weighted by Batch Model')
plt.xlabel('Position from Ordered Scores')
plt.ylabel('Number of Golden Sentences')
plt.show()

In [None]:
# For every case, collect its sentence labels ranked by the positional scores.
result = []
for i in range(len(case_data)):
  case_scores = pos_preds[i]
  case_labels = all_batch_labels[i]
  list_of_yeses = exp(case_labels, case_scores)
  result.append(list_of_yeses)

import numpy as np
import matplotlib.pyplot as plt

# The longest ranked list sets the width of the matrix built below.
max_length = max(map(len, result))

# Right-pad shorter lists with 0 so every row has max_length entries.
padded_list = [ranked + [0] * (max_length - len(ranked)) for ranked in result]

# Rows are cases, columns are rank positions.
arr = np.array(padded_list)

# Per rank position, count the cases whose label at that rank equals 1.
hist = (arr == 1).sum(axis=0)

# Keep at most the first 100 rank positions.
hist = hist[:100]

# Bar chart of the golden-sentence count per rank position.
plt.bar(range(len(hist)), hist)
plt.xlim(0, len(hist)-1)
plt.title('Weighted by Position')
plt.xticks(range(0, len(hist), 20))
plt.xlabel('Position from Ordered Scores')
plt.ylabel('Number of Golden Sentences')
plt.show()

In [None]:
# For every case, collect its sentence labels ranked by the linear scores.
result = []
for i in range(len(case_data)):
  case_scores = linear_preds[i]
  case_labels = all_batch_labels[i]
  list_of_yeses = exp(case_labels, case_scores)
  result.append(list_of_yeses)

import numpy as np
import matplotlib.pyplot as plt

# The longest ranked list sets the width of the matrix built below.
max_length = max(map(len, result))

# Right-pad shorter lists with 0 so every row has max_length entries.
padded_list = [ranked + [0] * (max_length - len(ranked)) for ranked in result]

# Rows are cases, columns are rank positions.
arr = np.array(padded_list)

# Per rank position, count the cases whose label at that rank equals 1.
hist = (arr == 1).sum(axis=0)

# Keep at most the first 100 rank positions.
hist = hist[:100]

# Bar chart of the golden-sentence count per rank position.
plt.bar(range(len(hist)), hist)
plt.xlim(0, len(hist)-1)
plt.title('Linear Ordering')
plt.xticks(range(0, len(hist), 20))
plt.xlabel('Position from Ordered Scores')
plt.ylabel('Number of Golden Sentences')
plt.show()