### Counting the total number of Cases in database

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Input locations on the Google Drive mounted at /content/gdrive.
# summary_file: pickle loaded later with read_through_pickle().
# all_opinion_data: xz-compressed file read line by line, one JSON record per line.
summary_file = '/content/gdrive/MyDrive/Thesis/Data/unique_citations_origin_dict.pickle'
all_opinion_data = '/content/gdrive/MyDrive/Thesis/Data/case_data.jsonl.xz'

In [None]:
import json
import pandas as pd
import pickle
import lzma

Number of cases in the case data

In [None]:
# Count the lines in the compressed case file, echoing the running total
# every 100,000 lines so progress is visible on a long scan.
count = 0  # stays 0 if the file has no lines

with lzma.open(all_opinion_data) as in_file:
  for count, _line in enumerate(in_file, start=1):
    if count % 100000 == 0:
      print(count)

print('Final Number:', count)

### Visualizing the Metadata included for a case

See details of the case data

In [None]:
# Decode and print only the first record of the case file to inspect its structure.
with lzma.open(all_opinion_data) as in_file:
  first_line = next(iter(in_file), None)  # None when the file has no lines
  if first_line is not None:
    cases = json.loads(first_line.decode('utf8'))
    print(cases)

In [None]:
import graphviz

def draw_tree(node):
    '''
    Render a nested dict/list structure as a tree and write it to 'tree.png'.

    node: the nested structure (dicts and lists) to draw.
    '''
    G = graphviz.Digraph(format='png')
    data_to_graphviz(node, G, parent=None)
    G.render('tree', view=True)

def data_to_graphviz(data, G, parent):
    '''
    Recursively add one graph node per dict key / list index found in `data`.

    data:   the (sub)structure to walk; anything that is not a dict or list is a leaf
            and adds nothing.
    G:      graph object exposing node(name, label=...) and edge(tail, head).
    parent: id of the node the children are attached to, or None at the root.

    Node ids are path-qualified ("parent/child") while labels stay the bare key or
    index. Using the bare key as the id made equal keys in different branches (and
    every list index 0, 1, ...) collapse into a single graph node.
    '''
    if isinstance(data, dict):
        children = data.items()
    elif isinstance(data, list):
        children = enumerate(data)
    else:
        return

    for key, value in children:
        label = str(key)
        # Top-level ids are unchanged; nested ids carry the path to stay unique.
        node_id = label if parent is None else f"{parent}/{label}"
        G.node(node_id, label=label)
        if parent is not None:
            G.edge(parent, node_id)
        data_to_graphviz(value, G, node_id)

# Draw the structure of `cases`, the record decoded in the previous cell.
draw_tree(cases)

### Analyzing the number of summaries typically in a case

In [None]:
def read_through_pickle(file):
	'''
	Load and return the first object stored in a pickle file.

	file: path to the pickle file.

	Returns the unpickled object (in this notebook, the citations dictionary).
	Raises EOFError if the file is empty or truncated. Previously an error string
	was returned instead, which callers would then index as if it were the dictionary.
	'''
	# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
	with open(file, 'rb') as openfile:
		try:
			return pickle.load(openfile)
		except EOFError as err:
			raise EOFError(f"Pickle file {file!r} is empty or truncated") from err

# Number of entries stored for each key of the summaries dictionary.
# A comprehension keeps the loop variable local; the original loop bound the name
# `id` at notebook scope, shadowing the builtin for every later cell.
summaries = read_through_pickle(summary_file)
num_sums = [len(summaries[case_id]) for case_id in summaries]



In [None]:
# Summary statistics over num_sums (one count per case).
print('Total Number of cases:', len(num_sums))
# NOTE(review): this is the mean number of summaries per case, not a text length.
print('Average Summary Length:', sum(num_sums)/len(num_sums))
print('Min Num Sums:', min(num_sums))
# The maximum was previously printed under the 'Min Num Sums:' label.
print('Max Num Sums:', max(num_sums))

In [None]:
from matplotlib import pyplot as plt

# Bin edges 0, 5, ..., 55 (width 5).
bins = list(range(0, 60, 5))

# Computed from the data so the marker and its label cannot drift from num_sums.
mean_num_sums = sum(num_sums) / len(num_sums)

plt.hist(num_sums, bins = bins)
# The x axis carries the values of num_sums (citations per opinion) and the y axis
# carries how many opinions fall in each bin; the two labels were swapped before.
plt.xlabel('Number of Citations')
plt.ylabel('Number of Opinions')
plt.title('Number of Citations per Opinion')

# Vertical marker at the mean, with its label just to the right of the line.
plt.axvline(x=mean_num_sums, color='red')
plt.text(mean_num_sums + 1, plt.ylim()[1]*0.9, f'average = {mean_num_sums:.2f}', ha='left', va='center', color='red')

plt.show()

### Find where the summaries typically occur within a case and fit a curve to it

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import json
import pandas as pd
import pickle
import lzma

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Same input locations as defined at the top of the notebook, repeated so this
# section can be run on its own after mounting the drive.
summary_file = '/content/gdrive/MyDrive/Thesis/Data/unique_citations_origin_dict.pickle'
all_opinion_data = '/content/gdrive/MyDrive/Thesis/Data/case_data.jsonl.xz'

In [None]:
def read_through_pickle(file):
	'''
	Load and return the first object stored in a pickle file.

	file: path to the pickle file.

	Returns the unpickled object (in this notebook, the citations dictionary).
	Raises EOFError if the file is empty or truncated. Previously an error string
	was returned instead, which callers would then index as if it were the dictionary.
	'''
	# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
	with open(file, 'rb') as openfile:
		try:
			return pickle.load(openfile)
		except EOFError as err:
			raise EOFError(f"Pickle file {file!r} is empty or truncated") from err

In [None]:
def make_summaries_sentences(summaries):
  '''
  Flatten a list of summary strings into one list of sentences.

  Each summary is split with sent_tokenize and the resulting sentences are
  concatenated in the order the summaries were given.
  '''
  return [sentence for summ in summaries for sentence in sent_tokenize(summ)]

In [None]:
# Load the summaries pickle; later cells look entries up by case id and read
# element [1] of each stored item as the summary text.
id_to_sum = read_through_pickle(summary_file)

In [None]:
# Collect the opinion text and summary sentences for the first MAX_CASES cases
# in the case file that also have an entry in id_to_sum.
MAX_CASES = 1500  # sample size; was a bare literal inside the loop

df_dict = {'case_text': [], 'summaries': []} #Will be {case_text: ____, summaries: _____}
count = 0  # number of matched cases collected so far
with lzma.open(all_opinion_data) as in_file:
  for line in in_file:

    if count >= MAX_CASES:
      break

    cases = json.loads(str(line,'utf8'))
    case_id = cases['id']

    if case_id not in id_to_sum:
      continue

    opinions = cases['casebody']['data']['opinions']
    if not opinions:
      # A matched case without any opinion text cannot be analysed; skipping it
      # avoids an IndexError on opinions[0].
      continue

    count += 1

    case_text = opinions[0]['text']  # only the first opinion is used
    case_summaries = [x[1] for x in id_to_sum[case_id]]

    df_dict['case_text'].append(case_text)
    df_dict['summaries'].append(make_summaries_sentences(case_summaries)) #Making sure the summaries consist of a list of individual sentences

# `count` is the number of cases collected (the old message said "summaries").
print("We are analyzing", count, "cases")

In [None]:
def get_cos_sim(summ, pred):
  '''
  Cosine similarity between two sentences over their sets of non-stop-word tokens.

  Both sentences are tokenised with word_tokenize, English stop words are removed,
  and each sentence becomes a binary bag-of-words vector. For binary vectors the
  cosine is |X & Y| / sqrt(|X| * |Y|), which is what the previous explicit 0/1
  vectors computed.

  summ, pred: the two sentences (strings) to compare.

  Returns a float in [0, 1]; 0.0 when either sentence has no tokens left after
  stop-word removal. That case used to raise ZeroDivisionError because the
  zero-denominator guard fell through to the division.
  '''
  # A set gives O(1) membership tests; the stop-word list was scanned linearly before.
  sw = set(stopwords.words('english'))

  # NOTE(review): tokens are not lower-cased before the stop-word check, so
  # capitalised stop words (e.g. "The") are kept — confirm this is intended.
  X_set = {w for w in word_tokenize(summ) if w not in sw}
  Y_set = {w for w in word_tokenize(pred) if w not in sw}

  denominator = float((len(X_set) * len(Y_set)) ** 0.5)
  if denominator == 0:
    return 0.0

  return len(X_set & Y_set) / denominator

def is_sent_in_summ(s, relevant_summ):
  '''
  Return True if sentence `s` has cosine similarity of at least 0.7 with any
  sentence in `relevant_summ`, otherwise False. Stops at the first match.
  '''
  return any(get_cos_sim(s, summ) >= 0.7 for summ in relevant_summ)

In [None]:
# For every collected case, label each sentence of the opinion text with 1 if it
# matches one of that case's summary sentences (see is_sent_in_summ) and 0 otherwise.
list_of_yes_no = []

total_cases = len(df_dict['case_text'])

# Loop names are distinct from the notebook-level `summaries` dictionary, which
# the original loop overwrote.
for i, (case_text, case_summaries) in enumerate(zip(df_dict['case_text'], df_dict['summaries'])):

  if i % 100 == 0:
    # Report against the real number of cases rather than a hard-coded 1500.
    print('Completed', i, '/', total_cases)

  list_of_yes_no.append(
    [1 if is_sent_in_summ(s, case_summaries) else 0 for s in sent_tokenize(case_text)]
  )

In [None]:
# Persist the per-sentence 0/1 labels so the slow labelling cell above does not
# have to be re-run; `on` is the output path on the mounted drive.
on = '/content/gdrive/MyDrive/Thesis/Data/exploration_holder.pickle'
with open(on, 'wb') as handle:   # binary mode is required for pickle
    pickle.dump(list_of_yes_no, handle)

In [None]:
import pandas as pd
# Reload the labels saved by the previous cell (same path), replacing the in-memory copy.
list_of_yes_no = pd.read_pickle('/content/gdrive/MyDrive/Thesis/Data/exploration_holder.pickle')

In [None]:
# Split each case's 0/1 sentence labels into `num_batches` consecutive chunks
# (by position in the opinion) and record the sum of the labels in every chunk.
import numpy as np

num_batches = 50

# np.array_split tolerates lengths that are not a multiple of num_batches
# (chunk sizes then differ by at most one).
new_list = [
  [sum(chunk) for chunk in np.array_split(labels, num_batches)]
  for labels in list_of_yes_no
]

# Sanity check: number of cases, then number of chunks for the first case.
print(len(new_list))
print(len(new_list[0]))

In [None]:
# Column-wise totals: sums[k] is the number of matched sentences that fall in
# position chunk k, added up over all cases.
sums = [sum(x) for x in zip(*new_list)]
print(sums)

# Average per case: divide by the number of cases. The previous code divided by
# len(sums), which is the number of chunks and only rescales by an unrelated constant.
num_cases = len(new_list)
avg = [s / num_cases for s in sums]

In [None]:
from matplotlib import pyplot as plt

# One evenly spaced x position in [0, 1] per entry of `avg`. The count is taken
# from the data instead of a literal 50 so it cannot fall out of sync with the
# number of chunks (a mismatch makes scatter raise).
bins = np.linspace(0, 1, len(avg))

plt.scatter(bins, avg)
plt.xlabel('Position')
plt.ylabel('Number of Golden Summaries')
plt.title('The Number of  Golden Summaries as a Function of Position in Judicial Opinion')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

def gaussian(x, a, b, c):
  '''
  Gaussian curve a * exp(-(x - b)^2 / (2 c^2)).

  x: scalar or numpy array of evaluation points.
  a: peak height, b: centre, c: width parameter.
  '''
  exponent = -(x - b)**2 / (2 * c**2)
  return a * np.exp(exponent)


# Fit the gaussian model to the per-position averages and overlay it on the data.
ydata = avg
# One x position per data point; derived from ydata rather than a literal 50 so
# the two arrays always have matching lengths.
xdata = np.linspace(0, 1, len(ydata))

# popt holds the fitted (a, b, c); pcov is their estimated covariance.
popt, pcov = curve_fit(gaussian, xdata, ydata)

plt.scatter(xdata, ydata, label='data')
plt.plot(xdata, gaussian(xdata, *popt), 'r-', label='fit')
plt.legend()
plt.xlabel('Position')
plt.ylabel('Number of Golden Summaries')
plt.title('Golden Summaries as a Function of Position')
plt.show()

In [None]:
# Least-squares quadratic through the same (xdata, ydata) points, drawn over the data.
p = np.polyfit(xdata, ydata, 2)   # coefficients, highest power first
parabola = np.poly1d(p)           # callable polynomial built from them

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
ax.plot(xdata, parabola(xdata), c='r')
ax.set_xlabel('Position')
ax.set_ylabel('Number of Golden Summaries')
ax.set_title('Golden Summaries as a Function of Position')
plt.show()

### Manual Analysis of Training and Testing Accuracies for Sentence Classification Model 

In [None]:
training_accuracy = [79.24908293174136, 81.5572113440767, 81.19195046439629, 81.87350743394191, 82.22430909578749, 82.4095646842428, 82.18759869459943, 82.27832359159308, 82.28307489277466, 82.41335044929397, 82.85999833430499, 82.43364357118234, 84.80903402143423, 84.85531103794837, 84.51152390780874, 85.22455897080349, 85.87335658706735, 85.60392397302269, 85.38793557216549, 85.76047755254322, 85.62630594963159, 86.13607188703466, 85.94986258016158, 85.984143398828, 88.52046320937927, 89.17576387685554, 89.078087375301, 90.17024882520607, 90.59565334048833, 90.0, 89.93578271396989, 90.13182439995026, 89.9153194765204, 90.37227214377407, 89.87257433163988, 90.44467425025853]

training_loss = [0.430480321930342, 0.3964337146283573, 0.39501128196866886, 0.3929519767067335, 0.3865943260699299, 0.3877102724666323, 0.38181565659101074, 0.37933855408939743, 0.3795422718904441, 0.37747477374591043, 0.3759234589406678, 0.37488947112882537, 0.3413790395744338, 0.33568213950884784, 0.34381769557964376, 0.3341206460207601, 0.31935306837078714, 0.33081409764073205, 0.3236067652571456, 0.320046638274, 0.3208178548769339, 0.31803280438210846, 0.32008092659040055, 0.3143640331861565, 0.2671699155951551, 0.2565608811336834, 0.2610187681328582, 0.24640176729076232, 0.2331043563802324, 0.2445713055723227, 0.23883051993280932, 0.23023281730260609, 0.23682344964957897, 0.23285277944124774, 0.24410878834286917, 0.22960094136263795]

validation_accuracy = [82.59493670886076, 81.33518133518133, 80.77055383556932, 81.73136167590881, 80.71351931330472, 80.06375674350171, 80.08421052631579, 82.392439691619, 79.46350043975374, 81.63696768347931, 80.7128580946036, 81.41714915908464]


In [None]:
# The accuracy and loss series are plotted side by side below; print both lengths
# to check they contain the same number of logged points.
print(len(training_accuracy))
print(len(training_loss))

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: training accuracy on the left, training loss on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

panels = (
    (ax1, training_accuracy, "Accuracy"),
    (ax2, training_loss, "Loss"),
)
for panel_ax, series, name in panels:
    panel_ax.plot(series, label=name)
    panel_ax.set_title(name)

# Disabled epoch guides, kept for reference:
#   ax.axvline(x=12, color=(0.7, 0.7, 0.7), linestyle='--')
#   ax.axvline(x=24, color=(0.7, 0.7, 0.7), linestyle='--')
#   ax1.text(4, 88, "Epoch 1"); ax1.text(16, 88, "Epoch 2"); ax1.text(28, 84, "Epoch 3")
#   ax2.text(4, 0.350, "Epoch 1"); ax2.text(16, 0.375, "Epoch 2"); ax2.text(28, 0.325, "Epoch 3")

fig.tight_layout()
plt.show()

In [None]:
# Mean accuracy and loss over the final epoch.
# The series hold 36 logged points with visible jumps at indices 12 and 24, i.e.
# 12 points per epoch over 3 epochs (assumed — confirm against the training log).
# The final epoch is therefore the last 12 points; the previous slice [25:]
# dropped the first of them.
POINTS_PER_EPOCH = 12

final_epoch_a = training_accuracy[-POINTS_PER_EPOCH:]
final_epoch_l = training_loss[-POINTS_PER_EPOCH:]

avg_a = sum(final_epoch_a) / len(final_epoch_a)
print(avg_a)

avg_l = sum(final_epoch_l) / len(final_epoch_l)
print(avg_l)

In [None]:
import matplotlib.pyplot as plt

# One bar per entry of validation_accuracy.
plt.bar(range(len(validation_accuracy)), validation_accuracy)

# Mean validation accuracy, printed and drawn as a horizontal reference line.
average = sum(validation_accuracy) / len(validation_accuracy)
print(average)

plt.axhline(y=average, color='red', linestyle='-')
# Annotation placed to the right of the last bar; the number is formatted from
# `average` instead of being typed in by hand, so it always matches the line.
plt.text(12.10, 81, "Average Accuracy:")
plt.text(13.1, 76, f"{average:.2f}")

# Set the title and axis labels
plt.title("Batch Accuracy of Validation Set")
plt.xlabel("Batch")
plt.ylabel("Accuracy")

# Show the plot
plt.show()

### Finding the Number of Examples used in Training and Testing for Sentence Classification Model

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Build the paths of the sentence-classification train/test pickles, one pair per
# file number, in the order given by file_numbers.
folder_location = '/content/gdrive/MyDrive/Thesis/Data/TrainTestBinClass/'
file_numbers = ['1000', '2000', '3000', '4000', '5000', '5999', '7000', '8000', '9000', '10000', '11000', '12000']

list_of_training_files = [f"{folder_location}{number}_training_bdf.pickle" for number in file_numbers]
list_of_testing_files = [f"{folder_location}{number}_testing_bdf.pickle" for number in file_numbers]

In [None]:
import pandas as pd

# Accumulate the sum of the `labels` column over every training / testing pickle.
# NOTE(review): if `labels` is 0/1 this counts positive examples only, not all
# rows — confirm which figure the section heading is meant to report.
num_training_examples = 0
num_testing_examples = 0

for train_path, test_path in zip(list_of_training_files, list_of_testing_files):
  num_training_examples += sum(pd.read_pickle(train_path)['labels'])
  num_testing_examples += sum(pd.read_pickle(test_path)['labels'])

print(num_training_examples)
print(num_testing_examples)



### Manual Analysis of Training and Testing Accuracies for Batch Classification Model 

In [None]:
b_training_accuracy = [69.36797926181953, 72.18717139852787, 72.68702463389715, 73.50363000806668, 73.38378120717393, 74.0345937248592, 73.54048319759086, 75.15121589927169, 74.65168243953732, 75.50232716539902, 75.6224791610648, 76.94095215396533, 76.24698310539019, 76.44925056464308, 77.85458585359832, 77.66824395373291, 78.81711885571575, 78.9782199515999, 80.62920812727164, 80.13374899436846, 80.31620012319485]


b_training_loss = [0.5586259931489601, 0.5407131295396577, 0.5217077449310854, 0.5157782530749029, 0.5202178691111654, 0.5134308044832389, 0.5155074921376185, 0.49762741674332744, 0.5039870651932404, 0.48448406870220023, 0.4807426348934781, 0.4760561283261429, 0.48007301377298867,  0.4754189599704129, 0.45426987014275017, 0.45878450500266676, 0.4359722288790562, 0.4318719594891626, 0.4199761555632674, 0.4246197180977587, 0.40937716163714744]


b_validation_accuracy = [72.79684028634905, 73.39642481598318, 72.93984108967084, 72.91890729189073, 72.59294566253575, 73.43655741001407, 74.65097180399671]

In [None]:
# The batch-model accuracy and loss series are plotted side by side below; print
# both lengths to check they contain the same number of logged points.
print(len(b_training_accuracy))
print(len(b_training_loss))

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels for the batch model: training accuracy (left), loss (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

panels = (
    (ax1, b_training_accuracy, "Accuracy"),
    (ax2, b_training_loss, "Loss"),
)
for panel_ax, series, name in panels:
    panel_ax.plot(series, label=name)
    panel_ax.set_title(name)

# Disabled epoch guides, kept for reference:
#   ax.axvline(x=12, color=(0.7, 0.7, 0.7), linestyle='--')
#   ax.axvline(x=24, color=(0.7, 0.7, 0.7), linestyle='--')
#   ax1.text(4, 88, "Epoch 1"); ax1.text(16, 88, "Epoch 2"); ax1.text(28, 84, "Epoch 3")
#   ax2.text(4, 0.350, "Epoch 1"); ax2.text(16, 0.375, "Epoch 2"); ax2.text(28, 0.325, "Epoch 3")

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt


# One bar per entry of b_validation_accuracy.
plt.bar(range(len(b_validation_accuracy)), b_validation_accuracy)

# Mean validation accuracy, printed and drawn as a horizontal reference line.
average = sum(b_validation_accuracy) / len(b_validation_accuracy)
print(average)

plt.axhline(y=average, color='red', linestyle='-')
# Annotation placed to the right of the last bar; the number is formatted from
# `average` instead of being typed in by hand, so it always matches the line.
plt.text(6.9, 73, "Average Accuracy:")
plt.text(7.6, 68, f"{average:.2f}")

# Set the title and axis labels
plt.title("Batch Accuracy of Validation Set")
plt.xlabel("Batch")
plt.ylabel("Accuracy")

# Show the plot
plt.show()

### Finding the Number of Examples used in Training and Testing for Batch Classification Model

In [None]:
# Build the paths of the batch-classification train/test pickles, one pair per
# file number, in the order given by file_numbers.
folder_location = '/content/gdrive/MyDrive/Thesis/Data/BATCHClassTrainTest/'

file_numbers = ['2000', '4000', '6000', '8000', '16000', '18000', '19999']

list_of_training_files = [f"{folder_location}{number}_batch_training_bdf.pickle" for number in file_numbers]
list_of_testing_files = [f"{folder_location}{number}_batch_testing_bdf.pickle" for number in file_numbers]


import pandas as pd

# Accumulate the sum of the `contains_summ` column over every batch training /
# testing pickle.
# NOTE(review): if `contains_summ` is 0/1 this counts positive batches only, not
# all rows — confirm which figure the section heading is meant to report.
num_training_examples = 0
num_testing_examples = 0

for train_path, test_path in zip(list_of_training_files, list_of_testing_files):
  num_training_examples += sum(pd.read_pickle(train_path)['contains_summ'])
  num_testing_examples += sum(pd.read_pickle(test_path)['contains_summ'])

print(num_training_examples)
print(num_testing_examples)