In [10]:
# libraries
import json
import lzma
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from IPython.core.display import display, HTML
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords  
import nltk
nltk.download('stopwords')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# pd.options.display.max_columns = 999
# pd.options.display.max_rows = 999

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qilongxin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
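# defining a function that flattens line breaks, lowercases the text and replaces
# non-letter characters with spaces (note: HTML tags are not parsed out here)
# function adapted from https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/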
# English stop words shipped with NLTK, gathered into a set.
stop_words = {*stopwords.words('english')}
def text_cleaner(text):
    """Reduce ``text`` to lowercase ASCII letters separated by spaces.

    Steps, in order:
      1. join all lines with a single space (line breaks disappear);
      2. lowercase everything;
      3. delete double-quote characters outright (no space is left behind);
      4. delete a trailing ``'s`` at a word boundary;
      5. replace every remaining character that is not a letter with a space.

    Runs of spaces are not collapsed, so the result can contain consecutive
    spaces wherever digits or punctuation used to be.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text.
    """
    flattened = " ".join(text.splitlines()).lower()
    unquoted = flattened.replace('"', '')
    # Drop possessive/contracted 's before the generic non-letter pass, so that
    # "court's" becomes "court" instead of "court s".
    no_possessive = re.sub(r"'s\b", "", unquoted)
    # Each single non-letter character becomes one space.
    return re.sub("[^a-zA-Z]", " ", no_possessive)


# setting up tokenizer: with gaps=True the pattern describes the separators, so
# this splits text on runs of whitespace. The pattern is a raw string so that
# `\s` is passed to the regex engine verbatim instead of being treated by Python
# as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

In [12]:
def get_data(state):
    """Load one state's cases and return them as a token-annotated DataFrame.

    Reads ``<state>/data/data.jsonl.xz`` (one JSON object per line), sorts the
    cases by ``decision_date``, extracts a few fields from each ``casebody``
    and tokenizes headnotes and opinions with the module-level ``tokenizer``.

    Parameters
    ----------
    state : str
        Path of the directory that contains ``data/data.jsonl.xz``.

    Returns
    -------
    pandas.DataFrame
        The original case columns plus ``judges``, ``attorneys``,
        ``headnotes`` (the case's ``head_matter``), ``opinions`` (text of the
        first opinion, or ``''`` when the case has none), the token lists
        ``headnotes_token`` / ``opinions_token`` and their lengths
        ``headnotes_num_tokens`` / ``opinions_num_tokens``.
    """
    # reading json files
    with lzma.open(state + '/data/data.jsonl.xz', 'r') as jsonl_file:
        cases = [json.loads(raw_line.decode('utf-8')) for raw_line in jsonl_file]

    df = pd.DataFrame(cases).sort_values('decision_date').reset_index(drop=True)
    df['decision_date'] = pd.to_datetime(df['decision_date'])

    # parsing data
    storage = []
    for casebody in df['casebody']:
        data = casebody['data']
        opinion_list = data['opinions']
        # A case can have no opinions. Give it an empty text explicitly so it
        # neither inherits the previous case's opinion nor raises NameError
        # when it happens to be the first row.
        opinions = opinion_list[0]['text'] if opinion_list else ''

        storage.append({'judges': data['judges'],
                        'attorneys': data['attorneys'],
                        'headnotes': data['head_matter'],
                        'opinions': opinions})
    df_parsed = pd.DataFrame(storage)
    # Both frames share the same 0..n-1 index, so this is a row-wise join.
    df = df_parsed.merge(df, left_index=True, right_index=True)

    # tokenizing headnotes and opinions
    df['headnotes_token'] = df['headnotes'].apply(tokenizer.tokenize)
    df['opinions_token'] = df['opinions'].apply(tokenizer.tokenize)
    df['headnotes_num_tokens'] = df['headnotes_token'].apply(len)
    df['opinions_num_tokens'] = df['opinions_token'].apply(len)

    return df

In [13]:
# Load the Arkansas cases, then keep only those whose headnotes AND opinion are
# both longer (in tokens) than the respective corpus mean.
df_ar = get_data('Arkansas')

above_mean_summary = df_ar['headnotes_num_tokens'] > df_ar['headnotes_num_tokens'].mean()
above_mean_text = df_ar['opinions_num_tokens'] > df_ar['opinions_num_tokens'].mean()

df_ar_to_train = df_ar.loc[above_mean_summary & above_mean_text]

In [14]:
# Draw a fixed-size sample without replacement; the fixed random_state makes the
# draw repeatable across runs.
train_size = 1000

train = df_ar_to_train.sample(n=train_size, replace=False, random_state=1)
# Inputs are the opinion texts, targets are the matching headnotes.
x_train = train['opinions'].tolist()
y_train = train['headnotes'].tolist()

In [15]:
# df_ar_to_train.to_pickle('df_ar_to_train.pkl')
from summarizer import Summarizer
#https://github.com/dmmiller612/bert-extractive-summarizer
# Import the class instead of the module: the original `rouge = rouge.Rouge()`
# overwrote the module with the instance, so re-running this cell failed with
# AttributeError. `rouge` still ends up bound to the Rouge instance.
from rouge import Rouge

model = Summarizer()
rouge = Rouge()

# ROUGE-1 F1 of every generated summary against its reference headnotes.
scores = np.zeros(len(x_train))
for i, (text, reference) in enumerate(zip(x_train, y_train)):
    result = model(text, min_length=60)
    full = ''.join(result)
    # get_scores takes (hypothesis, reference): the generated summary is the
    # hypothesis and the headnotes are the reference. F1 is symmetric in the
    # two, so the reported mean is unchanged, but precision/recall in `sc`
    # now carry their usual meaning.
    sc = rouge.get_scores(full, reference)
    scores[i] = sc[0]['rouge-1']['f']
print(np.mean(scores))

0.3318770278665225
