In [1]:
# import packages
import os
from glob import glob
import pandas as pd
import numpy as np
import re
import nltk

## MODIFY THIS
# get path to your folder that holds the txt files
source_files = "C:/Users/jacqu/Downloads/Court Case PDFs/Court Case TXTs"
# outputs a list of all the txt files in the folder
source_file_list = sorted(glob(f"{source_files}/*.txt"))

# creates a list of tuples with an elememt for the source path and
# for the file title
file_data = []
for source_file_path in source_file_list:
    # split might be different, recommend checking with INFO.sample() or .head()
    file_title = source_file_path.split('\\')[-1].split(".txt")[0]
    file_data.append((source_file_path, file_title))

# creating df with the file title as the index and source path as a col
INFO = pd.DataFrame(file_data, columns=['txt_path','file_title'])\
    .set_index('file_title').sort_index()
# attempt at dropping any duplicate files with same file name
# this only works if same file has the SAME NAME
# See Notes below
INFO = INFO[~INFO.index.duplicated(keep='first')]

**Notes** I envision the drop dups code piece to be helpful in the case that a file is downloaded on two different environments (hence, they will have the same name) and then the code is run. This will NOT take care of dups that have DIFFERENT file names...

In [2]:
INFO.sample(10)

Unnamed: 0_level_0,txt_path
file_title,Unnamed: 1_level_1
"A.D. v. Choice Hotels Int_l, Inc., 2023 U.S. Dist. LEXIS 150380",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Mahee, 2023 U.S. Dist. LEXIS 216654",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Williams v. Sisolak, 2023 U.S. App. LEXIS 32338",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"People v. Pitcher, 2017 Cal. App. Unpub. LEXIS 2838",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
People v. Lamb_ 37 N.Y.3d 1174,C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"State v. Davis, 2023-Ohio-4389",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Samsung Fire _ Marine Ins. Co., Ltd. v. UFVS Mgmt. Co., LLC, 2023 U.S. Dist. LEXIS 46508",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"State v. Jose Y., 2023 N.J. Super. Unpub. LEXIS 2234",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Paul, 2020 U.S. Dist. LEXIS 72619",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Solis v. Okeechobee Shooting Sports, LLC, 2023 U.S. Dist. LEXIS 216887",C:/Users/jacqu/Downloads/Court Case PDFs/Court...


In [3]:
INFO.size

77

In [4]:
# opening the file in read mode 
my_file = open(INFO.txt_path[0], "r") 
# reading the file 
narrative = my_file.read() 

### Making df with sentence number as index

In [5]:
SENTS = nltk.sent_tokenize(narrative)
SENTS[10:20]

['On June 13, 2023,  Defendant Summit Hotel TRS 085, LLC ("Summit  Hotel") filed a motion to dismiss Plaintiff\'s complaint in  its entirety with [*2]  prejudice for failure to state a claim.',
 'ECF 17 at 1.',
 "In the alternative, Defendant moves to  strike portions of Plaintiff's complaint or the complaint in  its entirety.",
 'Id.',
 'at 1-2.',
 'Plaintiff filed a Response in  Opposition on June 27, 2023, ECF 22, and Defendant  filed a Reply on July 11, 2023, ECF 25.',
 'On August 10,  2023, Plaintiff filed a Notice of Supplemental Authority.',
 'ECF 29.',
 "Before this Court is Defendant Summit Hotel's Motion to  Dismiss.",
 'ECF 17.']

In [40]:
df = pd.DataFrame()
df['sent_str'] = nltk.sent_tokenize(narrative)
df.sent_str = df.sent_str.str.strip()
df.index.name = "sent_num"
df.sample(10)

Unnamed: 0_level_0,sent_str
sent_num,Unnamed: 1_level_1
239,18 U.S.C.
54,Plaintiff's Trafficking at the Residence Inn ...
119,Id.
182,To state a financial beneficiary claim under S...
214,"Plaintiff alleges that ""[t]here was a continu..."
73,These signs of sex trafficking included Plai...
274,at Â¶ 93.
79,"LEXIS 143289, *2 Page 3 of 6 Inn Portland empl..."
289,"Accordingly, this Court GRANTS Defendants' mot..."
71,Id.


In [58]:
def NE_dict(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    
    named_entities_dict = {}
    for entity in chunked:
        if isinstance(entity, nltk.tree.Tree):
            entity_label = entity.label()
            entity_text = " ".join([word for word, tag in entity.leaves()])
            named_entities_dict[entity_label] = [entity_text]
    
    return named_entities_dict

In [70]:
entities_dict = {}
for i in df.sent_str:
    more_entities_dict = NE_dict(i)
    # Merge dictionaries while dropping repeats in values
    entities_dict = {key: list(set(entities_dict.get(key, []) + more_entities_dict.get(key, []))) for key in set(entities_dict) | set(more_entities_dict)}

print(entities_dict['PERSON'])
print(entities_dict['ORGANIZATION'])

['Advanced Textile', 'Defendant', 'Mindgeek USA', 'Starr', 'Civil', 'Shroyer', 'Summit Hotel', 'Plaintiff', 'Howell', 'Hilton Worldwide Holdings', 'Boehm', 'Reddit', 'Red Roof Inns', 'Iqbal', 'District Judge', 'Residence Inn Portland Airport', 'Baca', 'Educ', 'Resorts', 'Twombly']
['TVPRA', 'Supplemental Authority', 'Shilo Inn Salem', 'Residence Inn Portland', 'DISMISS', 'Bell Atl', 'ECF', 'LEGAL', 'Sex', 'WHO', 'Ninth Circuit', 'Defendant', 'LLC', 'Inn Portland Airport', 'Federal Rule', 'Defendant Summit Hotel', 'Defendants', 'XXIII']


### Making df with two indices: sentence number and token number

In [7]:
# df.sent_str.apply(lambda x: x.split())

df1 = df.sent_str\
.apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)), dtype='object'))\
.stack().to_frame("token_pos")
df1.index.names = ["sent_num", "token_num"]
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,token_pos
sent_num,token_num,Unnamed: 2_level_1
0,0,"(OPINION, NN)"
0,1,"(AND, CC)"
0,2,"(ORDER, NNP)"
0,3,"(GRANTING, NNP)"
0,4,"(DEFENDANT, NNP)"


In [8]:
#df['token_str'] = 
df1['token_str'] = df1.token_pos.apply(lambda x: x[0].strip())
df1['term_str'] = df1.token_pos.apply(lambda x: x[0].lower().strip())
TOKEN = df1
df1['pos_tag'] = df1.token_pos.apply(lambda x: x[1])
df1 = df1.drop(columns="token_pos")
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,token_str,term_str,pos_tag
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,OPINION,opinion,NN
0,1,AND,and,CC
0,2,ORDER,order,NNP
0,3,GRANTING,granting,NNP
0,4,DEFENDANT,defendant,NNP


**Note:** Making separate df with POS tags just incase...

### Making function for getting NGram models from narrative

In [9]:
def get_ngrams(TOKEN, n=2, sent_key='sent_num'):
    
    OHCO = TOKEN.index.names
    grouper = list(OHCO)[:OHCO.index(sent_key)+1]

    PADDED = TOKEN.groupby(grouper)\
        .apply(lambda x: '<s> ' + ' '.join(x.term_str) + ' </s>')\
        .apply(lambda x: pd.Series(x.split()))\
        .stack().to_frame('term_str')
    PADDED.index.names = grouper + ['token_num']

    NGRAMS = PADDED.groupby(grouper)\
        .apply(lambda x: pd.concat([x.shift(0-i) for i in range(n)], axis=1)).reset_index(drop=True)
    NGRAMS.index = PADDED.index
    NGRAMS.columns = [f'w{j}' for j in range(n)]
    
    return NGRAMS

In [10]:
ngrams = 3
widx = [f"w{i}" for i in range(ngrams)]

In [11]:
def ngrams_to_models(ngrams):
    global widx
    n = len(ngrams.columns)
    model = [None for i in range(n)]
    for i in range(n):
        if i == 0:
            model[i] = ngrams.value_counts('w0').to_frame('n')
            model[i]['p'] = model[i].n / model[i].n.sum()
            model[i]['i'] = np.log2(1/model[i].p)
        else:
            model[i] = ngrams.value_counts(widx[:i+1]).to_frame('n')    
            model[i]['cp'] = model[i].n / model[i-1].n
            model[i]['i'] = np.log2(1/model[i].cp)
    return model

**NGram example with the narrative for sentence number 3 (index 2)**

In [19]:
NG3 = get_ngrams(TOKEN, n=3)
NG3.loc[2].head()

Unnamed: 0_level_0,w0,w1,w2
token_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,<s>,"(""plaintiff"")",filed
1,"(""plaintiff"")",filed,a
2,filed,a,complaint
3,a,complaint,against
4,complaint,against,defendants


In [22]:
M3 = ngrams_to_models(NG3)
tri = M3[2].sort_values('n')
tri.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
this,case.,</s>,1,1.0,0.0
2023,plaintiff,filed,1,1.0,0.0
21,f.4th,714,1,1.0,0.0
214,f.3d,1058,1,1.0,0.0
2012,through,march,1,1.0,0.0


**Testing for words associated with "sex" and "trafficking"**

In [25]:
tri.query('w1 == "sex"').sort_values('n', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
of,sex,trafficking,11,0.846154,0.241008
commercial,sex,activity,8,0.615385,0.70044
in,sex,trafficking.,7,0.636364,0.652077
in,sex,trafficking,3,0.272727,1.874469
the,sex,trafficking,3,0.6,0.736966
being,sex,trafficked,2,1.0,0.0
plaintiff's,sex,trafficking.,1,0.5,1.0
the,sex,"trade""",1,0.2,2.321928
the,sex,acts,1,0.2,2.321928
plaintiff's,sex,trafficking,1,0.5,1.0


In [26]:
tri.query('w1 == "trafficking"').sort_values('n', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sex,trafficking,and,4,0.210526,2.247928
sex,trafficking,at,3,0.157895,2.662965
sex,trafficking,industry,2,0.105263,3.247928
sex,trafficking,occurring,2,0.105263,3.247928
sex,trafficking,as,2,0.105263,3.247928
sex,trafficking,included,1,0.052632,4.247928
sex,trafficking,or,1,0.052632,4.247928
the,trafficking,venture.,1,0.5,1.0
sex,trafficking,by,1,0.052632,4.247928
sex,trafficking,in,1,0.052632,4.247928


In [29]:
tri.query('w1 == "forced"').sort_values('n', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"sex,",forced,"prostitution,",1,1.0,0.0


In [35]:
tri.query('w1 == "minor"').sort_values('n', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,minor,or,1,1.0,0.0


In [12]:
# list(df1.term_str.values)

In [13]:
# set(df1.term_str.values)