https://practicaldatascience.co.uk/machine-learning/how-to-use-count-vectorization-for-n-gram-analysis#google_vignette

In [15]:
import os
from glob import glob
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
## MODIFY THIS
# get path to your folder that holds the txt files
source_files = "C:/Users/jacqu/Downloads/Court Case PDFs/Court Case TXTs"
# outputs a list of all the txt files in the folder
source_file_list = sorted(glob(f"{source_files}/*.txt"))

# creates a list of tuples with an elememt for the source path and
# for the file title
file_data = []
for source_file_path in source_file_list:
    # split might be different, recommend checking with INFO.sample() or .head()
    file_title = source_file_path.split('\\')[-1].split(".txt")[0]
    file_data.append((source_file_path, file_title))

# creating df with the file title as the index and source path as a col
INFO = pd.DataFrame(file_data, columns=['txt_path','file_title'])\
    .set_index('file_title').sort_index()
# attempt at dropping any duplicate files with same file name
# this only works if same file has the SAME NAME
# See Notes below
INFO = INFO[~INFO.index.duplicated(keep='first')]

In [3]:
INFO.sample(10)

Unnamed: 0_level_0,txt_path
file_title,Unnamed: 1_level_1
"People v. Mahjoob, 2022 Cal. App. Unpub. LEXIS 2073",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Crim. Frederik Barbieri, 2023 U.S. Dist. LEXIS 217647",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Raniere, 55 F.4th 354",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Rice Enters., LLC v. RSUI Indem. Co., 2023 U.S. Dist. LEXIS 217212",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Hertzfeld v. Hertzfeld, 2023-Ohio-4411",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Wilkins, 538 F. Supp. 3d 49",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Gil, 2023 U.S. Dist. LEXIS 217887",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"State v. Taylor-Hollingsworth, 2023 Ohio App. LEXIS 4249",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"State v. Nelson, 2023 Iowa App. LEXIS 941",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"In re A.F., 2023-Ohio-4423",C:/Users/jacqu/Downloads/Court Case PDFs/Court...


In [4]:
INFO.index[0]

'A.B. v. Shilo Inn, Salem, LLC, 2023 U.S. Dist. LEXIS 143289'

In [5]:
len(INFO)

77

In [6]:
narratives_list = []
for doc_idx, txt_path in enumerate(INFO['txt_path']):
    with open(txt_path, 'r',  encoding='utf-8') as file:
        narrative = file.read()
    narratives_list.append({"title": INFO.index[doc_idx], "narrative": narrative})

# Convert the list of dictionaries to a DataFrame
narratives = pd.DataFrame(narratives_list)

In [7]:
narratives.head()

Unnamed: 0,title,narrative
0,"A.B. v. Shilo Inn, Salem, LLC, 2023 U.S. Dist....",OPINION AND ORDER GRANTING DEFENDANT SUMMIT H...
1,"A.D. v. Best Western Int_l, Inc., 2023 U.S. Di...",OPINION AND ORDER This matter comes before the...
2,"A.D. v. Choice Hotels Int_l, Inc., 2023 U.S. D...",OPINION AND ORDER This matter comes before the...
3,B.M. v. Wyndham Hotels,ORDER GRANTING IN PART AND DENYING IN PART DE...
4,"Bacon v. Marshall, 2023 U.S. App. LEXIS 32309",[*1] ORDER AND JUDGMENT* _____________________...


In [12]:
model = CountVectorizer(ngram_range = (2, 2), max_features = 100, stop_words='english')
matrix = model.fit_transform(narratives.narrative).toarray()
feature_names = model.get_feature_names_out()
df_output = pd.DataFrame(data = matrix, columns = feature_names)
df_output.T.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
trafficking victims,1,1,1,2,0,4,0,0,0,2,...,3,0,0,0,6,1,0,0,0,0
trial court,0,0,0,0,0,0,0,24,0,0,...,0,0,8,0,2,0,0,0,0,75
united states,2,3,3,3,4,9,1,0,0,3,...,29,17,94,1,85,10,4,3,3,14
unknown john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,73,0,0,0,0
years old,1,0,0,1,0,0,0,0,0,0,...,4,0,3,0,2,0,0,0,0,0


In [16]:
# opening the file in read mode 
my_file = open(INFO.txt_path[0], "r") 
# reading the file 
narrative = my_file.read() 

In [17]:
df = pd.DataFrame()
df['sent_str'] = nltk.sent_tokenize(narrative)
df.sent_str = df.sent_str.str.strip()
df.index.name = "sent_num"
df.sample(10)

Unnamed: 0_level_0,sent_str
sent_num,Unnamed: 1_level_1
268,ECF 1 at Â¶ 84.
124,Due to Plaintiff's status as a victim [*8] o...
228,"For example, Plaintiff plausibly alleges the ..."
57,"While at the Residence Inn Portland, Plaintif..."
244,"at Â¶Â¶ 49, 51."
88,Id.
115,Plaintiff references three online reviews of ...
286,"Finally, although Plaintiff highlights variou..."
5,ECF 1.
245,"While Plaintiff was with a ""buyer,"" Plaintiff'..."


In [18]:
model = CountVectorizer(ngram_range = (2, 2), max_features = 100, stop_words='english')
matrix = model.fit_transform(df.sent_str).toarray()
feature_names = model.get_feature_names_out()
df_output = pd.DataFrame(data = matrix, columns = feature_names)
df_output.T.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,280,281,282,283,284,285,286,287,288,289
trafficking occurring,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
trafficking residence,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
trs 085,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tvpra claim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
venture engaged,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
