# Imports and Installs

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install jaro-winkler

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rap

In [None]:
import pandas as pd
import numpy as np

import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import jaro


# Utils/Helper Functions

In [None]:
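# validate() compares two model-output DataFrames cell by cell and:
# - gives the average document similarity for a model
# - gives the average similarity for each feature across documents
# - stores the scores in a matrix
# TODO: add sorting of the inputs (the original note here was left unfinished)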


def validate(df1, df2, metric):
  """Score how closely two row-aligned DataFrames agree, cell by cell.

  Row ``i`` of ``df1`` is compared with row ``i`` of ``df2`` by position
  (``iloc``), and column ``j`` with column ``j``. Index labels and column
  names are NOT used for alignment, so both frames must already be in the
  same document order and column order. Every cell is converted with
  ``str`` before scoring, so missing values are compared as the text 'nan'.

  Args:
    df1: DataFrame of extracted values. Must contain a ``Document_Name``
      column, which labels the rows of the per-document result, and its
      column names label the per-feature result.
    df2: DataFrame with the same number of rows and columns as ``df1``.
    metric: Callable ``metric(str, str)`` returning a numeric similarity
      score (for example ``fuzz.token_sort_ratio``).

  Returns:
    A tuple ``(matrix, document_df, feature_df, overall)``:
      matrix: 2-D numpy array, one row per document and one column per
        feature, holding the metric score of each compared cell pair.
      document_df: DataFrame with columns ``Document`` and ``Score``
        (mean score of each row). One row per input row, so repeated
        document names are kept rather than collapsed.
      feature_df: DataFrame with columns ``Feature`` and ``Score``
        (mean score of each column).
      overall: mean of every score in ``matrix``.

  Raises:
    ValueError: if the two frames differ in row count or column count.
      Comparing such frames positionally would pair unrelated cells.
  """
  if len(df1) != len(df2):
    raise ValueError(
        f'Unequal DFs: df1 has {len(df1)} rows but df2 has {len(df2)}; '
        'restrict both frames to the same documents before validating.')
  if df1.shape[1] != df2.shape[1]:
    raise ValueError(
        f'Unequal DFs: df1 has {df1.shape[1]} columns but df2 has {df2.shape[1]}.')

  # One score vector per document; str() makes NaN/None comparable as text.
  rows = [
      [metric(str(cell1), str(cell2))
       for cell1, cell2 in zip(df1.iloc[row], df2.iloc[row])]
      for row in range(len(df1))
  ]
  # Keep the matrix 2-D even when there are no rows, so the axis means below work.
  matrix = np.array(rows) if rows else np.empty((0, df1.shape[1]))
  overall = np.mean(matrix)

  # Built directly from the columns (not via a dict) so duplicate names survive.
  document_df = pd.DataFrame({
      'Document': list(df1['Document_Name']),
      'Score': matrix.mean(axis=1),
  })
  feature_df = pd.DataFrame({
      'Feature': list(df1.columns),
      'Score': matrix.mean(axis=0),
  })

  return matrix, document_df, feature_df, overall




In [None]:
# Compare the 4o and 4o-mini extractions with fuzzywuzzy's partial_ratio and show the score matrix.
# NOTE(review): df_4o and df_4o_mini are only read from CSV in the "Loading CSVs" section
# further down, so this cell fails on a fresh Restart & Run All unless it is moved below it.
matrix, document_df, feature_df, overall = validate(df_4o, df_4o_mini, fuzz.partial_ratio)
matrix

array([[100, 100, 100, 100, 100, 100, 100, 100, 100, 100,  99,  88, 100],
       [100, 100, 100, 100, 100, 100, 100,  30, 100, 100,  97,  79, 100],
       [100,   0,   0, 100, 100, 100, 100, 100, 100, 100,  94,  93, 100],
       [100,   0,  25,   0,   0,   0,   0,  32, 100,  80,  90,  96, 100],
       [100, 100, 100, 100, 100,   0, 100, 100, 100, 100,  99, 100, 100],
       [100,   0, 100,   0, 100, 100, 100, 100, 100, 100, 100,  61, 100],
       [100, 100, 100, 100, 100, 100, 100, 100,  25, 100, 100,  86, 100],
       [ 86,   0,   0, 100, 100,   0,   0,  19, 100, 100, 100,  70, 100],
       [  0,   0,  33, 100, 100,   0,  33,  16, 100, 100, 100,  69, 100],
       [ 89,  40,  25, 100, 100,  40,  17, 100, 100, 100,  99, 100, 100],
       [100, 100, 100,  25,  14, 100,  17,  45, 100, 100, 100,  68, 100],
       [ 78, 100, 100, 100, 100, 100, 100, 100, 100, 100,  99, 100, 100],
       [100, 100, 100, 100, 100, 100, 100,  32, 100, 100, 100,  94, 100],
       [100,   0,  67, 100, 100,  50, 

In [None]:
# Show the per-document mean scores returned by the last validate() call.
document_df

Unnamed: 0,Document,Score
0,4201768_0001.tif,99.0
1,4204010_0001.tif,92.769231
2,4302425_0001.tif,83.615385
3,4202463_0001.tif,47.923077
4,4306466_0001.tif,92.230769
5,4302583_0001.tif,81.615385
6,4306964_0001.tif,93.153846
7,4205888_0001.tif,59.615385
8,48167_4000820_0001.tif,57.769231
9,4004912_0001.tif,77.692308


In [None]:
# Show the per-feature mean scores returned by the last validate() call.
feature_df

Unnamed: 0,Feature,Score
0,Date,89.24
1,Borrower_First,65.6
2,Borrower_Last,73.68
3,Second_Borrower_First,86.32
4,Second_Borrower_Last,88.56
5,Other_Party_First,65.48
6,Other_Party_Last,66.24
7,Lending_Bank,68.88
8,Interest_Rate,91.0
9,Loan_Amount,94.4


In [None]:
# Scratch check: build a feature/score table directly from `feature_means`.
# NOTE(review): no visible cell defines `feature_means` at top level (inside validate() it is
# a local list), and from_dict(orient='index') needs a dict-like, so this cell appears to rely
# on leftover kernel state — confirm where the value comes from or remove the cell.
df = pd.DataFrame.from_dict(feature_means, orient='index', columns=['Score']).reset_index()
df

Unnamed: 0,index,Score
0,Date,89.36
1,Borrower_First,63.12
2,Borrower_Last,66.4
3,Second_Borrower_First,85.72
4,Second_Borrower_Last,88.56
5,Other_Party_First,57.88
6,Other_Party_Last,59.44
7,Lending_Bank,67.88
8,Interest_Rate,92.8
9,Loan_Amount,89.04


In [None]:
# Inspect the first row of df_4o as a plain list of cell values.
list(df_4o.iloc[0])

['28/10/1941',
 'Palmer',
 'Haynes',
 'Ilene',
 'Haynes',
 'R. M.',
 'Orth',
 'THE TEXAS CITY NATIONAL BANK',
 '4.5%',
 '$2,750.00',
 "The East Twenty-five Feet (E-25') of Lot numbered. Four (4) and all of Lot Five (5), Block numbered Three Hundred Seventy-nine (379), Texas City Fourth bivision, according to plat thereof recorded in Book 238, Page 21, of the records of Galveston County, Texas.",
 'Monthly payments of $21.04, including interest, commencing on May 1, 1942, and continuing until the principal and interest are fully paid, with the final payment due on November 1, 1957.',
 '4201768_0001.tif']

In [None]:
# Scores in the second column (index 1) of the score matrix, one value per row.
matrix[:, 1]

array([ 80, 100,   0,   0, 100,   0, 100,   0,   0,  31, 100, 100, 100,
         0, 100, 100, 100, 100, 100, 100,  67, 100, 100,   0,   0])

# Loading CSVs

In [None]:
# Read the model-output CSVs into DataFrames, one frame per file.
# NOTE(review): every path is an absolute "/content/..." location (presumably a Colab
# session directory — confirm), so the files must be uploaded there before running.
#df_llama= pd.read_csv("/content/llama_doctr.csv")


# 4o outputs
df_4o = pd.read_csv("/content/4o_doctr.csv")
df_4o_confidence = pd.read_csv("/content/df_4o_confidence_csv.csv")
df_4o_confidence_all = pd.read_csv("/content/df_4o_confidence_all_csv.csv")

# 4o-mini outputs
df_4o_mini= pd.read_csv("/content/4o_mini_doctr.csv")
df_4o_mini_confidence = pd.read_csv("/content/df_4o_mini_confidence_csv.csv")
df_4o_mini_confidence_all = pd.read_csv("/content/output_4o_mini_confidence_all_csv.csv")


# Mistral outputs
df_mistral = pd.read_csv("/content/mistral_doctr.csv")
df_mistral_confidence = pd.read_csv("/content/output_mistral_confidence_csv.csv")

# Frame that the model outputs are scored against in the validate() calls below.
df_kate = pd.read_csv("/content/kate_llm.csv")

df_newline = pd.read_csv("/content/newline_4o_mini_csv.csv")

In [None]:
# Read the "twopage_*" model-output CSVs (4o, 4o-mini, Mistral) into DataFrames.
df_4o_two_page = pd.read_csv("/content/twopage_df_4o_confidence_final_csv.csv")
df_4o_two_page_refinal= pd.read_csv("/content/twopage_df_4o_confidence_refinal_csv.csv")
df_4o_two_page_re_refinal= pd.read_csv("/content/twopage_df_4o_confidence_re_refinal_csv.csv")

df_4o_mini_two_page = pd.read_csv("/content/twopage_df_4o_mini_confidence_final_csv.csv")
df_4o_mini_two_page_refinal= pd.read_csv("/content/twopage_df_4o_mini_confidence_refinal_csv.csv")

df_mistral_two_page = pd.read_csv("/content/twopage_df_mistral_confidence_final_csv.csv")

# Re-reads kate_llm.csv, resetting df_kate to the unfiltered file contents.
df_kate = pd.read_csv("/content/kate_llm.csv")


## Two Mortgage Feature Validation

In [None]:
# Ground-truth labels: keep only the document id and its Two_Mortgage flag.
ground_truth = pd.read_csv('/content/ground_truth_two_mortgages.csv').loc[
    :, ['Document', 'Two_Mortgage']
]

In [None]:
# Show the ground-truth Document / Two_Mortgage table.
ground_truth

Unnamed: 0,Document,Two_Mortgage
0,4004046_0001.tif,No
1,4004575_0001.tif,Yes
2,4004912_0001.tif,Yes
3,4005318_0001.tif,Yes
4,4005800_0001.tif,Yes
5,4005813_0001.tif,Yes
6,4104165_0001.tif,No
7,4104592_0001.tif,No
8,4104649_0001.tif,No
9,4105565_0001.tif,No


In [None]:
# Per model, keep just the document id and the predicted Two_Mortgages flag.
preds_4o = df_4o_two_page.loc[:, ['Document_Name', 'Two_Mortgages']]
preds_4o_mini = df_4o_mini_two_page.loc[:, ['Document_Name', 'Two_Mortgages']]
preds_mistral = df_mistral_two_page.loc[:, ['Document_Name', 'Two_Mortgages']]

In [None]:
# Put every table in document-name order with a fresh 0..n-1 index so the
# rows line up by position when they are concatenated side by side.
preds_4o = preds_4o.sort_values('Document_Name', ignore_index=True)
preds_4o_mini = preds_4o_mini.sort_values('Document_Name', ignore_index=True)
preds_mistral = preds_mistral.sort_values('Document_Name', ignore_index=True)
ground_truth = ground_truth.sort_values('Document', ignore_index=True)

In [None]:
# Side-by-side table: the four document-name columns (to eyeball that the rows
# line up) followed by each model's prediction and the ground-truth label.
comparision_df = pd.concat(
    [
        preds_4o['Document_Name'],
        preds_4o_mini['Document_Name'],
        preds_mistral['Document_Name'],
        ground_truth['Document'],
        preds_4o['Two_Mortgages'],
        preds_4o_mini['Two_Mortgages'],
        preds_mistral['Two_Mortgages'],
        ground_truth['Two_Mortgage'],
    ],
    axis=1,
    keys=['Doc1', 'Doc2', 'Doc3', 'Doc4', '4o', '4o_Mini', 'Mistral', 'True'],
)
comparision_df

Unnamed: 0,Doc1,Doc2,Doc3,Doc4,4o,4o_Mini,Mistral,True
0,4004046_0001.tif,4004046_0001.tif,4004046_0001.tif,4004046_0001.tif,No,No,No,No
1,4004575_0001.tif,4004575_0001.tif,4004575_0001.tif,4004575_0001.tif,No,No,No,Yes
2,4004912_0001.tif,4004912_0001.tif,4004912_0001.tif,4004912_0001.tif,Yes,Yes,Yes,Yes
3,4005318_0001.tif,4005318_0001.tif,4005318_0001.tif,4005318_0001.tif,No,No,No,Yes
4,4005800_0001.tif,4005800_0001.tif,4005800_0001.tif,4005800_0001.tif,No,No,No,Yes
5,4005813_0001.tif,4005813_0001.tif,4005813_0001.tif,4005813_0001.tif,No,Yes,No,Yes
6,4104165_0001.tif,4104165_0001.tif,4104165_0001.tif,4104165_0001.tif,No,No,No,No
7,4104592_0001.tif,4104592_0001.tif,4104592_0001.tif,4104592_0001.tif,No,No,No,No
8,4104649_0001.tif,4104649_0001.tif,4104649_0001.tif,4104649_0001.tif,No,No,No,No
9,4105565_0001.tif,4105565_0001.tif,4105565_0001.tif,4105565_0001.tif,No,No,No,No


In [None]:
# Drop the redundant document-name copies; Doc1 stays as the single id column.
comparision_df = comparision_df.loc[:, ['Doc1', '4o', '4o_Mini', 'Mistral', 'True']]
comparision_df

Unnamed: 0,Doc1,4o,4o_Mini,Mistral,True
0,4004046_0001.tif,No,No,No,No
1,4004575_0001.tif,No,No,No,Yes
2,4004912_0001.tif,Yes,Yes,Yes,Yes
3,4005318_0001.tif,No,No,No,Yes
4,4005800_0001.tif,No,No,No,Yes
5,4005813_0001.tif,No,Yes,No,Yes
6,4104165_0001.tif,No,No,No,No
7,4104592_0001.tif,No,No,No,No
8,4104649_0001.tif,No,No,No,No
9,4105565_0001.tif,No,No,No,No


In [None]:
# Classification metrics for the Two_Mortgages flag, with 'Yes' as the positive class.
# Accuracy is divided by the real number of compared rows instead of a hard-coded 25,
# so the figures stay correct if documents are added or removed.
truth = comparision_df['True']
is_yes = truth == 'Yes'
is_no = truth == 'No'
n_documents = len(comparision_df)

# Rows where the model's label equals the ground-truth label.
correct_4o = int((comparision_df['4o'] == truth).sum())
correct_4o_mini = int((comparision_df['4o_Mini'] == truth).sum())
correct_mistral = int((comparision_df['Mistral'] == truth).sum())

accuracy_4o = correct_4o / n_documents
accuracy_4o_mini = correct_4o_mini / n_documents
accuracy_mistral = correct_mistral / n_documents

total_positive = int(is_yes.sum())
total_negative = int(is_no.sum())

# True positives: ground truth is 'Yes' and the model agrees.
tp_4o = int((is_yes & (comparision_df['4o'] == truth)).sum())
tp_4o_mini = int((is_yes & (comparision_df['4o_Mini'] == truth)).sum())
tp_mistral = int((is_yes & (comparision_df['Mistral'] == truth)).sum())

# False positives: ground truth is 'No' but the model answered anything else.
fp_4o = int((is_no & (comparision_df['4o'] != truth)).sum())
fp_4o_mini = int((is_no & (comparision_df['4o_Mini'] != truth)).sum())
fp_mistral = int((is_no & (comparision_df['Mistral'] != truth)).sum())

# True-positive rate (recall) = TP / all actual positives.
tpr_4o = tp_4o / total_positive
tpr_4o_mini = tp_4o_mini / total_positive
tpr_mistral = tp_mistral / total_positive

# False-positive rate = FP / all actual negatives.
fpr_4o = fp_4o / total_negative
fpr_4o_mini = fp_4o_mini / total_negative
fpr_mistral = fp_mistral / total_negative






In [None]:
# Number of rows where the Mistral prediction equals the ground-truth label.
correct_mistral

18

In [None]:
# Accuracy per model, one per line: 4o, 4o-mini, Mistral.
print(accuracy_4o, accuracy_4o_mini, accuracy_mistral, sep='\n')

0.84
0.84
0.72


In [None]:
# True-positive rate per model, one per line: 4o, 4o-mini, Mistral.
print(tpr_4o, tpr_4o_mini, tpr_mistral, sep='\n')



0.5
0.5
0.125


In [None]:
# False-positive rate per model, one per line: 4o, 4o-mini, Mistral.
print(fpr_4o, fpr_4o_mini, fpr_mistral, sep='\n')



0.0
0.0
0.0


In [None]:
def _precision(tp, fp):
  """Return precision = TP / (TP + FP), or NaN when the model predicted no positives."""
  predicted_positive = tp + fp
  return tp / predicted_positive if predicted_positive else float('nan')


# Precision is computed from the raw counts. The previous expression `tpr/tpr+fpr`
# parsed as `(tpr/tpr) + fpr`, i.e. 1 + FPR, and crashed whenever TPR was 0.
precision_4o = _precision(tp_4o, fp_4o)
precision_4o_mini = _precision(tp_4o_mini, fp_4o_mini)
precision_mistral = _precision(tp_mistral, fp_mistral)

print(precision_4o)
print(precision_4o_mini)
print(precision_mistral)

1.0
1.0
1.0


Total Positive - 8 (5 Strong, 3 Mild)

4o - 4 correct (3 Strong, 1 Mild) - 50% TPR, 60% Strong

4o_Mini - 4 correct (4 Strong) - 50% TPR, 80% Strong

# Feature Validation

In [None]:
# Feature columns = every column of df_4o_confidence except the confidence score.
columns = df_4o_confidence.columns.tolist()
columns.remove('Confidence_Score')

In [None]:
# Display the 4o two-page frame to check its column names and values.
df_4o_two_page

Unnamed: 0,Date,Borrower_First,Borrower_Last,Second_Borrower_First,Second_Borrower_Last,Other_Party_First,Other_Party_Last,Lending_Bank,Interest_Rate,Loan_Amount,Location_for_Mortgage,Payment_Plan,Overall_Confidence_Score,Two_Mortgages,Document_Name
0,,,,,,J. L.,MARTIN INVESTMENT COMPANY,"FIRST MORTGAGE COMPANY OF HOUSTON, INC.",4.5%,$3900.00,"Lot Number Seven (7), in Block Number Four Hun...","Monthly installments of $21.68 each, including...",0.8,No,4307375_0001.tif
1,,,,,,J.G.,Estwood,"First Mortgage Company of Houston, Inc.",4.5%,$3550.00,"Lot Number Eight (8), in Block Number Four Hun...","Monthly installments of $19.74 each, including...",0.8,No,4307340_0001.tif
2,1/3/1935,L. B.,Berndt,Mary Ellen,Berndt,Wm. S.,Bradley,Mortgage Investment Corporation,4.5%,$4600.00,"Lot Nineteen (19) of Palm Gardens, in the City...","Monthly installments of $25.58, including inte...",0.9,No,48167_4000820_0001.tif
3,6/4/1942,George,Clayton,,,R. M.,Orth,Texas City National Bank,4.5%,"$3,000.00","Lots One (1), Two (2), and Three (3), Block nu...","Monthly payments of $22.95, including interest...",0.9,No,4202463_0001.tif
4,,Clarence Francis,Quinn,Kathleen,Quinn,B. B.,Yeager,Gulf Coast Investment Corporation,4.5%,"$4,400.00","Lots Seven (7), Eight (8) and Nine (9) in Bloc...","Monthly installments of $24.46 each, including...",0.9,No,4104649_0001.tif
5,,,,,,J.E.,Aosashe Jr.,"J. E. FOSTER & SON, INC.",4.5%,"$3,950.00","The East twenty feet of Lot Four (4), and the ...","Monthly installments of $21.96 each, including...",0.8,No,4302425_0001.tif
6,,,,,,,,"First Mortgage Company of Houston, Inc.",4.5%,"$4,950.00",Blocks Lot Thirty-five (35) in Block Three (3)...,Monthly installments of Twenty-seven and 52/10...,0.6,No,4305776_0001.tif
7,,R. V.,MOCAULEY,,,R. M.,ORTH,TEXAS CITY NATIONAL BANK,4.5%,"$3,250.00","Lots numbered Thirteen (13) and Fourteen (14),...","Monthly installments of $20.57, commencing on ...",0.9,No,4203323_0001.tif
8,3/8/1942,Elmer Douglas,Laiche,Edna Pearl,Laiche,R. M.,Orth,The Texas National Bank,4%,$3000.00,Lot Number Four (4) and the adjoining West one...,"Monthly payments of $22.95, including interest...",0.9,No,4204010_0001.tif
9,21/03/1942,Palmer T.,Haynes,Ilene,Haynes,R. M.,Orth,The Texas City National Bank,4.5%,"$2,750.00",The East Twenty-five Feet (E-25') of Lot numbe...,"Monthly payments of $21.04, including interest...",0.9,No,4201768_0001.tif


In [None]:
# Feature columns = df_4o_two_page's columns minus the two non-feature fields.
columns = df_4o_two_page.columns.tolist()
for dropped in ('Overall_Confidence_Score', 'Two_Mortgages'):
  columns.remove(dropped)

In [None]:
# Feature columns = every column of df_4o_two_page except the confidence score and the
# Two_Mortgages flag. The displayed frame names its confidence column
# 'Overall_Confidence_Score', so list.remove('Confidence_Score') raised ValueError here;
# filtering accepts either spelling and never fails on a missing name.
_non_feature_columns = {'Confidence_Score', 'Overall_Confidence_Score', 'Two_Mortgages'}
columns = [col for col in df_4o_two_page.columns if col not in _non_feature_columns]

In [None]:
# Explicit list of the columns every frame is reduced to before validation.
# This assignment replaces whatever `columns` held from the cells above.
columns= ['Date', 'Borrower_First', 'Borrower_Last',
       'Second_Borrower_First', 'Second_Borrower_Last', 'Other_Party_First',
       'Other_Party_Last', 'Lending_Bank', 'Interest_Rate', 'Loan_Amount',
       'Location_for_Mortgage', 'Payment_Plan',
       'Document_Name']

# Commented-out copy of the same list under another name (kept for reference).
# base_keys_no_conf = ['Date', 'Borrower_First', 'Borrower_Last',
#        'Second_Borrower_First', 'Second_Borrower_Last', 'Other_Party_First',
#        'Other_Party_Last', 'Lending_Bank', 'Interest_Rate', 'Loan_Amount',
#        'Location_for_Mortgage', 'Payment_Plan',
#        'Document_Name']

In [None]:
# Reduce each frame to the shared `columns` list, in that order, so that
# validate() compares like with like column by column.
df_kate = df_kate.loc[:, columns]

df_4o_confidence_all = df_4o_confidence_all.loc[:, columns]
df_4o_confidence = df_4o_confidence.loc[:, columns]
df_4o = df_4o.loc[:, columns]

df_4o_mini_confidence_all = df_4o_mini_confidence_all.loc[:, columns]
df_4o_mini_confidence = df_4o_mini_confidence.loc[:, columns]
df_4o_mini = df_4o_mini.loc[:, columns]

df_mistral_confidence = df_mistral_confidence.loc[:, columns]



In [None]:
# Same column restriction for df_kate, the two-page frames and df_newline.
df_kate = df_kate.loc[:, columns]

df_4o_two_page = df_4o_two_page.loc[:, columns]
df_4o_two_page_refinal = df_4o_two_page_refinal.loc[:, columns]
df_4o_two_page_re_refinal = df_4o_two_page_re_refinal.loc[:, columns]

df_4o_mini_two_page = df_4o_mini_two_page.loc[:, columns]
df_4o_mini_two_page_refinal = df_4o_mini_two_page_refinal.loc[:, columns]

df_mistral_two_page = df_mistral_two_page.loc[:, columns]

df_newline = df_newline.loc[:, columns]

In [None]:
# The frames list their documents in different orders, and validate() compares rows
# by position, so each frame is put in Document_Name order first.
df_kate_sorted = df_kate.sort_values('Document_Name')

df_4o_two_page_sorted = df_4o_two_page.sort_values('Document_Name')
df_4o_two_page_sorted_refinal = df_4o_two_page_refinal.sort_values('Document_Name')
df_4o_two_page_sorted_re_refinal = df_4o_two_page_re_refinal.sort_values('Document_Name')

df_4o_mini_two_page_sorted = df_4o_mini_two_page.sort_values('Document_Name')
df_4o_mini_two_page_sorted_refinal = df_4o_mini_two_page_refinal.sort_values('Document_Name')

df_mistral_two_page_sorted = df_mistral_two_page.sort_values('Document_Name')

df_newline = df_newline.sort_values('Document_Name')

In [None]:
# Score the 4o two-page frame against df_kate_sorted with token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_4o_two_page_sorted,
    df_kate_sorted,
    fuzz.token_sort_ratio,
)
print(f'Performance for 4o: {overall}')
print(feature_df)
# Attach the ground-truth Two_Mortgage label and list the lowest-scoring documents first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on='Document', right_on='Doc1')
    .sort_values('Score')
)

Performance for 4o: 81.24307692307693
                  Feature   Score
0                    Date   50.40
1          Borrower_First   58.32
2           Borrower_Last   91.60
3   Second_Borrower_First   90.16
4    Second_Borrower_Last   92.00
5       Other_Party_First   66.84
6        Other_Party_Last   67.60
7            Lending_Bank   97.76
8           Interest_Rate   84.16
9             Loan_Amount   94.56
10  Location_for_Mortgage   93.68
11           Payment_Plan   69.08
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
16,4302583_0001.tif,48.461538,4302583_0001.tif,Yes
2,4004912_0001.tif,62.692308,4004912_0001.tif,Yes
14,4205888_0001.tif,69.538462,4205888_0001.tif,No
9,4105565_0001.tif,71.384615,4105565_0001.tif,No
18,4306466_0001.tif,71.846154,4306466_0001.tif,Yes
7,4104592_0001.tif,73.076923,4104592_0001.tif,No
17,4305776_0001.tif,73.153846,4305776_0001.tif,No
20,4306964_0001.tif,74.538462,4306964_0001.tif,No
23,4307375_0001.tif,78.538462,4307375_0001.tif,No
6,4104165_0001.tif,81.0,4104165_0001.tif,No


In [None]:
# Score the 4o two-page "refinal" frame against df_kate_sorted with token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_4o_two_page_sorted_refinal,
    df_kate_sorted,
    fuzz.token_sort_ratio,
)
print(f'Performance for 4o: {overall}')
print(feature_df)
# Attach the ground-truth Two_Mortgage label and list the lowest-scoring documents first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on='Document', right_on='Doc1')
    .sort_values('Score')
)

Performance for 4o: 84.41538461538461
                  Feature   Score
0                    Date   53.36
1          Borrower_First   73.00
2           Borrower_Last   78.12
3   Second_Borrower_First   90.16
4    Second_Borrower_Last   93.44
5       Other_Party_First   88.84
6        Other_Party_Last   93.72
7            Lending_Bank   94.96
8           Interest_Rate   80.72
9             Loan_Amount   93.76
10  Location_for_Mortgage   92.24
11           Payment_Plan   65.08
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
2,4004912_0001.tif,38.384615,4004912_0001.tif,Yes
18,4306466_0001.tif,63.230769,4306466_0001.tif,Yes
16,4302583_0001.tif,75.230769,4302583_0001.tif,Yes
6,4104165_0001.tif,81.0,4104165_0001.tif,No
15,4302425_0001.tif,82.461538,4302425_0001.tif,No
19,4306607_0001.tif,82.461538,4306607_0001.tif,Yes
14,4205888_0001.tif,82.615385,4205888_0001.tif,No
17,4305776_0001.tif,84.230769,4305776_0001.tif,No
9,4105565_0001.tif,85.153846,4105565_0001.tif,No
7,4104592_0001.tif,85.153846,4104592_0001.tif,No


In [None]:
# Score the 4o two-page "re_refinal" frame against df_kate_sorted with token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_4o_two_page_sorted_re_refinal,
    df_kate_sorted,
    fuzz.token_sort_ratio,
)
print(f'Performance for 4o: {overall}')
print(feature_df)
# Attach the ground-truth Two_Mortgage label and list the lowest-scoring documents first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on='Document', right_on='Doc1')
    .sort_values('Score')
)

Performance for 4o: 87.32615384615384
                  Feature   Score
0                    Date   54.40
1          Borrower_First   82.84
2           Borrower_Last   79.84
3   Second_Borrower_First   93.60
4    Second_Borrower_Last   95.84
5       Other_Party_First   93.32
6        Other_Party_Last   96.92
7            Lending_Bank   97.76
8           Interest_Rate   84.16
9             Loan_Amount   94.56
10  Location_for_Mortgage   93.08
11           Payment_Plan   68.92
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
18,4306466_0001.tif,63.230769,4306466_0001.tif,Yes
16,4302583_0001.tif,76.538462,4302583_0001.tif,Yes
6,4104165_0001.tif,81.615385,4104165_0001.tif,No
19,4306607_0001.tif,82.461538,4306607_0001.tif,Yes
7,4104592_0001.tif,84.538462,4104592_0001.tif,No
20,4306964_0001.tif,84.692308,4306964_0001.tif,No
0,4004046_0001.tif,85.384615,4004046_0001.tif,No
22,4307353_0001.tif,86.615385,4307353_0001.tif,No
9,4105565_0001.tif,86.923077,4105565_0001.tif,No
13,4204010_0001.tif,87.0,4204010_0001.tif,No


In [None]:
# Score the 4o-mini two-page frame against df_kate_sorted with token_sort_ratio.
# (A stray mid-cell `document_df` expression was removed: only a cell's last
# expression is displayed, so it had no effect.)
matrix, document_df, feature_df, overall = validate(df_4o_mini_two_page_sorted, df_kate_sorted, fuzz.token_sort_ratio)
print(f'Performance for 4o-Mini: {overall}')
print(feature_df)
# Attach the ground-truth Two_Mortgage label and list the lowest-scoring documents first.
pd.merge(document_df, comparision_df[['Doc1', 'True']], how = 'inner', left_on = ['Document'], right_on = 'Doc1').sort_values(by = 'Score')

Performance for 4o-Mini: 82.74153846153847
                  Feature   Score
0                    Date   33.36
1          Borrower_First   70.16
2           Borrower_Last   64.52
3   Second_Borrower_First   92.68
4    Second_Borrower_Last   93.24
5       Other_Party_First   90.36
6        Other_Party_Last   96.92
7            Lending_Bank   97.76
8           Interest_Rate   84.16
9             Loan_Amount   95.32
10  Location_for_Mortgage   78.24
11           Payment_Plan   78.92
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
16,4302583_0001.tif,63.230769,4302583_0001.tif,Yes
22,4307353_0001.tif,71.076923,4307353_0001.tif,No
23,4307375_0001.tif,74.846154,4307375_0001.tif,No
21,4307340_0001.tif,76.0,4307340_0001.tif,No
18,4306466_0001.tif,77.153846,4306466_0001.tif,Yes
12,4203323_0001.tif,77.692308,4203323_0001.tif,No
14,4205888_0001.tif,77.692308,4205888_0001.tif,No
19,4306607_0001.tif,77.769231,4306607_0001.tif,Yes
6,4104165_0001.tif,81.0,4104165_0001.tif,No
17,4305776_0001.tif,81.461538,4305776_0001.tif,No


In [None]:
# Score the 4o-mini two-page "refinal" frame against df_kate_sorted with token_sort_ratio.
# (A stray mid-cell `document_df` expression was removed: only a cell's last
# expression is displayed, so it had no effect.)
matrix, document_df, feature_df, overall = validate(df_4o_mini_two_page_sorted_refinal, df_kate_sorted, fuzz.token_sort_ratio)
print(f'Performance for 4o-Mini: {overall}')
print(feature_df)
# Attach the ground-truth Two_Mortgage label and list the lowest-scoring documents first.
pd.merge(document_df, comparision_df[['Doc1', 'True']], how = 'inner', left_on = ['Document'], right_on = 'Doc1').sort_values(by = 'Score')

Performance for 4o-Mini: 84.11076923076924
                  Feature   Score
0                    Date   41.36
1          Borrower_First   72.52
2           Borrower_Last   66.60
3   Second_Borrower_First   94.48
4    Second_Borrower_Last   96.00
5       Other_Party_First   90.36
6        Other_Party_Last   97.32
7            Lending_Bank   97.76
8           Interest_Rate   84.16
9             Loan_Amount   95.32
10  Location_for_Mortgage   78.28
11           Payment_Plan   79.28
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
0,4004046_0001.tif,75.230769,4004046_0001.tif,No
21,4307340_0001.tif,76.692308,4307340_0001.tif,No
15,4302425_0001.tif,77.076923,4302425_0001.tif,No
18,4306466_0001.tif,77.153846,4306466_0001.tif,Yes
19,4306607_0001.tif,77.769231,4306607_0001.tif,Yes
9,4105565_0001.tif,78.307692,4105565_0001.tif,No
22,4307353_0001.tif,78.461538,4307353_0001.tif,No
16,4302583_0001.tif,78.615385,4302583_0001.tif,Yes
12,4203323_0001.tif,80.0,4203323_0001.tif,No
6,4104165_0001.tif,81.307692,4104165_0001.tif,No


In [None]:
# Score the Mistral two-page frame against df_kate_sorted with token_sort_ratio,
# then print the per-feature table followed by the per-document table.
matrix, document_df, feature_df, overall = validate(
    df_mistral_two_page_sorted,
    df_kate_sorted,
    fuzz.token_sort_ratio,
)
print(f'Performance for Mistral: {overall}')
print(feature_df, document_df, sep='\n')



Performance for Mistral: 78.79692307692308
                  Feature   Score
0                    Date   22.60
1          Borrower_First   58.80
2           Borrower_Last   66.00
3   Second_Borrower_First   89.20
4    Second_Borrower_Last   92.60
5       Other_Party_First   86.52
6        Other_Party_Last   89.44
7            Lending_Bank   95.88
8           Interest_Rate   83.00
9             Loan_Amount   91.32
10  Location_for_Mortgage   84.80
11           Payment_Plan   64.20
12          Document_Name  100.00
                  Document      Score
0         4004046_0001.tif  81.307692
1         4004575_0001.tif  72.769231
2         4004912_0001.tif  91.769231
3         4005318_0001.tif  80.692308
4         4005800_0001.tif  91.692308
5         4005813_0001.tif  85.923077
6         4104165_0001.tif  81.153846
7         4104592_0001.tif  83.692308
8         4104649_0001.tif  90.461538
9         4105565_0001.tif  78.692308
10        4201768_0001.tif  92.230769
11        4202463_0001.ti

In [None]:
# Score df_4o_confidence_all against df_kate with token_sort_ratio. Neither frame is sorted
# in this cell, so the comparison relies on both already listing documents in the same order.
# (A stray mid-cell `document_df` expression was removed: only a cell's last
# expression is displayed, so it had no effect.)
matrix, document_df, feature_df, overall = validate(df_4o_confidence_all, df_kate, fuzz.token_sort_ratio)
print(f'4o Overall Score {overall}')
print(feature_df)
# Attach the ground-truth Two_Mortgage label and list the lowest-scoring documents first.
pd.merge(document_df, comparision_df[['Doc1', 'True']], how = 'inner', left_on = ['Document'], right_on = 'Doc1').sort_values(by = 'Score')

# Notes:
# - add "deed of trust" to the prompt, or add the FHA form number, to tell the model where
#   a document starts (via prompt engineering)
# - saw output for this
# - add partial doc feature
# - fall back to the overall confidence approach
# - OCR input must be 2 documents - have a directory-within-directory structure
# - made changes to the OCR functions for this
# - then made the model produce output - sorted all of the DFs by columns to make sure they align
# - did all this + validation metrics of the new feature with a ground truth set, and used
#   the ground truth set for comparing the 1-page vs 2-page approach



4o Overall Score 74.6676923076923
                  Feature   Score
0                    Date   44.44
1          Borrower_First   49.04
2           Borrower_Last   91.08
3   Second_Borrower_First   84.24
4    Second_Borrower_Last   88.72
5       Other_Party_First   75.48
6        Other_Party_Last   80.52
7            Lending_Bank   81.60
8           Interest_Rate   59.28
9             Loan_Amount   87.12
10  Location_for_Mortgage   81.16
11           Payment_Plan   48.00
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
9,4004912_0001.tif,37.769231,4004912_0001.tif,Yes
5,4302583_0001.tif,42.153846,4302583_0001.tif,Yes
16,4306607_0001.tif,54.923077,4306607_0001.tif,Yes
4,4306466_0001.tif,61.615385,4306466_0001.tif,Yes
19,4305776_0001.tif,65.615385,4305776_0001.tif,No
1,4204010_0001.tif,67.153846,4204010_0001.tif,No
6,4306964_0001.tif,68.384615,4306964_0001.tif,No
7,4205888_0001.tif,70.846154,4205888_0001.tif,No
8,48167_4000820_0001.tif,73.538462,48167_4000820_0001.tif,No
21,4005800_0001.tif,73.538462,4005800_0001.tif,Yes


In [None]:
# Compare df_mistral_confidence with df_kate using fuzz.token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_mistral_confidence,
    df_kate,
    fuzz.token_sort_ratio,
)
print(f'Mistral Overall Score {overall}')
print(feature_df)

# Per-document scores joined with the 'True' column of comparision_df, lowest score first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on=['Document'], right_on='Doc1')
    .sort_values(by='Score')
)

Mistral Overall Score 72.24
                  Feature   Score
0                    Date   17.48
1          Borrower_First   53.12
2           Borrower_Last   61.12
3   Second_Borrower_First   86.40
4    Second_Borrower_Last   87.92
5       Other_Party_First   78.52
6        Other_Party_Last   90.48
7            Lending_Bank   81.52
8           Interest_Rate   65.80
9             Loan_Amount   89.24
10  Location_for_Mortgage   78.88
11           Payment_Plan   48.64
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
19,4305776_0001.tif,28.153846,4305776_0001.tif,No
16,4306607_0001.tif,36.846154,4306607_0001.tif,Yes
4,4306466_0001.tif,54.538462,4306466_0001.tif,Yes
5,4302583_0001.tif,58.846154,4302583_0001.tif,Yes
6,4306964_0001.tif,59.538462,4306964_0001.tif,No
24,4307375_0001.tif,70.384615,4307375_0001.tif,No
23,4307353_0001.tif,70.461538,4307353_0001.tif,No
9,4004912_0001.tif,70.923077,4004912_0001.tif,Yes
2,4302425_0001.tif,72.307692,4302425_0001.tif,No
1,4204010_0001.tif,72.307692,4204010_0001.tif,No


In [None]:
# Compare df_4o_mini with df_kate using fuzz.token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini,
    df_kate,
    fuzz.token_sort_ratio,
)
print(overall)
print(feature_df)

# Per-document scores joined with the 'True' column of comparision_df, lowest score first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on=['Document'], right_on='Doc1')
    .sort_values(by='Score')
)


73.13230769230769
                  Feature   Score
0                    Date   36.44
1          Borrower_First   50.04
2           Borrower_Last   76.44
3   Second_Borrower_First   86.24
4    Second_Borrower_Last   86.32
5       Other_Party_First   61.84
6        Other_Party_Last   72.64
7            Lending_Bank   82.52
8           Interest_Rate   64.40
9             Loan_Amount   90.88
10  Location_for_Mortgage   80.32
11           Payment_Plan   62.64
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
9,4004912_0001.tif,35.076923,4004912_0001.tif,Yes
5,4302583_0001.tif,41.538462,4302583_0001.tif,Yes
16,4306607_0001.tif,46.076923,4306607_0001.tif,Yes
4,4306466_0001.tif,47.0,4306466_0001.tif,Yes
7,4205888_0001.tif,61.769231,4205888_0001.tif,No
24,4307375_0001.tif,63.769231,4307375_0001.tif,No
6,4306964_0001.tif,72.384615,4306964_0001.tif,No
11,4005813_0001.tif,72.538462,4005813_0001.tif,Yes
17,4004046_0001.tif,72.846154,4004046_0001.tif,No
21,4005800_0001.tif,73.538462,4005800_0001.tif,Yes


In [None]:
# Compare df_4o_mini_confidence with df_kate using fuzz.token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini_confidence,
    df_kate,
    fuzz.token_sort_ratio,
)
print(overall)
print(feature_df)

# Per-document scores joined with the 'True' column of comparision_df, lowest score first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on=['Document'], right_on='Doc1')
    .sort_values(by='Score')
)


72.43384615384615
                  Feature   Score
0                    Date   32.44
1          Borrower_First   53.72
2           Borrower_Last   68.12
3   Second_Borrower_First   83.36
4    Second_Borrower_Last   86.68
5       Other_Party_First   67.92
6        Other_Party_Last   75.40
7            Lending_Bank   84.52
8           Interest_Rate   66.96
9             Loan_Amount   88.80
10  Location_for_Mortgage   73.60
11           Payment_Plan   60.12
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
9,4004912_0001.tif,34.769231,4004912_0001.tif,Yes
16,4306607_0001.tif,45.692308,4306607_0001.tif,Yes
5,4302583_0001.tif,46.0,4302583_0001.tif,Yes
4,4306466_0001.tif,47.0,4306466_0001.tif,Yes
3,4202463_0001.tif,54.846154,4202463_0001.tif,No
24,4307375_0001.tif,59.0,4307375_0001.tif,No
21,4005800_0001.tif,71.384615,4005800_0001.tif,Yes
19,4305776_0001.tif,71.923077,4305776_0001.tif,No
11,4005813_0001.tif,72.538462,4005813_0001.tif,Yes
10,4105565_0001.tif,73.923077,4105565_0001.tif,No


In [None]:
# Compare df_4o_mini_confidence_all with df_kate using fuzz.token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini_confidence_all,
    df_kate,
    fuzz.token_sort_ratio,
)
print(overall)
print(feature_df)

# Per-document scores joined with the 'True' column of comparision_df, lowest score first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on=['Document'], right_on='Doc1')
    .sort_values(by='Score')
)


70.43384615384615
                  Feature   Score
0                    Date   24.44
1          Borrower_First   52.56
2           Borrower_Last   80.12
3   Second_Borrower_First   80.44
4    Second_Borrower_Last   82.20
5       Other_Party_First   63.76
6        Other_Party_Last   71.72
7            Lending_Bank   82.52
8           Interest_Rate   62.12
9             Loan_Amount   86.80
10  Location_for_Mortgage   71.80
11           Payment_Plan   57.16
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
9,4004912_0001.tif,35.076923,4004912_0001.tif,Yes
16,4306607_0001.tif,45.692308,4306607_0001.tif,Yes
4,4306466_0001.tif,47.461538,4306466_0001.tif,Yes
13,4307340_0001.tif,50.076923,4307340_0001.tif,No
5,4302583_0001.tif,56.076923,4302583_0001.tif,Yes
7,4205888_0001.tif,61.461538,4205888_0001.tif,No
23,4307353_0001.tif,64.230769,4307353_0001.tif,No
2,4302425_0001.tif,64.307692,4302425_0001.tif,No
6,4306964_0001.tif,65.846154,4306964_0001.tif,No
17,4004046_0001.tif,69.615385,4004046_0001.tif,No


In [None]:
# Compare df_newline with df_kate_sorted using fuzz.token_sort_ratio.
matrix, document_df, feature_df, overall = validate(
    df_newline,
    df_kate_sorted,
    fuzz.token_sort_ratio,
)
print(overall)
print(feature_df)

# Per-document scores joined with the 'True' column of comparision_df, lowest score first.
(
    document_df
    .merge(comparision_df[['Doc1', 'True']], how='inner', left_on=['Document'], right_on='Doc1')
    .sort_values(by='Score')
)


78.80307692307693
                  Feature   Score
0                    Date   40.44
1          Borrower_First   70.48
2           Borrower_Last   68.68
3   Second_Borrower_First   87.44
4    Second_Borrower_Last   89.92
5       Other_Party_First   92.36
6        Other_Party_Last   96.92
7            Lending_Bank   84.52
8           Interest_Rate   66.96
9             Loan_Amount   91.92
10  Location_for_Mortgage   76.80
11           Payment_Plan   58.00
12          Document_Name  100.00


Unnamed: 0,Document,Score,Doc1,True
19,4306607_0001.tif,55.384615,4306607_0001.tif,Yes
18,4306466_0001.tif,55.923077,4306466_0001.tif,Yes
16,4302583_0001.tif,58.384615,4302583_0001.tif,Yes
2,4004912_0001.tif,70.461538,4004912_0001.tif,Yes
4,4005800_0001.tif,71.384615,4005800_0001.tif,Yes
5,4005813_0001.tif,72.538462,4005813_0001.tif,Yes
21,4307340_0001.tif,73.923077,4307340_0001.tif,No
23,4307375_0001.tif,74.846154,4307375_0001.tif,No
22,4307353_0001.tif,78.461538,4307353_0001.tif,No
9,4105565_0001.tif,81.076923,4105565_0001.tif,No


In [None]:
# Display the Document_Name column of df_newline (index labels included).
# NOTE(review): presumably used to check row order against df_kate - confirm.
df_newline['Document_Name']

Unnamed: 0,Document_Name
21,4004046_0001.tif
13,4004575_0001.tif
18,4004912_0001.tif
22,4005318_0001.tif
2,4005800_0001.tif
15,4005813_0001.tif
3,4104165_0001.tif
6,4104592_0001.tif
20,4104649_0001.tif
0,4105565_0001.tif


In [None]:
# Display the Document_Name column of df_kate (index labels included).
# NOTE(review): presumably used to check row order against df_newline - confirm.
df_kate['Document_Name']

Unnamed: 0,Document_Name
0,4201768_0001.tif
1,4204010_0001.tif
2,4302425_0001.tif
3,4202463_0001.tif
4,4306466_0001.tif
5,4302583_0001.tif
6,4306964_0001.tif
7,4205888_0001.tif
8,48167_4000820_0001.tif
9,4004912_0001.tif


In [None]:
# Compare df_4o_mini_confidence with df_kate using fuzz.token_sort_ratio,
# print the overall score and render the per-feature table as the cell output.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini_confidence,
    df_kate,
    fuzz.token_sort_ratio,
)
print(overall)
feature_df

67.39142857142858


Unnamed: 0,Feature,Score
0,Date,32.44
1,Borrower_First,53.72
2,Borrower_Last,68.12
3,Second_Borrower_First,83.36
4,Second_Borrower_Last,86.68
5,Other_Party_First,67.92
6,Other_Party_Last,75.4
7,Lending_Bank,84.52
8,Interest_Rate,66.96
9,Loan_Amount,88.8


In [None]:
# Compare df_4o_mini_confidence_all with df_kate using fuzz.token_sort_ratio,
# print the overall score and render the per-feature table as the cell output.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini_confidence_all,
    df_kate,
    fuzz.token_sort_ratio,
)
print(overall)
feature_df

70.43384615384615


Unnamed: 0,Feature,Score
0,Date,24.44
1,Borrower_First,52.56
2,Borrower_Last,80.12
3,Second_Borrower_First,80.44
4,Second_Borrower_Last,82.2
5,Other_Party_First,63.76
6,Other_Party_Last,71.72
7,Lending_Bank,82.52
8,Interest_Rate,62.12
9,Loan_Amount,86.8


In [None]:
# Display the raw score matrix returned by validate(...): the output shows one
# row of integer scores per compared document row, one column per feature.
matrix

array([[100, 100, 100,  91, 100, 100, 100, 100,  86, 100, 100,  87, 100],
       [100, 100,  31, 100,  18, 100, 100,  90, 100,  80, 100,  94, 100],
       [  0, 100,   0, 100, 100, 100, 100, 100,  86, 100,  89,  91, 100],
       [100, 100, 100, 100, 100, 100, 100, 100,  86, 100, 100,  91, 100],
       [  0,   6,  36, 100, 100, 100, 100,  10,   0,  77,  97,   2, 100],
       [  0, 100,   0,   0,   0, 100, 100, 100,  86, 100, 100,  85, 100],
       [  0,   0,   0, 100, 100,  29,  29, 100,  86, 100, 100,  91, 100],
       [  0, 100,  44, 100, 100, 100, 100, 100,  86,  93, 100,  90, 100],
       [  0, 100, 100, 100, 100, 100, 100, 100,  86,  93, 100,  91, 100],
       [ 63, 100, 100, 100, 100, 100, 100,  29,   0,  80,  73,   2, 100],
       [  0, 100,  17, 100, 100, 100, 100, 100,  86,  93,  97,  93, 100],
       [  0, 100, 100, 100, 100, 100, 100, 100,   0, 100, 100,   2, 100],
       [  0, 100, 100, 100, 100,  67, 100, 100,  86,  93, 100,  91, 100],
       [  0, 100,  44, 100, 100, 100, 

In [None]:
# Display the df_mistral_confidence frame for manual inspection of its values.
df_mistral_confidence

Unnamed: 0,Date,Borrower_First,Borrower_Last,Second_Borrower_First,Second_Borrower_Last,Other_Party_First,Other_Party_Last,Lending_Bank,Interest_Rate,Loan_Amount,Location_for_Mortgage,Payment_Plan,Document_Name
0,28/10/1941,Palmer,Haynes,Ilene,Haynes,R. M.,Orth,THE TEXAS CITY NATIONAL BANK,4.5%,"$2,750.00",The East Twenty-five Feet (E-25') of Lot numbe...,"Monthly payments of $21.04, including interest...",4201768_0001.tif
1,22/4/1942,Elmer,Douglas,Edna,Pearl,R.M.,Orth,THE TEXAS NATIONAL BANK,4%,$3000.00,Lot Number Four (4) and the adjoining West one...,"Monthly payments of $22.95, including interest...",4204010_0001.tif
2,1/1/1943,Home,Builders,,,J.E.,Foster,"J. E. FOSTER & SON, INC.",4.5%,"$3,950.00","The East twenty feet of Lot Four (4), and the ...","Monthly payments of $21.96, including interest...",4302425_0001.tif
3,24/12/1941,George,Clayton,,,R. M.,Orth,THE TEXAS CITY NATIONAL BANK,4.5%,"$3,000.00","Lots One (1), Two (2), and Three (3), Block nu...","Monthly payments of $22.95, including interest...",4202463_0001.tif
4,3/11/1943,J. D.,Townsend,,,J. G.,Hestwood,,,$10.00,"Lots Noe. Nine (9) and Ten (10), in Block No. ...",,4306466_0001.tif
5,19/1/1942,W. E.,White,Louise,Cook,T. J.,Bettes,T. J. BETTES COMPANY,4.5%,"$11,700.00",The East seventeen (17) feet and six (6) inche...,"Monthly payments of $65.05, including interest...",4302583_0001.tif
6,1/1/1960,J,Foster,,,Home,Builders,"J. E.Foster & Son, Inc.",4.5%,"$3,950.00",Lot Ten (10) and the East twenty feet (20') of...,"Monthly payments of $21.96, including interest...",4306964_0001.tif
7,2/1/42,J. L.,MARTIN,,,J. G.,HESTWOOD,"FIRST MORTGAGE COMPANY OF HOUSTON, INC.",4.5%,$3850.00,"Lots Nos. Three (3) and Four (4), in Block No....","Monthly payments of $21.41, including interest...",4205888_0001.tif
8,1/1/1935,L. B.,Berndt,Mary Ellen,Berndt,Wm. S.,Bradley,Mortgage Investment Corporation,4.5%,$4600.00,"Lot Nineteen (19) of Palm Gardens, in the City...","Monthly payments of $25.58, including interest...",48167_4000820_0001.tif
9,16/09/1940,Kermit,Agee,Margaret,Agee,R. M.,Orth,FHA - For Use Under Title I Class 3 Loans on L...,,$2000.00,Lot Three (3) in Block One Hundred Forty-six (...,,4004912_0001.tif


In [None]:
# Display the df_kate frame (the reference side of the validate(...) calls) for manual inspection.
df_kate

Unnamed: 0,Date,Borrower_First,Borrower_Last,Second_Borrower_First,Second_Borrower_Last,Other_Party_First,Other_Party_Last,Lending_Bank,Interest_Rate,Loan_Amount,Location_for_Mortgage,Payment_Plan,Document_Name
0,10/28/1941,Palmer T.,Haynes,Ilenes,Haynes,R. M.,Orth,Texas City National Bank,4.50%,"$2,750.00",The East Twenty-five Feet (E-25') of Lot numbe...,monthly installments of twenty one & 04/100 do...,4201768_0001.tif
1,4/22/1942,Elmer Douglas,Laiche,Edna Pearl,Laiche,R. M.,Orth,Texas City National Bank,4.50%,"$3,000.00",Lot Number Four (4) and the adjoining West one...,monthly installments of twenty two & 95/100 do...,4204010_0001.tif
2,,"Home Builders, Inc.",,,,J.E.,Foster,"J.E. Foster & Son, Inc.",4.50%,"$3,950.00","The East twenty (20) feet of Lot Three (3), al...",monthly installments of Twenty-one and 96/100 ...,4302425_0001.tif
3,12/24/1941,George L.,Clayton,,,R. M.,Orth,Texas City National Bank,4.50%,"$3,000.00","Lots One (1), Two (2), and Three (3), Block nu...",monthly installments of twenty-two & 95/100 Do...,4202463_0001.tif
4,,Townsend Construction Company,,,,J.G.,Hestwood,"First Mortgage Company of Houston, Inc.",4.50%,"$4,100.00","Lots Nos. Nine (9) and Ten (10), in Block No. ...",monthly installments of Twenty-two and 80/100 ...,4306466_0001.tif
5,,"W.E. White, Inc.",,,,T. J.,Bettes,T. J. Bettes Company,4.50%,"$11,700.00",The East seventeen (17) feet and six (6) inche...,monthly installments of sixty-five and 05/100 ...,4302583_0001.tif
6,,"Home Builders, Inc.",,,,J.E.,Foster,"J.E. Foster & Son, Inc.",4.50%,"$3,950.00",Lot Ten (10) and the East twenty feet (20') of...,monthly installments of Twenty-one and 96/100 ...,4306964_0001.tif
7,,J. L. Martin Investment Company,,,,J.G.,Hestwood,"First Mortgage Company of Houston, Inc.",4.50%,"$3,850.00","Lots Nos. Three (3) and Four (4), in Block No....",monthly installments of Twenty-one and 41/100 ...,4205888_0001.tif
8,,L.B.,Berndt,Mary Ellen,Berndt,Wm. S.,Bradley,Mortgage Investment Corporation,4.50%,"$4,600.00","Lot Nineteen (19) of Palm Gardens, in the City...",monthly installments of twenty-five and 58/100...,48167_4000820_0001.tif
9,9/12/1940,Kermit E.,Agee,Margaret,Agee,R. M.,Orth,Texas City National Bank,4.50%,"$2,400.00",The East one-half (E 1/2) of Lot numbered Six ...,monthly installments of eighteen & 36/100 Doll...,4004912_0001.tif


In [None]:
# Agreement between two 4o outputs: df_4o_confidence vs df_4o_confidence_all.
matrix, document_df, feature_df, overall = validate(
    df_4o_confidence,
    df_4o_confidence_all,
    fuzz.token_sort_ratio,
)
overall

86.79714285714286

In [None]:
# Agreement between two 4o-mini outputs: df_4o_mini_confidence vs df_4o_mini_confidence_all.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini_confidence,
    df_4o_mini_confidence_all,
    fuzz.token_sort_ratio,
)
overall

86.48857142857143

In [None]:
# Agreement between df_4o and df_4o_confidence_all, the latter restricted to
# every name in `columns` except the last one.
matrix, document_df, feature_df, overall = validate(
    df_4o,
    df_4o_confidence_all[columns[:-1]],
    fuzz.token_sort_ratio,
)
overall

86.59076923076923

In [None]:
# Agreement between df_4o_mini and df_4o_mini_confidence_all, the latter
# restricted to every name in `columns` except the last one.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini,
    df_4o_mini_confidence_all[columns[:-1]],
    fuzz.token_sort_ratio,
)
overall

89.73846153846154

In [None]:
# Agreement between df_4o and df_4o_confidence, the latter restricted to
# every name in `columns` except the last one.
matrix, document_df, feature_df, overall = validate(
    df_4o,
    df_4o_confidence[columns[:-1]],
    fuzz.token_sort_ratio,
)
overall

87.92307692307692

In [None]:
# Agreement between df_4o_mini and df_4o_mini_confidence, the latter
# restricted to every name in `columns` except the last one.
matrix, document_df, feature_df, overall = validate(
    df_4o_mini,
    df_4o_mini_confidence[columns[:-1]],
    fuzz.token_sort_ratio,
)
overall


91.05230769230769

In [None]:
# Edge case: score an empty string against a non-empty one with token_set_ratio.
text1, text2 = '', "Name .. is my Uday"
token_set_ratio = fuzz.token_set_ratio(text1, text2)
token_set_ratio

100

# Ratio Playground

1. **Ratio** - Similarity score based on the Levenshtein (edit) distance


2. **Partial Ratio** - Substring matching (if the shorter string is a substring of the longer one, the score is 100)

3. **Token_Sort_Ratio** - Texts are tokenized and the tokens are sorted alphabetically. Then the Levenshtein-based ratio is computed.

4. **Token_Set_Ratio** - Tokenization + partial string matching + Levenshtein (tokenize, preprocess, find the common tokens, then compute the Levenshtein-based ratio).


In [None]:
# Plain ratio between the Borrower_First value at label 0 of each frame.
# With the values shown in the cells below ('Palmer' vs 'Palmer T.'), the
# strings differ by three inserted characters, giving a score of 80.
ratio = fuzz.ratio(
    df_4o['Borrower_First'][0],
    df_4o_mini['Borrower_First'][0],
)
ratio

80

In [None]:
# Value of Borrower_First at index label 0 in df_4o (first input to the ratio demos).
df_4o['Borrower_First'][0]

'Palmer'

In [None]:
# Value of Borrower_First at index label 0 in df_4o_mini (second input to the ratio demos).
df_4o_mini['Borrower_First'][0]

'Palmer T.'

In [None]:
# Partial ratio on the same pair of values: one string is found inside the
# other, so the score is 100.
partial_ratio = fuzz.partial_ratio(
    df_4o['Borrower_First'][0],
    df_4o_mini['Borrower_First'][0],
)
partial_ratio

100

In [None]:
# Token-sort ratio on the same pair of values. The score is higher than the
# plain ratio, probably because preprocessing strips the trailing '.'.
token_sort_ratio = fuzz.token_sort_ratio(
    df_4o['Borrower_First'][0],
    df_4o_mini['Borrower_First'][0],
)
token_sort_ratio

86

In [None]:
# Token-set ratio on the same pair of values: tokenization combined with
# matching on the shared tokens brings the score back to 100.
token_set_ratio = fuzz.token_set_ratio(
    df_4o['Borrower_First'][0],
    df_4o_mini['Borrower_First'][0],
)
token_set_ratio

100