In [1]:
# !pip install flair
import time
import string
import warnings

import regex as re
from tqdm import tqdm


import pickle
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.style as stl 



from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


warnings.filterwarnings("ignore")

In [2]:
documents = pd.read_csv("../input/skylarfreedman/Documents-Table 1.csv", on_bad_lines='skip', sep=";")

In [3]:
contract = documents[documents.Category == "Contracts"]
contract = contract[["document_id", "project_id", "transcription_text"]]

In [4]:
contract = contract[pd.notnull(contract['transcription_text'])]
contract = contract.reset_index(drop=True)

# Preprocessing

In [5]:
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation
stemmer = SnowballStemmer(language="english")
lemmatizer = WordNetLemmatizer()
stop_words.remove("of")
punct='!"#$%&\'()*+-/<=>?@[\\]^_`{|}~'

def process(s):
    """Removes stop words, words in brackets
    and parentheses, and special characters.
    """
    
    s= re.sub("([\(\[]).*?([\)\]])"," ",s)
    for p in punct:
        s = s.replace(p, '')
    s = word_tokenize(s)
    s = [w for w in s if not w in stop_words] #optional
    return s

In [6]:
contract["processed_text"] = contract["transcription_text"].apply(process)
contract["processed_text"] = contract["processed_text"].apply(lambda x: " ".join(x))

In [7]:
def parse_table_to_dict(example) :
    
    def clean_record(x) :
        if x.strip() == "":
            return " "
        return x.strip().replace("{", "").replace("}", "")

    table_dict = {}

    if "name" in example[0].lower():

        cols = []
        for col in example[0].split("|"):
            if col.strip() == "":
                cols.append(col)
            else:
                cols.append(col.strip().lower().replace(" ", "_").replace(".", ""))

        values = np.array([list(map(clean_record, i.split("|"))) for i in example[1:]])

    else:

        n_cols = len(example[0].split("|"))
        cols = [f"col_{i}" for i in range(n_cols)]

        values = np.array([list(map(clean_record, i.split("|"))) for i in example])
        
    
    #Remove empty rows (["----", "-----", "-----"])
    new_values = []
    for line in values:
        line_chars = list(set("".join([i.strip() for i in line])))
        
        if len(line_chars) != 0 and (line_chars[0] != "-" or len(line_chars) != 0):
            new_values.append(line.tolist())
            
    values = np.array(new_values)
            
    if values.reshape(-1).shape[0] != 0:
        for i, col in enumerate(cols):
            table_dict[col] = values[:, i].tolist()
            
        return table_dict
    
    else:
        
        return {} 

In [8]:
contract

Unnamed: 0,document_id,project_id,transcription_text,processed_text
0,NMAAHC-004567394_00861,11400,[D104 ENCL]\r\nNorth Carolina\r\nThis indentur...,North Carolina This indenture made entered Thi...
1,NMAAHC-004567394_00957,11400,[E 11 ENCL] \r\n\r\n\r\nState of North Carolin...,State of North Carolina Currituck County . Thi...
2,NMAAHC-004567394_00958,11400,"near Lizzie Etheridge's gate, thence South 56 ...","near Lizzie Etheridges gate , thence South 56 ..."
3,NMAAHC-004567395_00439,11406,Copy\r\n\r\nAgreement of Apprenticeship\r\nBu ...,Copy Agreement of Apprenticeship Bu Mrs. Kate ...
4,NMAAHC-004567395_00440,11406,[H 132 ENCL] \r\n\r\nCopy\r\n\r\nAgreement of ...,Copy Agreement of Apprenticeship . This agreem...
...,...,...,...,...
23724,NMAAHC-007677332_01525,41539,\N,N
23725,NMAAHC-007677332_01526,41539,\N,N
23726,NMAAHC-007677332_01527,41539,\N,N
23727,NMAAHC-007677332_01528,41539,Jo,Jo


In [9]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0

for table_text in contract_text:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")   
        
        condition_1 = len(separator_counts) == 0 or separator_count == separator_counts[-1]
        condition_2 = separator_count != 0
            
        #If you find the beginning of a table, take note
        if condition_1 and condition_2:
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0:
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        
    if streak > 0:
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            table = lines[pair[0]-1:pair[1]]
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
        
        
    idx += 1
        

In [10]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

4475


In [11]:
dcts = []
L=[]
nxt=[]
for i, example in enumerate(tables_extracted):
    if example != [] and len(example) > 0:
        L.append(example)
        dcts.append(parse_table_to_dict(example))
        nxt.append(parse_table_to_dict(example))
    else:
        dcts.append({})

In [12]:
l=[]
for i in tables_extracted:
    if len(i)>0:
        l.append(i)

In [13]:
len(l)

4475

In [14]:
nb=0
for example in(dcts):
    if len(example)  > 0 :
        nb=nb+1
print(nb)

4446


## Getting tables

In [15]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0

for table_text in contract_text:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")   
        
        condition_1 = len(separator_counts) == 0 or separator_count == separator_counts[-1]
        condition_2 = separator_count != 0
            
        #If you find the beginning of a table, take note ,condition_1 and
        if condition_1 and condition_2:
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0:
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        
    if streak > 0:
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            table = lines[pair[0]-1:pair[1]]
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
        
        
    idx += 1
        

In [16]:
idx

23729

In [17]:
nb=0
for i, example in enumerate(tables_extracted):
    if len(example)!= 0 :
        nb=nb+1
print(nb)

4475


In [18]:
dcts = []

for i, example in enumerate(tables_extracted):
    if example != [] and len(example) > 0:
        dcts.append(parse_table_to_dict(example))
    else:
        dcts.append({})
        

In [19]:
contract["tables_parsed"] = dcts

In [20]:
nb=0
for i, example in enumerate(dcts):
    if len(example)!= 0 :
        nb=nb+1
print(nb)

4446


In [21]:
for i in (contract[contract.tables_parsed != {}].sample(n=1).transcription_text.tolist()):

    print(i)
    print("\n==========\n")

182        183

Contracts 1868
[[6 column table across 2 pages]]

| Date | Employer | Employee |  County | Terms of Contract | Remarks|
|---|---|---|---|---|---|

| Feby 8 |I D   Langford | Moris Vallandingham Berry Rodnele (F) |Waren | Each hand has 15 Bush corn 150 lbs meat & one fourth crop. pay each for grains not over $5.00 | Filed   copy furnished  Langford Mrch 12/68

| Feby 8 | R.H Pynes | Sam Williams (F) Israel Williams (F) | Granville | One horse furnished by Pyne Pay is 1/2 expenses of the farm & divides the crop equally | Filed | 

| Feby 12 | James M. Vaughn | Warren Steed (F) | Waren | Warren furnishes himself & 2 others pays 1/2 [[?]] a/c. & 3 bbls corn & 800 lbs fodder to feeding horse  Crop divided equally. Fertilizers not to cost more that $10 | Filed | 

| Feby 13 | John J. Vaughn | Plummer Fritty (F) | Warner | Plummer furnishes himself & 1 other  has 1/2 the crop. pays 4 bbls corn 900 lbs fodder towards feeding horse Plummer has what he makes on a ce

In [22]:
t=[]
t.append("""[[9 column table]]
| Date of Contract. | Names. | Age. | No. of Classification. |  Names of Dependents. | Age. | Deduction for Depend'nts | Amount of Wages aftr deduction. | Remarks. |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
[[table is blank]]""")

In [23]:
t.append("""[[5 columned table]]
| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         
| --- | --- | --- | --- | --- | 
| 1861 | Phillip Carr | 13 | 100 | 00 |""")

In [24]:
t.append("""|NAMES.|AGE.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Frances her x mark|16|none|none.


DEPENDENTS.
|NAMES.|AGE.|NAMES.|AGE.|""")

In [25]:
t.append("""[[6 Columned Table]]
| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |
|---|---|---|---|---|---|
| Franklin | 21 | Male | 1st | $8.00 |
[[/6 columned table]]
""")
t.append("""[[3 columned table]]
| --- | --- | --- |
| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. 
M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | [[note]]

| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his crops |

| June 18 | Approved Contract between David S. Henry and Freedmen Parson Avery, Jackson Sanders and Reuben Sanders for farming dated June 6th 1866. | David S. Avery to furnish Lands Seeds and Teams. Parson Avery Jackson Sanders and Reuben Sanders freedmen to perform the manual labor necessary to make crops with said teams & Seeds and Lands furnished to keep up the fences and care properly for the stock and implements, and receive one third (1/3) of the crops. 
Sd. David S. Avery to receive two thirds (2/3) of the crops, and afford each of said Freedman a garden for Corn, Potatoes &c, &c rent free. | """)

In [26]:
t.append("""
[[5 column table]]

|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Margery X (her mark) Russell|30|female|none|none|

[[/5 column table]]

DEPENDENTS.

[[6 column table]]

|NAMES.|AGE.|SEX.|NAMES.|AGE.|SEX.|
|Phillis|9|female|   |   |   |
|Emma|6|female|   |   |   |

[[/6 column table]]""")
L.append("""[[4 Columned Table]]
| Names. | Age. | No. of Dependents. | Remarks. |
| --- |--- | --- | --- |
[[/ 4 Colummed Table]]""")

In [27]:
t2=t

In [28]:
len(t2)

6

In [29]:
t2[5]

'\n[[5 column table]]\n\n|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|\n|Margery X (her mark) Russell|30|female|none|none|\n\n[[/5 column table]]\n\nDEPENDENTS.\n\n[[6 column table]]\n\n|NAMES.|AGE.|SEX.|NAMES.|AGE.|SEX.|\n|Phillis|9|female|   |   |   |\n|Emma|6|female|   |   |   |\n\n[[/6 column table]]'

****

****

In [30]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extractedd = []
all_table_pairs = []
idx = 0

for table_text in t2:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")   
        
        condition_1 = len(separator_counts) == 0 or separator_count == separator_counts[-1]
        condition_2 = separator_count != 0
            
        #If you find the beginning of a table, take note ,condition_1 and
        if condition_1 and condition_2:
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0:
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        
    if streak > 0:
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            table = lines[pair[0]-1:pair[1]]
            tmp_tables.append(table)
            
        tables_extractedd.append(tmp_tables[0])
        
    else:
        tables_extractedd.append([])
        
        
    idx += 1
        

In [31]:
nb=0
for i, example in enumerate(tables_extractedd):
    if len(example) > 0 :
        nb=nb+1
print(nb)

4


In [32]:
t2[3]

'[[6 Columned Table]]\n| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |\n|---|---|---|---|---|---|\n| Franklin | 21 | Male | 1st | $8.00 |\n[[/6 columned table]]\n'

In [33]:
print(t2[0])

[[9 column table]]
| Date of Contract. | Names. | Age. | No. of Classification. |  Names of Dependents. | Age. | Deduction for Depend'nts | Amount of Wages aftr deduction. | Remarks. |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
[[table is blank]]


In [34]:
tables_extractedd[0]

["| Date of Contract. | Names. | Age. | No. of Classification. |  Names of Dependents. | Age. | Deduction for Depend'nts | Amount of Wages aftr deduction. | Remarks. |",
 '| --- | --- | --- | --- | --- | --- | --- | --- | --- |']

In [35]:
tables_extractedd

[["| Date of Contract. | Names. | Age. | No. of Classification. |  Names of Dependents. | Age. | Deduction for Depend'nts | Amount of Wages aftr deduction. | Remarks. |",
  '| --- | --- | --- | --- | --- | --- | --- | --- | --- |'],
 ['| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 [],
 ['| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|'],
 [],
 ['|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|',
  '|Margery X (her mark) Russell|30|female|none|none|']]

In [36]:
dcts = []

for i, example in enumerate(tables_extractedd):
    if example != [] and len(example) > 1:
        dcts.append(parse_table_to_dict(example))
    else:
        dcts.append({})
        

In [37]:
nb=0
for i, example in enumerate(dcts):
    if len(example) > 0 :
        nb=nb+1
print(nb)

4


In [38]:
dcts

[{'': [' '],
  'date_of_contract': ['---'],
  'names': ['---'],
  'age': ['---'],
  'no_of_classification': ['---'],
  'names_of_dependents': ['---'],
  "deduction_for_depend'nts": ['---'],
  'amount_of_wages_aftr_deduction': ['---'],
  'remarks': ['---']},
 {'': [' '],
  'no': ['---'],
  'names': ['---'],
  'age': ['---'],
  'rate_of_pay_per_month_dols': ['---'],
  'rate_of_pay_per_month_cents': ['---'],
  '         ': [' ']},
 {},
 {'': [' '],
  'name': ['---'],
  'age': ['---'],
  'sex': ['---'],
  'class': ['---'],
  'monthly_rate_of_wages': ['---'],
  'interest_in_profits': ['---']},
 {},
 {'': [' '],
  'names': ['Margery X (her mark) Russell'],
  'age': ['30'],
  'sex': ['female'],
  'monthly_rate_of_wages': ['none'],
  'interest_in_profits': ['none']}]

In [39]:
print(t2[5])


[[5 column table]]

|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Margery X (her mark) Russell|30|female|none|none|

[[/5 column table]]

DEPENDENTS.

[[6 column table]]

|NAMES.|AGE.|SEX.|NAMES.|AGE.|SEX.|
|Phillis|9|female|   |   |   |
|Emma|6|female|   |   |   |

[[/6 column table]]


In [40]:
for table_text in t2[:5]:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    liness = table_text.split("\n")
    lines = [i for i in liness if len(i.strip()) != 0]
    
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")   
        
        condition_1 = len(separator_counts) == 0 or separator_count == separator_counts[-1]
        condition_2 = separator_count != 0
        
        if condition_1 and condition_2:
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0:
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)

In [41]:
separator_counts

[0, 4, 3, 1, 4, 3, 1]

In [42]:
condition_2

True

In [43]:
table_text

"3 columned table\n| --- | --- | --- |\n| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. \nM.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | note\n\n| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his crops |\n\n| June

In [44]:
liness

['3 columned table',
 '| --- | --- | --- |',
 "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. ",
 'M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | note',
 '',
 "| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his c

In [45]:
lines

['3 columned table',
 '| --- | --- | --- |',
 "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. ",
 'M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | note',
 "| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his crops 