In [1]:
# !pip install flair
import time
import string
import warnings

import regex as re
from tqdm import tqdm


import pickle
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.style as stl 



from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


warnings.filterwarnings("ignore")

In [2]:
documents = pd.read_csv("../input/skylarfreedman/Documents-Table 1.csv", on_bad_lines='skip', sep=";")

In [3]:
contract = documents[documents.Category == "Contracts"]
contract = contract[["document_id", "project_id", "transcription_text"]]

In [4]:
contract = contract[pd.notnull(contract['transcription_text'])]
contract = contract.reset_index(drop=True)

# Preprocessing

In [5]:
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation
stemmer = SnowballStemmer(language="english")
lemmatizer = WordNetLemmatizer()
stop_words.remove("of")
punct='!"#$%&\'()*+-/<=>?@[\\]^_`{|}~'

def process(s):
    """Removes stop words, words in brackets
    and parentheses, and special characters.
    """
    
    s= re.sub("([\(\[]).*?([\)\]])"," ",s)
    for p in punct:
        s = s.replace(p, '')
    s = word_tokenize(s)
    s = [w for w in s if not w in stop_words] #optional
    return s

In [6]:
contract["processed_text"] = contract["transcription_text"].apply(process)
contract["processed_text"] = contract["processed_text"].apply(lambda x: " ".join(x))

In [7]:
def parse_table_to_dict(example) :
    
    def clean_record(x) :
        if x.strip() == "":
            return " "
        return x.strip().replace("{", "").replace("}", "")

    table_dict = {}

    if "name" in example[0].lower():

        cols = []
        for col in example[0].split("|"):
            if col.strip() == "":
                cols.append(col)
            else:
                cols.append(col.strip().lower().replace(" ", "_").replace(".", ""))

        values = np.array([list(map(clean_record, i.split("|"))) for i in example[1:]])

    else:

        n_cols = len(example[0].split("|"))
        cols = [f"col_{i}" for i in range(n_cols)]

        values = np.array([list(map(clean_record, i.split("|"))) for i in example])
        
    
    #Remove empty rows (["----", "-----", "-----"])
    new_values = []
    for line in values:
        line_chars = list(set("".join([i.strip() for i in line])))
        
        if len(line_chars) != 0 and (line_chars[0] != "-" or len(line_chars) != 0):
            new_values.append(line.tolist())
            
    values = np.array(new_values)
            
    if values.reshape(-1).shape[0] != 0:
        for i, col in enumerate(cols):
            table_dict[col] = values[:, i].tolist()
            
        return table_dict
    
    else:
        
        return {} 

In [8]:
contract

Unnamed: 0,document_id,project_id,transcription_text,processed_text
0,NMAAHC-004567394_00861,11400,[D104 ENCL]\r\nNorth Carolina\r\nThis indentur...,North Carolina This indenture made entered Thi...
1,NMAAHC-004567394_00957,11400,[E 11 ENCL] \r\n\r\n\r\nState of North Carolin...,State of North Carolina Currituck County . Thi...
2,NMAAHC-004567394_00958,11400,"near Lizzie Etheridge's gate, thence South 56 ...","near Lizzie Etheridges gate , thence South 56 ..."
3,NMAAHC-004567395_00439,11406,Copy\r\n\r\nAgreement of Apprenticeship\r\nBu ...,Copy Agreement of Apprenticeship Bu Mrs. Kate ...
4,NMAAHC-004567395_00440,11406,[H 132 ENCL] \r\n\r\nCopy\r\n\r\nAgreement of ...,Copy Agreement of Apprenticeship . This agreem...
...,...,...,...,...
23724,NMAAHC-007677332_01525,41539,\N,N
23725,NMAAHC-007677332_01526,41539,\N,N
23726,NMAAHC-007677332_01527,41539,\N,N
23727,NMAAHC-007677332_01528,41539,Jo,Jo


In [9]:
t=[]
t.append("""[[7 column table]]
No. | NAME. | AGE. | WAGES. DOLLS. | [[WAGES.]] CTS. | MONTHLY PAYM'T. DOLLS. | [[MONTHLY PAYM'T.]] CTS.
[[/preprinted]]

1 | Sarah | 17 | 2 | 50 | 1 | 75 """)
t.append("""[[5 columned table]]
| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         
| --- | --- | --- | --- | --- | 
| 1861 | Phillip Carr | 13 | 100 | 00 |""")
t.append("""|NAMES.|AGE.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Frances her x mark|16|none|none.


DEPENDENTS.
|NAMES.|AGE.|NAMES.|AGE.|""")
t.append("""[[6 Columned Table]]
| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |
|---|---|---|---|---|---|
| Franklin | 21 | Male | 1st | $8.00 |
[[/6 columned table]]
""")
t.append("""[[3 columned table]]
| --- | --- | --- |
| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. 
M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | [[note]]

| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his crops |

| June 18 | Approved Contract between David S. Henry and Freedmen Parson Avery, Jackson Sanders and Reuben Sanders for farming dated June 6th 1866. | David S. Avery to furnish Lands Seeds and Teams. Parson Avery Jackson Sanders and Reuben Sanders freedmen to perform the manual labor necessary to make crops with said teams & Seeds and Lands furnished to keep up the fences and care properly for the stock and implements, and receive one third (1/3) of the crops. 
Sd. David S. Avery to receive two thirds (2/3) of the crops, and afford each of said Freedman a garden for Corn, Potatoes &c, &c rent free. | """)
t.append("""
[[5 column table]]

|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Margery X (her mark) Russell|30|female|none|none|

[[/5 column table]]

DEPENDENTS.

[[6 column table]]

|NAMES.|AGE.|SEX.|NAMES.|AGE.|SEX.|
|Phillis|9|female|   |   |   |
|Emma|6|female|   |   |   |

[[/6 column table]]""")
t.append("""|No.|Name|Age|Rate of pay per month. Dols Cts.
|1|Martha Willis|21|   | """)
t2=t

In [10]:
tb=["""[[table]]
No. | NAME. | AGE. | ^[[Twelve months]] WAGES. DOLLS. CTS. | MONTHLY PAYM'T. DOLLS. CTS. [[/preprinted]]
|Phil Shaver His x mark|33|$144|00|2|00
|John Leo His x mark|16|72|00|1|50
|Mumford Randleman His x mark|25|120|00|2|00
|Adam Fisher His x mark|25|144|00|2|00
|Emma Fisher Her x mark|19|72|00|2|00
|Joseph Thompson His x mark|19|144|00|2|00
|Henry Kelly Ale His mark x|20|144|00|2|00
|Samuel Gillespie His x mark|22|120|00|2|00
|Nathan Foard His x mark|40|144|00|2|00
|Jordan Cox, His x mark|19|120|00|2|00
|Ann Tyson Her x mark|18|84|00|2|00
|Liza Dixon Her x mark|30|84|00|2|00
|Ceily Auston Her x mark||25|00|84|00
|Leon Foard His x mark|13|72|00|2|00
|Silas Foard His x mark|10|60|00|2|00
|Alfred P. Dixon His x mark|10|48|00|2|00
|Mary Hall Her x mark|25|72|00|2|00
|Hicksey Her x mark|25|72|00|2|00
|Dennis His x mark|14|48|00|2|00
|George Little & His x mark|31|120|00|2|00
|Wife|23|96|00|1|00
|Will Alfred & His x mark|28|120|00|2|00
|Wife|19|96|00|2|00
|Henry Myers His x mark|22|144|00|2|00
|Thomas Granderson His x mark|24|120|00|2|00
|Fred Deaton His x mark|23|120|00|2|00
|[[Illegible - final line of page is cut off]]|14|72|00|2|00
[[/Table]]"""]
tb.append("""[[3 column table - no headers]]
|---|---|---|
|Stephen Barni-|25|Male|
|Eve. wife|22|F|
|Martin Morrsham|18.|M.|
|Wm Marshall|22|M.|
|Oliver Hudgrim|19|M.|
|John Lunford?|16|M.|
|George Hargroves-|21|M.|
|Richard Gibbs|15|M.|
|& Babe Fannie Martin|19|F|
|Senior Gates|33|M|
|Bates Hiate|22|M|
|John Hiate-|21|M|
|Ned Sides-|22|M|
|Lawson Patterson|22|M|
|John Riverly|30|M|
|Soleman Miller|28|M|
|Moses Todd|23|M|
|Peter Stubbs|33|M|
|Sarah wife|28|F|
|John Sim|12|M.|
|Sam Jones|38|M|
|Celia wife-|34|F|
|Arter Son|16|M|
|Sallie daughter|14|F|
|Jim son|12|M|
|Epraim Bennet|19|Male|
|Sam Marsh|24|Male| """)

In [11]:
print(tb[1])

[[3 column table - no headers]]
|---|---|---|
|Stephen Barni-|25|Male|
|Eve. wife|22|F|
|Martin Morrsham|18.|M.|
|Wm Marshall|22|M.|
|Oliver Hudgrim|19|M.|
|John Lunford?|16|M.|
|George Hargroves-|21|M.|
|Richard Gibbs|15|M.|
|& Babe Fannie Martin|19|F|
|Senior Gates|33|M|
|Bates Hiate|22|M|
|John Hiate-|21|M|
|Ned Sides-|22|M|
|Lawson Patterson|22|M|
|John Riverly|30|M|
|Soleman Miller|28|M|
|Moses Todd|23|M|
|Peter Stubbs|33|M|
|Sarah wife|28|F|
|John Sim|12|M.|
|Sam Jones|38|M|
|Celia wife-|34|F|
|Arter Son|16|M|
|Sallie daughter|14|F|
|Jim son|12|M|
|Epraim Bennet|19|Male|
|Sam Marsh|24|Male| 


In [12]:
print(t2[4])

[[3 columned table]]
| --- | --- | --- |
| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. 
M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | [[note]]

| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his crops |

| June

In [13]:
t1=t2

In [14]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0

for table_text in t2:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")   
        
        condition_1 = len(separator_counts) == 0 or separator_count == separator_counts[-1]
        condition_2 = separator_count != 0
            
        #If you find the beginning of a table, take note
        if condition_1 and condition_2:
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0:
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        
    if streak > 0:
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            table = lines[pair[0]-1:pair[1]]
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
        
        
    idx += 1

In [15]:
tables_extracted

[[],
 ['| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 [],
 ['| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|'],
 [],
 ['|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|',
  '|Margery X (her mark) Russell|30|female|none|none|'],
 []]

In [16]:
dcts = []

for i, example in enumerate(tables_extracted):
    if example != [] and len(example) > 2:
        dcts.append(parse_table_to_dict(example))
    else:
        dcts.append({})

In [17]:
dcts

[{}, {}, {}, {}, {}, {}, {}]

In [18]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0

for table_text in t2:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    lns=[]
#     """i removed lines without separator """
#     for j in lines:
#          if  j.count("|") ==0:
#                 lines.remove(j)
                
        
#    print(lines , "\n\n")
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")
        
#         if separator_count == 0:
#             continue
            
        
        #print(line ,separator_count, "\n\n")
        
        """i add the condition where the next line contain more and less separators than the previous line """
        
        condition_1 = len(separator_counts) == 0  or separator_count == separator_counts[-1] or separator_count > separator_counts[-1] or separator_count < separator_counts[-1]
        condition_2 = separator_count != 0
            
         #If you find the beginning of a table, take note
        if (condition_1 and condition_2) :
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0  :
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        
    
    if (streak >= 0):
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            table = lines[pair[0]-1:pair[1]]
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
     
    
    idx +=1

In [19]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

5


In [20]:
print(t2[1]) #separators less than the previous lines

[[5 columned table]]
| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         
| --- | --- | --- | --- | --- | 
| 1861 | Phillip Carr | 13 | 100 | 00 |


In [21]:
tables_extracted

[['7 column table',
  "No. | NAME. | AGE. | WAGES. DOLLS. | WAGES. CTS. | MONTHLY PAYM'T. DOLLS. | MONTHLY PAYM'T. CTS."],
 ['5 columned table',
  '| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 [],
 ['6 Columned Table',
  '| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|',
  '| Franklin | 21 | Male | 1st | $8.00 |'],
 ['3 columned table',
  '| --- | --- | --- |',
  "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. ",
  'M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to han

In [22]:
print(t2[5])


[[5 column table]]

|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Margery X (her mark) Russell|30|female|none|none|

[[/5 column table]]

DEPENDENTS.

[[6 column table]]

|NAMES.|AGE.|SEX.|NAMES.|AGE.|SEX.|
|Phillis|9|female|   |   |   |
|Emma|6|female|   |   |   |

[[/6 column table]]


In [23]:
print(t2[2])

|NAMES.|AGE.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Frances her x mark|16|none|none.


DEPENDENTS.
|NAMES.|AGE.|NAMES.|AGE.|


In [24]:
tb[0]

"[[table]]\nNo. | NAME. | AGE. | ^[[Twelve months]] WAGES. DOLLS. CTS. | MONTHLY PAYM'T. DOLLS. CTS. [[/preprinted]]\n|Phil Shaver His x mark|33|$144|00|2|00\n|John Leo His x mark|16|72|00|1|50\n|Mumford Randleman His x mark|25|120|00|2|00\n|Adam Fisher His x mark|25|144|00|2|00\n|Emma Fisher Her x mark|19|72|00|2|00\n|Joseph Thompson His x mark|19|144|00|2|00\n|Henry Kelly Ale His mark x|20|144|00|2|00\n|Samuel Gillespie His x mark|22|120|00|2|00\n|Nathan Foard His x mark|40|144|00|2|00\n|Jordan Cox, His x mark|19|120|00|2|00\n|Ann Tyson Her x mark|18|84|00|2|00\n|Liza Dixon Her x mark|30|84|00|2|00\n|Ceily Auston Her x mark||25|00|84|00\n|Leon Foard His x mark|13|72|00|2|00\n|Silas Foard His x mark|10|60|00|2|00\n|Alfred P. Dixon His x mark|10|48|00|2|00\n|Mary Hall Her x mark|25|72|00|2|00\n|Hicksey Her x mark|25|72|00|2|00\n|Dennis His x mark|14|48|00|2|00\n|George Little & His x mark|31|120|00|2|00\n|Wife|23|96|00|1|00\n|Will Alfred & His x mark|28|120|00|2|00\n|Wife|19|96|00|2|00

In [25]:
t2[0]

"[[7 column table]]\nNo. | NAME. | AGE. | WAGES. DOLLS. | [[WAGES.]] CTS. | MONTHLY PAYM'T. DOLLS. | [[MONTHLY PAYM'T.]] CTS.\n[[/preprinted]]\n\n1 | Sarah | 17 | 2 | 50 | 1 | 75 "

In [26]:
t2[-1]

'|No.|Name|Age|Rate of pay per month. Dols Cts.\n|1|Martha Willis|21|   | '

In [27]:
tables_extracted

[['7 column table',
  "No. | NAME. | AGE. | WAGES. DOLLS. | WAGES. CTS. | MONTHLY PAYM'T. DOLLS. | MONTHLY PAYM'T. CTS."],
 ['5 columned table',
  '| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 [],
 ['6 Columned Table',
  '| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|',
  '| Franklin | 21 | Male | 1st | $8.00 |'],
 ['3 columned table',
  '| --- | --- | --- |',
  "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. ",
  'M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to han

In [28]:
tables_extracted

[['7 column table',
  "No. | NAME. | AGE. | WAGES. DOLLS. | WAGES. CTS. | MONTHLY PAYM'T. DOLLS. | MONTHLY PAYM'T. CTS."],
 ['5 columned table',
  '| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 [],
 ['6 Columned Table',
  '| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|',
  '| Franklin | 21 | Male | 1st | $8.00 |'],
 ['3 columned table',
  '| --- | --- | --- |',
  "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. ",
  'M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to han

In [29]:
separator_counts

[4, 5]

In [30]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

5


In [31]:
# for i in (contract[contract.tables_parsed == {}].sample(n=2).transcription_text.tolist()):

#     print(i)
#     print("\n==========\n")