In [1]:
# !pip install flair
import time
import string
import warnings

import regex as re
from tqdm import tqdm


import pickle
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.style as stl 



from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer


warnings.filterwarnings("ignore")

In [2]:
documents = pd.read_csv("../input/skylarfreedman/Documents-Table 1.csv", on_bad_lines='skip', sep=";")

In [3]:
contract = documents[documents.Category == "Contracts"]
contract = contract[["document_id", "project_id", "transcription_text"]]

In [4]:
contract = contract[pd.notnull(contract['transcription_text'])]
contract = contract.reset_index(drop=True)

# Preprocessing

In [5]:
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation
stemmer = SnowballStemmer(language="english")
lemmatizer = WordNetLemmatizer()
stop_words.remove("of")
punct='!"#$%&\'()*+-/<=>?@[\\]^_`{|}~'

def process(s):
    """Removes stop words, words in brackets
    and parentheses, and special characters.
    """
    
    s= re.sub("([\(\[]).*?([\)\]])"," ",s)
    for p in punct:
        s = s.replace(p, '')
    s = word_tokenize(s)
    s = [w for w in s if not w in stop_words] #optional
    return s

In [6]:
contract["processed_text"] = contract["transcription_text"].apply(process)
contract["processed_text"] = contract["processed_text"].apply(lambda x: " ".join(x))

In [7]:
def parse_table_to_dict(example) :
    
    def clean_record(x) :
        if x.strip() == "":
            return " "
        return x.strip().replace("{", "").replace("}", "")

    table_dict = {}

    if "name" in example[0].lower():

        cols = []
        for col in example[0].split("|"):
            if col.strip() == "":
                cols.append(col)
            else:
                cols.append(col.strip().lower().replace(" ", "_").replace(".", ""))

        values = np.array([list(map(clean_record, i.split("|"))) for i in example[1:]])

    else:

        n_cols = len(example[0].split("|"))
        cols = [f"col_{i}" for i in range(n_cols)]

        values = np.array([list(map(clean_record, i.split("|"))) for i in example])
        
    
    #Remove empty rows (["----", "-----", "-----"])
    new_values = []
    for line in values:
        line_chars = list(set("".join([i.strip() for i in line])))
        
        if len(line_chars) != 0 and (line_chars[0] != "-" or len(line_chars) != 0):
            new_values.append(line.tolist())
            
    values = np.array(new_values)
            
    if values.reshape(-1).shape[0] != 0:
        for i, col in enumerate(cols):
            table_dict[col] = values[:, i].tolist()
            
        return table_dict
    
    else:
        
        return {} 

In [8]:
contract

Unnamed: 0,document_id,project_id,transcription_text,processed_text
0,NMAAHC-004567394_00861,11400,[D104 ENCL]\r\nNorth Carolina\r\nThis indentur...,North Carolina This indenture made entered Thi...
1,NMAAHC-004567394_00957,11400,[E 11 ENCL] \r\n\r\n\r\nState of North Carolin...,State of North Carolina Currituck County . Thi...
2,NMAAHC-004567394_00958,11400,"near Lizzie Etheridge's gate, thence South 56 ...","near Lizzie Etheridges gate , thence South 56 ..."
3,NMAAHC-004567395_00439,11406,Copy\r\n\r\nAgreement of Apprenticeship\r\nBu ...,Copy Agreement of Apprenticeship Bu Mrs. Kate ...
4,NMAAHC-004567395_00440,11406,[H 132 ENCL] \r\n\r\nCopy\r\n\r\nAgreement of ...,Copy Agreement of Apprenticeship . This agreem...
...,...,...,...,...
23724,NMAAHC-007677332_01525,41539,\N,N
23725,NMAAHC-007677332_01526,41539,\N,N
23726,NMAAHC-007677332_01527,41539,\N,N
23727,NMAAHC-007677332_01528,41539,Jo,Jo


In [9]:
t=[]
t.append("""[[7 column table]]
No. | NAME. | AGE. | WAGES. DOLLS. | [[WAGES.]] CTS. | MONTHLY PAYM'T. DOLLS. | [[MONTHLY PAYM'T.]] CTS.
[[/preprinted]]

1 | Sarah | 17 | 2 | 50 | 1 | 75 """)
t.append("""[[5 columned table]]
| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         
| --- | --- | --- | --- | --- | 
| 1861 | Phillip Carr | 13 | 100 | 00 |""")
t.append("""|NAMES.|AGE.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Frances her x mark|16|none|none.


DEPENDENTS.
|NAMES.|AGE.|NAMES.|AGE.|""")
t.append("""[[6 Columned Table]]
| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |
|---|---|---|---|---|---|
| Franklin | 21 | Male | 1st | $8.00 |
[[/6 columned table]]
""")
t.append("""[[3 columned table]]
| --- | --- | --- |
| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merritt Sanders one third (1/3) of crops made. 
M.S. is to work on said Lands the whole year. pay one third (1/3) of the hire to hands and furnish some assistance on sd. Lands from his family. | [[note]]

| June 18 | Approved Contract between W.H. Sanders and Thomas Campbell 'fm, for farming dated March 1st 1866- | W.H. Sanders is to furnish lands, one horse, seed and feed for horse and receive two thirds (2/3) of crops. Freedman Thomas Campbell is to furnish himself and produce crops on said Lands and retain one third (1/3) of crops, and to keep the ditches open on said lands, occupied by his crops |

| June 18 | Approved Contract between David S. Henry and Freedmen Parson Avery, Jackson Sanders and Reuben Sanders for farming dated June 6th 1866. | David S. Avery to furnish Lands Seeds and Teams. Parson Avery Jackson Sanders and Reuben Sanders freedmen to perform the manual labor necessary to make crops with said teams & Seeds and Lands furnished to keep up the fences and care properly for the stock and implements, and receive one third (1/3) of the crops. 
Sd. David S. Avery to receive two thirds (2/3) of the crops, and afford each of said Freedman a garden for Corn, Potatoes &c, &c rent free. | """)
t.append("""
[[5 column table]]

|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|Margery X (her mark) Russell|30|female|none|none|

[[/5 column table]]

DEPENDENTS.

[[6 column table]]

|NAMES.|AGE.|SEX.|NAMES.|AGE.|SEX.|
|Phillis|9|female|   |   |   |
|Emma|6|female|   |   |   |

[[/6 column table]]""")
t.append("""|No.|Name|Age|Rate of pay per month. Dols Cts.
|1|Martha Willis|21|   | """)
notex=t
notex.append("""[[7 column table]]
No. | NAME. | Age. | Wages. DOLLS. | CTS. | Monthly Paym't. DOLLS. | CTS.
[[/preprinted]]

- | Church Fisher further agrees to pay Said Laborer | 18 | $2 | 75 | - | -

- | The Laborere to receive (1) one dollar per month each of the several months Specified the balance at expiration of this Contract the amount being $12 75/100 | - | - | - | $1 | 00.

[[/table]]""")
notex.append("""shipment of products shall be made until the Provost Marshal of Freedmen shall certify that all dues to laborers are paid or satisfactorily arranged.
IN TESTIMONY WHEREOF, The said parties have affixed their names to this arrangement, at Yazoo Go
State of Mississippi, on the day and date aforesaid.
[[6 Columned Table]]
|NAMES|AGE|SEX|CLASS|MONTHLY RATE OF WAGES|INTEREST IN PROFITS|
| --- | --- | --- | --- | --- | --- |
|Richard Woodbury | 35 | Male | 1 |   |  
|Dick Woodbury | 20 | Male | 1 |   |  
|Daniel Woodbury | 36 | Male | 1 |  |   
|William Woodbury | 20 | Male | 1 |  |  
|March Woodbury | 60 | Male | 1 |   |
|Rachel Simmons  |55 |   |   |   |
Geo. W. Woodbury
Witness - John. H. McCalister
""")
notex.append("""And the said ^[[James Chadwick]] has agreed that he will furnish the said laborers with comfortable quarters, sufficient rations, and the amount of money per month, which stands opposite their respective names; that he will treat them kindly, and encourage the establishment of schools for their children.

[[7 column table]]
No. | NAME. | AGE. | WAGES. DOLLS. | [[WAGES.]] CTS. | MONTHLY PAYM'T. DOLLS. | [[MONTHLY PAYM'T]] CTS.
[[/preprinted]]

1 | Jerry Pigott | 17 | 8 | 00 | 4 | 00

[[/7 column table]]

[[signature]] Richard Dillon [[/signature]]""")
notex.append("""[[4 Columned Table]]
| Name. | Age. | No. of Dependents. | Remarks. | 
| --- | --- | --- | --- |

| Sally Brown His X Mark | 30 | 2 | under four years of age
| Captain His X Mark | 51 |   |
| Harrison His X Mark | 23 |   |
| Taner His X Mark  57 |   |
| Ida  Her X Mark | 19 |   |
| Louisia Her X Mark | 40 | 4 under ten years of age |
| Dallace His X Mark 20 |   |
| Vick His X Mark | 14 |   |  
| Minnie His X Mark | 55 |   | Rheumatism unable to support himself |
| Mattie Her X Mark | 54 |   |
| Hester Her X Mark | 28 | 6 | under ten years of age |   

No 198.
A. C. Jennings""")
notex.append("""In witness whereof the parties hereto affix their hands and seals on the day and year first above written Asher & Boyed {Seal}
[[7 Columned Table]]
|Names|age|Pay pr Mo-|Pay pr Year|Remarks|
|---|---|---|---|---|---|---|
|Frank his x mark Princeton|32|   |   |$150|00|Began 1st January 1866|
|James his x mark Armstrong|61|   |   |$100|00|Began 30th day January 1866
|Daniel x Johnston|36|   |   |$175|00|Began 5th day of February 1866|""")
notex.append("""[[5 columned table]]
|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|
|---|---|---|---|---|
|George X his mark|18|male|Food clothing medical attention|
|Susannah X mark|15|Female|Food clothing medical attention|
|Silva X mark|55|Female|Food clothing medical attention|
|Beck X mark|24|Female|Food clothing medical attention|
|Patiric X mark|55|male|Food clothing medical attention|

[[left margin]]Paid[[/left margin]]""")
notex.append("""[[3 columned table]]
DEPENDENTS
|NAMES.|AGE.|SEX.|
|---|---|---|
|Westly|7|male|
|Charley|3|male|""")
notex.append("""IN TESTIMONY WHEREOF, the said parties have affixed their names to this agreement, at Brookhaven State Mississippi of [[blank]] on the day and date aforesaid.

[[6 columned table]]

| NAMES. | AGE. | SEX. | CLASS. | MONTHLY RATE OF WAGES | INTEREST IN PROFITS. |
|---|---|---|---|---|---|
[[paired]]
| Ellie, | 55, | Male, | 3 | 2 girls. | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Julia, Ann, - | 45, | Fem, | 3 | 2 Boys. | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |
[[/paired]]

| John, | 60, | Male, | 3 | Maintenance, | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Jim, | 27 ,| Male | 3 | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Emmanuel, | 18, | Male | 3 | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Arthur, | 17, | Male | 3 | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Andy, | 15, | Male | 3 | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Emoline, | 18, | Fem | 3 | $2.00. | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

| Ann, | 21, | Fem | 3 | $2.00. | (4) Four acres of Land planted with corn, and (60) Sixty Bushels of Corn to be devided. |

Executed in presence of 
1st lieut. Pro. Mar. of Freedmen""")

In [10]:
len(notex)

15

In [11]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0

for table_text in notex:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")   
        
        condition_1 = len(separator_counts) == 0 or separator_count == separator_counts[-1]
        condition_2 = separator_count != 0
            
        #If you find the beginning of a table, take note
        if condition_1 and condition_2:
            if streak == 0:
                start = line_n
            streak += 1
        else:
            if streak != 0:
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        
    if streak > 0:
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            table = lines[pair[0]-1:pair[1]]
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
        
        
    idx += 1

In [12]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

10


In [13]:
tables_extracted

[[],
 ['| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 [],
 ['| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|'],
 [],
 ['|NAMES.|AGE.|SEX.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|',
  '|Margery X (her mark) Russell|30|female|none|none|'],
 [],
 ['- | Church Fisher further agrees to pay Said Laborer | 18 | $2 | 75 | - | -',
  '- | The Laborere to receive (1) one dollar per month each of the several months Specified the balance at expiration of this Contract the amount being $12 75/100 | - | - | - | $1 | 00.'],
 ['|NAMES|AGE|SEX|CLASS|MONTHLY RATE OF WAGES|INTEREST IN PROFITS|',
  '| --- | --- | --- | --- | --- | --- |'],
 [],
 ['| Name. | Age. | No. of Dependents. | Remarks. | ',
  '| --- | --- | --- | --- |'],
 ['|---|---|---|---|---|---|---|',
  '|Frank his x mark Princeton|32|   |   |$150|00|Began 1st January 1866|'],
 ['|NAMES.|AG

In [14]:
dcts = []

for i, example in enumerate(tables_extracted):
    if example != [] and len(example) > 2:
        dcts.append(parse_table_to_dict(example))
    else:
        dcts.append({})

In [15]:
dcts

[{},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {'': [' ', ' '],
  'names': ['---', 'Westly'],
  'age': ['---', '7'],
  'sex': ['---', 'male']},
 {}]

In [16]:
# contract["tables_parsed"]=dcts

In [17]:
nb=0
for example in (dcts):
    if len(example)> 0 :
        nb=nb+1
print(nb)

1


In [18]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0
sp=[]

for table_text in notex:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    lns=[]
#     """i removed lines without separator """
#     for j in lines:
#          if  j.count("|") ==0:
#                 lines.remove(j)
    
    #print(lines , "\n\n")
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")
        
#         if separator_count == 0:
#             continue
        
        
        
        #print(line ,separator_count, "\n\n")
        
        """i add the condition where the next line contain more and less separators than the previous line """
        
        condition_1 = len(separator_counts) == 0  or separator_count == separator_counts[-1] or separator_count > separator_counts[-1] or separator_count < separator_counts[-1]
        condition_2 = separator_count != 0
        condition_22= len(separator_counts) == 0 
        condition_3 = len(separator_counts) !=0 and separator_count ==0
            
         #If you find the beginning of a table, take note
        if (condition_1 and condition_2) or (condition_22 and condition_3)  :
            if streak == 0 :
                start = line_n
            streak += 1
        elif condition_1 or condition_3 :
            if  streak ==0 :
                
                start = line_n
            streak +=1
        else:
            if streak != 0  :
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        a=separator_count
        
    sp.append(separator_counts)
    
    
    if (streak > 0):
        
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
       
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            
            "remove(-1) : it extrcat more tables , and in the already exist table , there is difference just the description like <3 column table> ... " 
            table = lines[pair[0]:pair[1]]
            
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
     
    
    idx +=1

In [19]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

15


In [20]:
#  print(all_table_pairs) , len(all_table_pairs)

In [21]:
# sp

In [22]:
tables_extracted[10]

['4 Columned Table',
 '| Name. | Age. | No. of Dependents. | Remarks. | ',
 '| --- | --- | --- | --- |',
 '| Sally Brown His X Mark | 30 | 2 | under four years of age',
 '| Captain His X Mark | 51 |   |',
 '| Harrison His X Mark | 23 |   |',
 '| Taner His X Mark  57 |   |',
 '| Ida  Her X Mark | 19 |   |',
 '| Louisia Her X Mark | 40 | 4 under ten years of age |',
 '| Dallace His X Mark 20 |   |',
 '| Vick His X Mark | 14 |   |  ',
 '| Minnie His X Mark | 55 |   | Rheumatism unable to support himself |',
 '| Mattie Her X Mark | 54 |   |',
 '| Hester Her X Mark | 28 | 6 | under ten years of age |   ',
 'No 198.']

In [23]:
tables_extracted

[['7 column table',
  "No. | NAME. | AGE. | WAGES. DOLLS. | WAGES. CTS. | MONTHLY PAYM'T. DOLLS. | MONTHLY PAYM'T. CTS.",
  '/preprinted'],
 ['5 columned table',
  '| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 ['|NAMES.|AGE.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|',
  '|Frances her x mark|16|none|none.',
  'DEPENDENTS.'],
 ['6 Columned Table',
  '| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|',
  '| Franklin | 21 | Male | 1st | $8.00 |'],
 ['3 columned table',
  '| --- | --- | --- |',
  "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merri

In [24]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

15


In [25]:
# print(notex[7]) #separators less than the previous lines


In [26]:
notex[0]

"[[7 column table]]\nNo. | NAME. | AGE. | WAGES. DOLLS. | [[WAGES.]] CTS. | MONTHLY PAYM'T. DOLLS. | [[MONTHLY PAYM'T.]] CTS.\n[[/preprinted]]\n\n1 | Sarah | 17 | 2 | 50 | 1 | 75 "

In [27]:
tables_extracted

[['7 column table',
  "No. | NAME. | AGE. | WAGES. DOLLS. | WAGES. CTS. | MONTHLY PAYM'T. DOLLS. | MONTHLY PAYM'T. CTS.",
  '/preprinted'],
 ['5 columned table',
  '| No. | NAMES. | AGE. | RATE of PAY per month. Dols. | RATE of PAY per month. Cents. |         ',
  '| --- | --- | --- | --- | --- | '],
 ['|NAMES.|AGE.|MONTHLY RATE OF WAGES.|INTEREST IN PROFITS.|',
  '|Frances her x mark|16|none|none.',
  'DEPENDENTS.'],
 ['6 Columned Table',
  '| NAME | AGE | SEX | CLASS | MONTHLY RATE OF WAGES. | INTEREST IN PROFITS. |',
  '|---|---|---|---|---|---|',
  '| Franklin | 21 | Male | 1st | $8.00 |'],
 ['3 columned table',
  '| --- | --- | --- |',
  "| 1866 June 16th | Approved Contracted between John L. Banks and Merrit Sanders 'fm, for farming this year | J.L.B. is to furnish Lands, teams, Seed, and farming implements. Three (3) Bls of corn, Two Hundred and fifty (250) Lbs of Pork. Leather for two (2) pair of Shoes (Mens). Pay two thirds (2/3) of wages paid to hired hands and give sd. Merri

In [28]:
print(notex[11])

In witness whereof the parties hereto affix their hands and seals on the day and year first above written Asher & Boyed {Seal}
[[7 Columned Table]]
|Names|age|Pay pr Mo-|Pay pr Year|Remarks|
|---|---|---|---|---|---|---|
|Frank his x mark Princeton|32|   |   |$150|00|Began 1st January 1866|
|James his x mark Armstrong|61|   |   |$100|00|Began 30th day January 1866
|Daniel x Johnston|36|   |   |$175|00|Began 5th day of February 1866|


In [29]:
tables_extracted[11]

['In witness whereof the parties hereto affix their hands and seals on the day and year first above written Asher & Boyed {Seal}',
 '7 Columned Table',
 '|Names|age|Pay pr Mo-|Pay pr Year|Remarks|',
 '|---|---|---|---|---|---|---|',
 '|Frank his x mark Princeton|32|   |   |$150|00|Began 1st January 1866|',
 '|James his x mark Armstrong|61|   |   |$100|00|Began 30th day January 1866']

In [30]:
separator_count ="|Frank his x mark Princeton|32|   |   |$150|00|Began 1st January 1866|".count("|")
separator_count

8

In [31]:
tables_extracted[13]

['3 columned table',
 'DEPENDENTS',
 '|NAMES.|AGE.|SEX.|',
 '|---|---|---|',
 '|Westly|7|male|']

In [32]:
nb=0
for example in (tables_extracted):
    if len(example)> 0 :
        nb=nb+1
print(nb)

15


In [33]:
# for i in (contract[contract.tables_parsed == {}].sample(n=2).transcription_text.tolist()):

#     print(i)
#     print("==========")

In [34]:
contract_text = contract.transcription_text.tolist()

lns = []
tables_extracted = []
all_table_pairs = []
idx = 0
sp=[]

for table_text in notex:

    tmp_tables = []
    table_text = table_text.replace("[", "").replace("]", "").replace("^", "")
    
    lines = table_text.split("\n")
    lines = [i for i in lines if len(i.strip()) != 0]
    separator_counts = []
    table_pairs  = []
    
    start = 0
    end = 0
    streak = 0
    lns=[]
#     """i removed lines without separator """
#     for j in lines:
#          if  j.count("|") ==0:
#                 lines.remove(j)
    
    #print(lines , "\n\n")
    
    for line_n, line in enumerate(lines):
        
        separator_count = line.count("|")
        
#         if separator_count == 0:
#             continue
        
        
        
        #print(line ,separator_count, "\n\n")
        
        """i add the condition where the next line contain more and less separators than the previous line """
        
        condition_1 = len(separator_counts) == 0  or separator_count == separator_counts[-1] or separator_count > separator_counts[-1] or separator_count < separator_counts[-1]
        condition_2 = separator_count != 0
        condition_22= len(separator_counts) == 0 
        condition_3 = len(separator_counts) !=0 and separator_count ==0
            
         #If you find the beginning of a table, take note
        if (condition_1 and condition_2) or (condition_22 and condition_3)  :
            if streak == 0 :
                start = line_n
            streak += 1
        elif condition_1 or condition_3 :
            if  streak ==0 :
                print(streak , "milieu \n")
                start = line_n
            streak +=1
        else:
            if streak != 0  :
                table_pairs.append((start, line_n))
                all_table_pairs.append((start, line_n, idx))
            streak = 0
            start = 0
    
        separator_counts.append(separator_count)
        a=separator_count
        
    sp.append(separator_counts)
    
    
    if (streak > 0):
        print(streak , "loota \n")
        table_pairs.append((start, line_n))
        all_table_pairs.append((start, line_n, idx))
       
        
    if len(table_pairs) > 0:
        for i, pair in enumerate(table_pairs):
            
            "remove(-1) : it extrcat more tables , and in the already exist table , there is difference just the description like <3 column table> ... " 
            table = lines[pair[0]:pair[1]]
            
            tmp_tables.append(table)
            
        tables_extracted.append(tmp_tables[0])
        
    else:
        tables_extracted.append([])
     
    
    idx +=1

0 milieu 

4 loota 

0 milieu 

4 loota 

4 loota 

0 milieu 

5 loota 

0 milieu 

7 loota 

0 milieu 

10 loota 

2 loota 

0 milieu 

6 loota 

0 milieu 

14 loota 

0 milieu 

7 loota 

0 milieu 

16 loota 

0 milieu 

7 loota 

0 milieu 

9 loota 

0 milieu 

6 loota 

0 milieu 

17 loota 

