In [1]:
import pandas as pd
import numpy as np
import random
from datasets import load_dataset
from eval_helpers import process
from extraction import entry_extraction_model

def sample(dataset, sample_size=1000, seed=None):
    """Randomly pick whole leads until up to ``sample_size`` excerpts are collected.

    Leads (rows sharing a ``lead_id``) are visited in random order. A lead is
    kept only if all of its excerpts still fit in the remaining budget, so the
    total number of excerpts returned never exceeds ``sample_size``.

    Args:
        dataset: DataFrame with ``lead_id``, ``excerpt``, ``document`` and
            ``project_title`` columns.
        sample_size: Maximum total number of excerpts across the returned leads.
        seed: Optional seed for a private random generator, which makes the
            draw reproducible. When ``None`` (the default) the module-level
            ``random`` state is used.

    Returns:
        A list of ``(excerpts, lead_id, document, project_title)`` tuples, one
        per selected lead. ``excerpts`` is the list of that lead's excerpts;
        the other three fields are read from the lead's first row.
    """
    rng = random if seed is None else random.Random(seed)
    # groupby drops NaN keys, so a NaN lead_id has no group and get_group
    # would raise for it; exclude such ids before shuffling.
    ids = list(dataset["lead_id"].dropna().unique())
    rng.shuffle(ids)
    groups = dataset.groupby("lead_id")
    docs = []
    current_sample = 0
    for _id in ids:
        # Every lead has at least one excerpt, so nothing else can fit once
        # the budget is exhausted.
        if current_sample >= sample_size:
            break
        c = groups.get_group(_id)
        n_excerpts = len(c.excerpt)
        if current_sample + n_excerpts <= sample_size:
            docs.append((c.excerpt.to_list(), c.lead_id.values[0], c.document.values[0],
                         c.project_title.values[0]))
            current_sample += n_excerpts
    return docs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the test split of the HumSet dataset and the general documents export
# as DataFrames.
humset_test_split = load_dataset("nlp-thedeep/humset", split="test")
nlp_dataset = pd.DataFrame(humset_test_split)
general_dataset = pd.read_csv("general_docs.csv")


In [3]:
# Keep only NLP rows whose document is not "Unknown", and only general rows
# that have an excerpt.
has_known_document = nlp_dataset["document"] != "Unknown"
nlp_dataset = nlp_dataset[has_known_document]
has_excerpt = general_dataset["excerpt"].notna()
general_dataset = general_dataset[has_excerpt]

In [56]:
# Number of rows in each dataset.
len(nlp_dataset), len(general_dataset)

(10815, 103788)

In [59]:
# Number of distinct project_id values in the general dataset.
len(general_dataset["project_id"].unique())

65

In [62]:
# Number of distinct project titles in the NLP dataset.
len(nlp_dataset["project_title"].unique())

46

In [4]:
# `sample` shuffles lead ids with the global `random` state. Seed it so the
# same leads are drawn on every Restart & Run All.
SAMPLE_SEED = 42
random.seed(SAMPLE_SEED)

# Draw whole leads from each dataset, up to sample()'s default excerpt budget.
nlp_docs = sample(nlp_dataset)
general_docs = sample(general_dataset)

In [5]:
# Inspect one sampled lead: (excerpts, lead_id, document, project_title).
nlp_docs[1]

(['[26th Jul - 1st Aug, 2021, Nigeria] Between 26 July and 01 August 2021, a total of 1,940 movements were recorded in the states of Adamawa and Borno. The recorded movements consisted of 1,419 arrivals and 521 departures. Arrivals were recorded at locations in Askira/Uba, Bama, Damboa, Gwoza, and Maiduguri Metropolitan Council Local Government Areas (LGAs) of the most conflict-affected state of Borno and in Fufore, Gombi, Hong, Lamurde, Madagali, Maiha, Michika, Mubi North, Song, Yola North, and Yola South LGAs of Adamawa.'],
 '62156',
 'https://reliefweb.int/sites/reliefweb.int/files/resources/IOM%20Nigeria%20DTM%20Emergency%20Tracking%20Tool%20%28ETT%29%20Report%20No.234%20%2826%20July%20%20-%2001%20August%202021%29.pdf',
 'IMMAP/DFS Nigeria')

In [6]:
### Assumption: the excerpts of one lead together make up a document.
### Each lead is passed to the model as a single page, for fast batch prediction.
nlp_pages = [lead[0] for lead in nlp_docs]
general_pages = [lead[0] for lead in general_docs]
predictions_nlp = entry_extraction_model.predict(nlp_pages)
predictions_general = entry_extraction_model.predict(general_pages)

100%|██████████| 1000/1000 [01:05<00:00, 15.31it/s]
100%|██████████| 1000/1000 [01:03<00:00, 15.79it/s]


In [7]:
# Look at the first predicted block to see which fields the model returns.
predictions_nlp["blocks"][0]

{'type': 'text',
 'page': 0,
 'text': 'Aucun cas de décès n’a été signalé. En revanche, 25 nouvelles personnes sont sorties guéries des CTCo et chez les patients suivis à domicile, dont 15 à Kinshasa, 8 au Nord-Kivu et 2 dans le Haut-Uélé.',
 'textOrder': 0,
 'relevant': False}

In [8]:
# Group the model's per-block predictions into one record per page.
# A block's "page" is used as the index of its lead in nlp_docs, so each
# record is (texts, predictions, lead_id, document, project_title), where
# predictions holds 1 for a block flagged relevant and 0 otherwise.
#
# Records are keyed by page number rather than by list position, so a page
# with no blocks, or blocks arriving out of page order, cannot attach a
# prediction to the wrong lead or raise an IndexError.
relevant_by_page = {}

for block in predictions_nlp["blocks"]:
    page = block["page"]
    if page not in relevant_by_page:
        _, lead_id, lead_url, project_title = nlp_docs[page]
        relevant_by_page[page] = ([], [], lead_id, lead_url, project_title)
    texts, flags = relevant_by_page[page][:2]
    texts.append(block["text"])
    # A block without a "relevant" key counts as not relevant.
    flags.append(1 if block.get("relevant", False) else 0)

# One record per page that produced at least one block, in page order.
nlp_relevant = [relevant_by_page[page] for page in sorted(relevant_by_page)]


In [9]:
# First grouped record: (texts, predictions, lead_id, document, project_title).
nlp_relevant[0]

(['Aucun cas de décès n’a été signalé. En revanche, 25 nouvelles personnes sont sorties guéries des CTCo et chez les patients suivis à domicile, dont 15 à Kinshasa, 8 au Nord-Kivu et 2 dans le Haut-Uélé.'],
 [0],
 '43852',
 'https://actualite.cd/2020/11/19/rdccovid-19-la-maladie-reprend-de-plus-en-plus-de-terrain-90-nouveaux-cas-positifs',
 'IMMAP/DFS RDC')

In [11]:
# One row per lead. Every block is given the label 1, so "labels" is a list
# of ones with the same length as "predictions".
positive_rows = [
    [texts, preds, [1] * len(preds), lead_id, document, project_title]
    for texts, preds, lead_id, document, project_title in nlp_relevant
]
positives = pd.DataFrame(
    positive_rows,
    columns=["excerpts", "predictions", "labels", "lead_id", "document", "project_title"],
)


In [12]:
from sklearn.metrics import accuracy_score

In [13]:
# Per-lead accuracy of the predictions against the labels.
# The columns are read by name: positional integer lookup (x[0], x[1]) on a
# label-indexed row is deprecated in pandas. The list form also works when
# `positives` is empty.
positives["accuracy"] = [
    accuracy_score(labels, predictions)
    for predictions, labels in zip(positives["predictions"], positives["labels"])
]

In [14]:
# The sampled lead at index 0, for comparison with nlp_relevant[0].
nlp_docs[0]

(['Aucun cas de décès n’a été signalé. En revanche, 25 nouvelles personnes sont sorties guéries des CTCo et chez les patients suivis à domicile, dont 15 à Kinshasa, 8 au Nord-Kivu et 2 dans le Haut-Uélé.'],
 '43852',
 'https://actualite.cd/2020/11/19/rdccovid-19-la-maladie-reprend-de-plus-en-plus-de-terrain-90-nouveaux-cas-positifs',
 'IMMAP/DFS RDC')

In [20]:
# Show the per-lead excerpts, predictions, labels and accuracy.
positives

Unnamed: 0,excerpts,predictions,labels,lead_id,document,project_title,accuracy
0,[Aucun cas de décès n’a été signalé. En revanc...,[0],[1],43852,https://actualite.cd/2020/11/19/rdccovid-19-la...,IMMAP/DFS RDC,0.0
1,"[[26th Jul - 1st Aug, 2021, Nigeria] Between 2...",[0],[1],62156,https://reliefweb.int/sites/reliefweb.int/file...,IMMAP/DFS Nigeria,0.0
2,"[[8th - 14th August 2021, Cox's Bazar] Both Ho...","[0, 0, 0]","[1, 1, 1]",63388,https://mcusercontent.com/c520ee8fbad80f8ae804...,IMMAP/DFS Bangladesh,0.0
3,"[[19th - 25th Apr 2021, Borno State] 711 susp...",[1],[1],55749,https://www.humanitarianresponse.info/sites/ww...,IMMAP/DFS Nigeria,1.0
4,[Prácticas de tortura dentro de guarniciones m...,[0],[1],56133,http://www.indepaz.org.co/wp-content/uploads/2...,IMMAP/DFS Colombia,0.0
...,...,...,...,...,...,...,...
507,[L’agriculture et l’élevage − principaux moyen...,[0],[1],51991,http://www.fao.org/emergencies/la-fao-en-actio...,IMMAP/DFS Burkina Faso,0.0
508,[Recommendation:Target rural people with messa...,"[0, 0]","[1, 1]",36060,https://reliefweb.int/sites/reliefweb.int/file...,IMMAP/DFS Syria,0.0
509,"[Al 6 de septiembre de 2020, el Ministerio de ...",[0],[1],39594,https://reliefweb.int/sites/reliefweb.int/file...,UNHCR Guatemala,0.0
510,[SUMMARY OF OUTBREAK A Yellow Fever outbreak h...,[1],[1],9552,https://reliefweb.int/sites/reliefweb.int/file...,Nigeria Situation Analysis (OA),1.0


In [25]:
# Run `process` on the document of every lead in `positives`.
# Best effort: a lead whose processing raises is reported and skipped, so the
# loop continues with the remaining leads.
full_documents = []
# lead_id of each entry in full_documents, in the same order. Failed leads are
# skipped, so full_documents[i] does NOT line up with positives.iloc[i]; use
# this list to map a processed document back to its lead.
full_document_lead_ids = []
for document, lead_id in zip(positives["document"], positives["lead_id"]):
    try:
        processed = process(document, lead_id)
    except Exception as e:
        print(f"Error {e}: {lead_id}, {document}")
        continue
    full_documents.append(processed)
    full_document_lead_ids.append(lead_id)

Error 'Timed Out'. lead-id: 43852
Error 'Timed Out': 43852, https://actualite.cd/2020/11/19/rdccovid-19-la-maladie-reprend-de-plus-en-plus-de-terrain-90-nouveaux-cas-positifs
https://reliefweb.int/sites/reliefweb.int/files/resources/IOM%20Nigeria%20DTM%20Emergency%20Tracking%20Tool%20%28ETT%29%20Report%20No.234%20%2826%20July%20%20-%2001%20August%202021%29.pdf is a PDF!
lead-id: 62156, type: pdf. correctly processed
https://mcusercontent.com/c520ee8fbad80f8ae80410aa9/files/e54fab69-e368-1373-4e03-77bf6964815d/CXB_Weekly_report_21.32_8_14_Aug_21_Final.pdf is a PDF!


Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-6831446, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut-7754387217785973167: Function inside (args=([Rect(20.520000457763672, 21.47998046875, 578.8800048828125, 21.9599609375), Rect(20.520000457763672, 804.0, 578.8800048828125, 804.47998046875), Rect(20.520000457763672, 130.5799560546875, 578.8800048828125, 144.8599853515625), Rect(20.520000457763672, 130.0999755859375, 578.8800048828125, 130.5799560546875), Rect(20.520000457763672, 144.8599853515625, 578.8800048828125, 158.17999267578125), Rect(20.520000457763672, 158.179931640625, 578.8800048828125, 172.4599609375), Rect(20.520000457763672, 172.46002197265625, 578.8800048828125, 172.94000244140625), Rect(140.3000030517578, 444.9100036621094, 182.54000854492188, 445.5099792480469),

lead-id: 63388, type: pdf. correctly processed
https://www.humanitarianresponse.info/sites/www.humanitarianresponse.info/files/documents/files/borno_state_measles_outbreak_weekly_sitrep_wk_16_2021.pdf is a PDF!
lead-id: 55749, type: pdf. correctly processed
http://www.indepaz.org.co/wp-content/uploads/2021/05/INFORME-CIDH-VIOLENCIA-POLICIAL-PROTESTA-SOCIAL.pdf is a PDF!
lead-id: 56133, type: pdf. correctly processed
https://fscluster.org/sites/default/files/documents/fss_monthly_dashboards_february_2021.pdf is a PDF!
lead-id: 53581, type: pdf. correctly processed
lead-id: 37817, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/operational_sitrep_covid-19_15_july_2020.pdf is a PDF!
lead-id: 44666, type: pdf. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/ETT-ITURI_Rapport_RULE_Rapport%20068.pdf is a PDF!
lead-id: 63084, type: pdf. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/76740

Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-7575465, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut-5869887866294569410: Function inside (args=([Rect(36.47999954223633, 399.7699890136719, 149.77999877929688, 410.2099914550781), Rect(40.91999816894531, 400.489990234375, 145.33999633789062, 410.2099914550781), Rect(150.25999450683594, 399.7699890136719, 190.6999969482422, 410.2099914550781), Rect(154.6999969482422, 400.489990234375, 186.25999450683594, 410.2099914550781), Rect(191.17999267578125, 399.7699890136719, 314.67999267578125, 410.2099914550781), Rect(195.64999389648438, 400.489990234375, 310.25, 410.2099914550781), Rect(36.0, 399.28997802734375, 36.47999954223633, 400.489990234375), Rect(36.0, 399.28997802734375, 36.47999954223633, 399.7699890136719), Rect(36.479999542

lead-id: 45405, type: pdf. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/Northeast%20Nigeria%20-%20Displacement%20Report%2035%20%28December%202020%29.pdf is a PDF!


Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-10662377, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut-5727213069286055979: Function _search_tables (args=([Rect(299.0989990234375, 160.96502685546875, 299.12298583984375, 162.63201904296875), Rect(299.0719909667969, 240.989501953125, 299.1499938964844, 242.65650939941406), Rect(299.0459899902344, 321.0140380859375, 299.17498779296875, 322.6810302734375), Rect(299.0940246582031, 174.302001953125, 299.12701416015625, 175.968994140625), Rect(299.0711975097656, 244.32379150390625, 299.15020751953125, 245.9907989501953), Rect(299.02398681640625, 497.73541259765625, 299.1969909667969, 499.40240478515625), Rect(299.05560302734375, 291.00494384765625, 299.16558837890625, 292.67193603515625), Rect(299.0325012207031, 361.0265197753906, 299

lead-id: 50967, type: pdf. correctly processed
http://www.insd.bf/contenu/autres_publications/Journal/Journal%20Burkinabe%20de%20la%20Statistique.pdf is a PDF!
Error cannot open broken document, lead-id: 53800
http://caritasvenezuela.org/wp-content/uploads/2019/02/9no-Boletin-SAMAN-Caritas-Venezuela-Oct-Diciembre-2018.pdf is a PDF!
Error cannot open broken document, lead-id: 18994
https://www.humanitarianresponse.info/sites/www.humanitarianresponse.info/files/documents/files/rapport_5mois-covid-19_final_rdc_who23082020.pdf is a PDF!
lead-id: 40392, type: pdf. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/0120_ESPA%C3%91OL_4RONDA_IFN_REV.pdf is a PDF!
lead-id: 47230, type: pdf. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/West%20and%20Central%20Africa%2C%20Displacement%20Tracking%20Matrix%20%28DTM%29%20Monthly%20Regional%20Update%20-%20May%202020.pdf is a PDF!
lead-id: 32283, type: pdf. correctly processed
https://reliefwe

Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-12401907, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut9179035561755237476: Function remove_near_rect (args=([Rect(262.010009765625, 288.530029296875, 262.9700012207031, 289.489990234375), Rect(328.6300048828125, 200.179931640625, 556.2000122070312, 223.4599609375), Rect(262.9700012207031, 352.9700012207031, 328.15399169921875, 353.92999267578125), Rect(490.20001220703125, 397.0059814453125, 491.1600036621094, 417.54998779296875), Rect(114.86000061035156, 397.0059814453125, 115.81999969482422, 417.54998779296875), Rect(188.4499969482422, 417.54998779296875, 189.41000366210938, 418.5099792480469), Rect(115.81999969482422, 352.9700012207031, 188.44400024414062, 353.92999267578125), Rect(262.010009765625, 353.92999267578125, 262.97000

lead-id: 58755, type: pdf. correctly processed
lead-id: 19961, type: website. correctly processed
https://www.sismamujer.org/wp-content/uploads/2021/07/La-Paz-Avanza-con-las-Mujeres-1.pdf is a PDF!
Error cannot open broken document, lead-id: 60993
lead-id: 53691, type: website. correctly processed
Error HTTPSConnectionPool(host='www.sante.gov.bf', port=443): Max retries exceeded with url: /detail?tx_news_pi1%5Baction%5D=detail&tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5Bnews%5D=567&cHash=88285f89ec7b90b4a34d425f758ce557 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f4f9f18fbe0>, 'Connection to www.sante.gov.bf timed out. (connect timeout=5)')). lead-id: 54807
Error HTTPSConnectionPool(host='www.sante.gov.bf', port=443): Max retries exceeded with url: /detail?tx_news_pi1%5Baction%5D=detail&tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5Bnews%5D=567&cHash=88285f89ec7b90b4a34d425f758ce557 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnectio

Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-15080166, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut2567405344937968798: Function inside (args=([Rect(45.35430145263672, 0.0, 259.0022888183594, 0.0), Rect(0.0, 600.7041015625, 595.276, 600.7041015625), Rect(0.0, 565.8043212890625, 0.0, 590.41455078125), Rect(0.0, 609.0656127929688, 0.0, 633.5986328125), Rect(0.0, 609.0656127929688, 0.0, 633.5986328125), Rect(0.0, 609.0656127929688, 0.0, 633.5986328125), Rect(0.0, 565.8043212890625, 0.0, 590.41455078125), Rect(0.0, 565.8043212890625, 0.0, 590.41455078125), Rect(0.0, 583.3025512695312, 0.0, 620.3603515625), Rect(0.0, 581.546875, 0.0, 622.1152954101562), Rect(363.7275085449219, 581.5473022460938, 403.9740905761719, 622.11572265625), Rect(360.2447814941406, 578.0361328125, 407.4568

lead-id: 44901, type: pdf. correctly processed
lead-id: 54230, type: website. correctly processed
https://www.humanitarianresponse.info/sites/www.humanitarianresponse.info/files/documents/files/south_sudan_humanitarian_snapshot_january.pdf is a PDF!
lead-id: 51260, type: pdf. correctly processed
lead-id: 49868, type: website. correctly processed
ContentType error, lead-id: 54633
lead-id: 54633, type: pdf. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/LCBC_Monthly_Dashboard_March_2019_v1_0.pdf is a PDF!
lead-id: 10826, type: pdf. correctly processed
lead-id: 21535, type: website. correctly processed
lead-id: 45293, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/health_sector_bulletin_december_2019.pdf is a PDF!
lead-id: 23290, type: pdf. correctly processed
lead-id: 58337, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/2201_-_multiple_afectacion_en_narino_y_putumay

Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-30383443, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut-8219711711207738804: Function inside (args=([Rect(0.0, 555.719970703125, 596.4000244140625, 589.6799926757812), Rect(72.4800033569336, 72.5, 719.6199951171875, 74.41998291015625), Rect(72.4800033569336, 74.41998291015625, 77.63999938964844, 83.65997314453125), Rect(714.4600219726562, 74.41998291015625, 719.6199951171875, 83.65997314453125), Rect(72.4800033569336, 83.6600341796875, 719.6199951171875, 85.58001708984375), Rect(77.63999938964844, 74.41998291015625, 714.4600219726562, 83.65997314453125), Rect(72.0, 72.02001953125, 72.4800033569336, 72.5), Rect(72.0, 72.02001953125, 72.4800033569336, 72.5), Rect(72.4800033569336, 72.02001953125, 719.6199951171875, 72.5), Rect(719.61

lead-id: 44685, type: pdf. correctly processed
lead-id: 39362, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/Windstorm%20and%20Rainfall%20Damages%20to%20IDP%20Sites%20Flash%20Report%204%20July%202020%20.pdf is a PDF!
lead-id: 32900, type: pdf. correctly processed
Error list index out of range. lead-id: 19708
Error list index out of range: 19708, https://informe21.com/economia/matta-ningun-aumento-solucionara-la-crisis-economica-del-pais
https://reliefweb.int/sites/reliefweb.int/files/resources/ocha_nga_yobestate_weekly_sitrep_11062021.pdf is a PDF!
lead-id: 58129, type: pdf. correctly processed
lead-id: 39294, type: website. correctly processed
lead-id: 47162, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/69691.pdf is a PDF!
lead-id: 13773, type: pdf. correctly processed
lead-id: 44217, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/R%C3%A9publique

Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-34400786, initial daemon)>>
Traceback (most recent call last):
  File "/home/mogady/Desktop/core-server-infra/.conda/lib/python3.8/threading.py", line 889, in _bootstrap
    try:
func_timeout.dafunc.FunctionTimedOut4922209325661409836: Function inside (args=([Rect(39.02299880981445, 27.43798828125, 44.263999938964844, 33.27301025390625), Rect(45.066001892089844, 27.43798828125, 50.307003021240234, 33.27301025390625), Rect(51.106998443603516, 27.43798828125, 56.347999572753906, 33.27301025390625), Rect(57.150001525878906, 27.43798828125, 62.3910026550293, 33.27301025390625), Rect(36.03799819946289, 20.697998046875, 41.27899932861328, 26.53302001953125), Rect(42.0620002746582, 20.697998046875, 47.303001403808594, 26.53302001953125), Rect(48.08700180053711, 20.697998046875, 53.3280029296875, 26.53302001953125), Rect(54.111000061035156, 20.697998046875, 59.35200119018555, 26.53302001953125), 

lead-id: 51962, type: pdf. correctly processed
lead-id: 40509, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/SP-RCRC-Global-Migration-Lab-Locked-down-left-out-COVID19.pdf is a PDF!
lead-id: 51524, type: pdf. correctly processed
lead-id: 45859, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/72195.pdf is a PDF!
lead-id: 20778, type: pdf. correctly processed
lead-id: 41737, type: website. correctly processed
https://reliefweb.int/sites/reliefweb.int/files/resources/77380.pdf is a PDF!
Error local variable '_page' referenced before assignment, lead-id: 32527
Error 'Timed Out'. lead-id: 8716
Error 'Timed Out': 8716, https://www.aljazeera.com/news/2019/01/venezuela-simple-guide-understanding-current-crisis-190130111707920.html
https://reliefweb.int/sites/reliefweb.int/files/resources/informe%20d%20dengue%20SE-29.pdf is a PDF!
lead-id: 16859, type: pdf. correctly processed
https://reliefweb.int/sites/

KeyboardInterrupt: 

In [26]:
# Number of documents for which `process` returned without raising.
len(full_documents)

122

In [30]:
# Show the per-lead excerpts, predictions, labels and accuracy.
positives

Unnamed: 0,excerpts,predictions,labels,lead_id,document,project_title,accuracy
0,[Aucun cas de décès n’a été signalé. En revanc...,[0],[1],43852,https://actualite.cd/2020/11/19/rdccovid-19-la...,IMMAP/DFS RDC,0.0
1,"[[26th Jul - 1st Aug, 2021, Nigeria] Between 2...",[0],[1],62156,https://reliefweb.int/sites/reliefweb.int/file...,IMMAP/DFS Nigeria,0.0
2,"[[8th - 14th August 2021, Cox's Bazar] Both Ho...","[0, 0, 0]","[1, 1, 1]",63388,https://mcusercontent.com/c520ee8fbad80f8ae804...,IMMAP/DFS Bangladesh,0.0
3,"[[19th - 25th Apr 2021, Borno State] 711 susp...",[1],[1],55749,https://www.humanitarianresponse.info/sites/ww...,IMMAP/DFS Nigeria,1.0
4,[Prácticas de tortura dentro de guarniciones m...,[0],[1],56133,http://www.indepaz.org.co/wp-content/uploads/2...,IMMAP/DFS Colombia,0.0
...,...,...,...,...,...,...,...
507,[L’agriculture et l’élevage − principaux moyen...,[0],[1],51991,http://www.fao.org/emergencies/la-fao-en-actio...,IMMAP/DFS Burkina Faso,0.0
508,[Recommendation:Target rural people with messa...,"[0, 0]","[1, 1]",36060,https://reliefweb.int/sites/reliefweb.int/file...,IMMAP/DFS Syria,0.0
509,"[Al 6 de septiembre de 2020, el Ministerio de ...",[0],[1],39594,https://reliefweb.int/sites/reliefweb.int/file...,UNHCR Guatemala,0.0
510,[SUMMARY OF OUTBREAK A Yellow Fever outbreak h...,[1],[1],9552,https://reliefweb.int/sites/reliefweb.int/file...,Nigeria Situation Analysis (OA),1.0


In [55]:
# Print the first element of the first processed document.
first_processed = full_documents[0]
print(first_processed[0])

[['AN ERROR OCCURS PROCESSING THIS PAGE'], ['AN ERROR OCCURS PROCESSING THIS PAGE']]


In [51]:
# Excerpts of the lead at row position 5 of `positives`.
positives.iloc[5]["excerpts"]

['[February 2021, Bangladesh]FUNDING STATUS Total requirement: US$ 247.2m 2%99% Total requirement funded Gap (total requirement minus US$ funded)']