Summary:
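In this notebook, we use the IBM Watson Natural Language Understanding (NLU) API to run entity recognition on the sentences of the gold-standard and noisy datasets.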

## 1. Import data

In [1]:
import pandas as pd

In [2]:
# Load the two sentence-level datasets used throughout this notebook.
# Both frames expose a `sentences` column, which is what gets sent to the API.
gold_standard = pd.read_csv(
    "wri/policy-toolkit-master/data/processed/gold_standard.csv"
)
noisy = pd.read_csv(
    "wri/policy-toolkit-master/data/processed/noisy.csv"
)


In [13]:
# Preview the first rows of the gold-standard frame.
gold_standard.head()

Unnamed: 0,label,sentences,class
0,0,the purpose of the middle level institutions i...,1.0
1,1,"to address these challenges, the government wi...",1.0
2,2,lack of a database and information to support ...,3.0
3,3,to contribute to efforts to reduce poverty and...,3.0
4,4,secretariat. its terms of reference will inclu...,1.0


In [16]:
# (rows, columns) of the gold-standard frame.
gold_standard.shape

(1033, 3)

In [14]:
# Preview the first rows of the noisy frame.
noisy.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,sentences,name,country,page,languages
0,0,3,part ii farm and 6. maintenance of 1 0 per c...,../data/raw/kenya/results/Agriculture Rules/1.txt,Kenya,1,en
1,1,4,1 0. harvesting. 1 1. farm forestry compensati...,../data/raw/kenya/results/Agriculture Rules/1.txt,Kenya,1,en
2,2,21,these rules may be cited as the agriculture fa...,../data/raw/kenya/results/Agriculture Rules/2.txt,Kenya,2,en
3,3,31,these rules shall apply for the purposes of pr...,../data/raw/kenya/results/Agriculture Rules/2.txt,Kenya,2,en
4,4,16,"in these rules, unless the context otherwise r...",../data/raw/kenya/results/Agriculture Rules/2.txt,Kenya,2,en


In [36]:
# (rows, columns) of the noisy frame.
noisy.shape

(16146, 7)

## 2. An example: entity recognition on a single sentence

In [46]:
# IBM Watson NLU (Natural Language Understanding): entity extraction on one sentence.
import json
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import Features, EntitiesOptions

# Example input: the first sentence of the gold-standard frame loaded above.
txt = gold_standard.sentences[0]

# After registering on IBM Cloud, we can get a free API key and service URL.
# The values below are redacted placeholders; substitute your own before running.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    iam_apikey='******',
    url='******'
)

# Request only the entities feature (per-entity sentiment off, limit=20).
response = natural_language_understanding.analyze(
    text=txt,
    features=Features(entities=EntitiesOptions(sentiment=False, limit=20))).get_result()

# Pretty-print the raw response so its structure (usage / language / entities) is visible.
print(json.dumps(response, indent=2))

{
  "usage": {
    "text_units": 1,
    "text_characters": 664,
    "features": 1
  },
  "language": "en",
  "entities": [
    {
      "type": "Organization",
      "text": "ministerial coordination committee",
      "relevance": 0.958279,
      "count": 1,
      "confidence": 0.535767
    },
    {
      "type": "Organization",
      "text": "asds",
      "relevance": 0.893874,
      "disambiguation": {
        "name": "American_Society_of_Dental_Surgeons",
        "dbpedia_resource": "http://dbpedia.org/resource/American_Society_of_Dental_Surgeons"
      },
      "count": 1,
      "confidence": 0.691308
    },
    {
      "type": "Organization",
      "text": "icc",
      "relevance": 0.786463,
      "count": 2,
      "confidence": 0.997123
    }
  ]
}


## 3. Use the IBM API to do entity recognition on the gold_standard and noisy datasets

In [22]:
# Send every gold-standard sentence to Watson NLU and collect one row per
# detected entity: [sentence position, type, text, relevance, confidence].
NER_results_gold = []
entity_features = Features(entities=EntitiesOptions(sentiment=False, limit=20))
for position, sentence in enumerate(gold_standard.sentences):
    analysis = natural_language_understanding.analyze(
        text=sentence,
        features=entity_features,
    ).get_result()
    # A sentence with no detected entities simply contributes no rows.
    for entity in analysis['entities']:
        NER_results_gold.append([
            position,
            entity['type'],
            entity['text'],
            entity['relevance'],
            entity['confidence'],
        ])

In [24]:
# Peek at the first five collected entity rows.
NER_results_gold[:5]

[[0, 'Organization', 'ministerial coordination committee', 0.958279, 0.535767],
 [0, 'Organization', 'asds', 0.893874, 0.691308],
 [0, 'Organization', 'icc', 0.786463, 0.997123],
 [1, 'Organization', 'government', 0.978348, 0.685561],
 [6, 'Person', 'kenyas', 0.963296, 0.399072]]

In [26]:
# Tabulate the collected gold-standard entity rows; `index` holds the position
# of the source sentence within gold_standard.sentences.
NER_results_gold_file = pd.DataFrame(
    data=NER_results_gold,
    columns=['index', 'type', 'text', 'relevance', 'confidence'],
)

In [None]:
# Send every noisy sentence to Watson NLU and collect one row per detected
# entity: [sentence position, type, text, relevance].
# Note: unlike the gold-standard loop, the entity confidence is not recorded here.
NER_results_noisy = []
entity_features = Features(entities=EntitiesOptions(sentiment=False, limit=20))
for position, sentence in enumerate(noisy.sentences):
    analysis = natural_language_understanding.analyze(
        text=sentence,
        features=entity_features,
    ).get_result()
    # A sentence with no detected entities simply contributes no rows.
    for entity in analysis['entities']:
        NER_results_noisy.append([
            position,
            entity['type'],
            entity['text'],
            entity['relevance'],
        ])

In [49]:
# Tabulate the collected noisy-dataset entity rows; `index` holds the position
# of the source sentence within noisy.sentences.
NER_results_noisy_file = pd.DataFrame(
    data=NER_results_noisy,
    columns=['index', 'type', 'text', 'relevance'],
)

In [50]:
# Persist both entity tables as CSV; index=False keeps the DataFrame row
# labels out of the written files.
NER_results_gold_file.to_csv(
    'yg2619/policy-toolkit/columbia_codes/data/NER_results_gold_file.csv',
    index=False,
)
NER_results_noisy_file.to_csv(
    'yg2619/policy-toolkit/columbia_codes/data/NER_results_noisy_file.csv',
    index=False,
)

In [28]:
# Show the first ten entity rows extracted from the gold-standard sentences.
NER_results_gold_file.head(10)

Unnamed: 0,index,type,text,relevance,confidence
0,0,Organization,ministerial coordination committee,0.958279,0.535767
1,0,Organization,asds,0.893874,0.691308
2,0,Organization,icc,0.786463,0.997123
3,1,Organization,government,0.978348,0.685561
4,6,Person,kenyas,0.963296,0.399072
5,6,Organization,government,0.491032,0.948379
6,7,Organization,rural electrification authority,0.978348,0.630186
7,8,Organization,ministry of youth and sports,0.958279,0.742694
8,8,Organization,youth enterprise development fund,0.48552,0.489316
9,8,Organization,constituency development fund,0.348411,0.309545


In [51]:
# Show the first ten entity rows extracted from the noisy sentences.
NER_results_noisy_file.head(10)

Unnamed: 0,index,type,text,relevance
0,0,Facility,part ii farm,0.978348
1,4,Organization,district agricultural committee,0.978348
2,8,Organization,district agricultural committee,0.9999
3,9,Organization,district agricultural committee,0.978348
4,11,Organization,district agricultural committee,0.978348
5,11,Organization,district environment committee,0.869258
6,12,Organization,district agricultural committee,0.978348
7,13,Quantity,thirty 3 0 days,0.963296
8,13,Organization,district agricultural committee,0.422088
9,14,Organization,district agricultural committee,0.978348
