Read from a local copy of the ClaimReview Database

### Installation script
```
conda create -n ner python=3.6
source activate ner
conda install -c conda-forge spacy -y
conda install ipython jupyter nb_conda nltk numpy -y
conda install -c conda-forge rake_nltk -y
python -m spacy download en
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_lg
```

In [3]:
import spacy
# Load the small English pipeline by its full package name. The bare 'en'
# shortcut link is no longer resolved by spaCy v3+, and the verb-extraction
# cell further down already loads 'en_core_web_sm' explicitly.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Show every named entity with its character offsets and entity label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [14]:
import spacy
from spacy.symbols import nsubj, VERB

# Parse one sample sentence with the small English model.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")

# Walk the parse "from below": every token that is a nominal subject points at
# its head, and the head is kept when it is a verb.
verbs = {
    token.head
    for token in doc
    if token.dep == nsubj and token.head.pos == VERB
}
print(verbs)

{shift}


In [15]:
fc_path = "fact_checks_20180502.txt"

# Decode explicitly as UTF-8: the claims contain non-ASCII characters (curly
# quotes, apostrophes), and relying on the platform default encoding makes the
# read fail or garble text on systems whose locale is not UTF-8.
with open(fc_path, encoding="utf-8") as f:
    # One list entry per line of the dump, trailing newline included.
    fc_raw = f.readlines()

In [16]:
# Report how many lines were read from the fact-check dump.
claim_count = len(fc_raw)
print("No. of Claims:", claim_count)

No. of Claims: 8902


Functions to normalise the text

In [17]:
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Module-level lemmatizer instance; lemmatize_sentence() calls its
# .lemmatize(word, tag) method for every word that has a WordNet POS tag.
lemmatizer = WordNetLemmatizer()

def strip_html(data):
    """Return ``data`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. ``.`` does not match a newline here, so a tag that
    spans several lines is left in place.
    """
    return re.sub(r'<.*?>', '', data)

def clean_text(data):
    """Lower-case ``data``, keep only letters, and drop English stopwords.

    Punctuation and every kind of whitespace (spaces, newlines, tabs) act as
    word separators. Digits and underscores are removed without splitting the
    word they occur in.

    Args:
        data: Raw text to clean.

    Returns:
        The surviving words joined by single spaces, or an empty string when
        nothing survives the filtering.
    """
    # Punctuation becomes a separator, so "well-known" yields two words.
    text = re.sub(r'[^\w\s]', ' ', data.lower())
    # Keep letters, turn each whitespace character into a plain space and drop
    # everything else. Whitespace must separate words rather than be deleted,
    # otherwise the last word of one line fuses with the first word of the next.
    text = "".join(
        c if c.isalpha() else " "
        for c in text
        if c.isalpha() or c.isspace()
    )
    # Fetch the stopword list once per call instead of once per word.
    stop_words = set(stopwords.words("english"))
    # split() without arguments collapses whitespace runs, so no empty tokens
    # and no leftover double spaces in the result.
    return " ".join(word for word in text.split() if word not in stop_words)

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching ``wordnet`` POS constant.

    Only the first character of the tag is inspected:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Returns:
        The ``wordnet`` constant, or ``None`` for any other tag.
    """
    # First letter of the tag -> name of the attribute on ``wordnet``.
    attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_initial.get(nltk_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)
    
def lemmatize_sentence(sentence):
    """Lemmatize every word of ``sentence`` and return the re-joined text.

    The sentence is tokenized and POS-tagged with NLTK. Each tag is mapped
    through ``nltk2wn_tag``; words whose tag has a WordNet counterpart are
    lemmatized with that POS, all other words are kept unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wn_pos = nltk2wn_tag(pos_tag)
        # No WordNet POS available: pass the word through untouched.
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

def norm_text(data):
    """Normalise raw text in three steps.

    The steps run in this order: ``strip_html`` removes markup,
    ``clean_text`` lower-cases and filters the words, and
    ``lemmatize_sentence`` reduces the remaining words to their lemmas.

    Returns:
        The normalised string.
    """
    return lemmatize_sentence(clean_text(strip_html(data)))

In [22]:
from rake_nltk import Rake

# With no arguments, Rake uses NLTK's English stopwords and all punctuation
# characters by default.
r = Rake()

# Run keyword extraction on a single example claim.
sample_claim = "In August 2018, adhesive replicas of President Donald Trump's recently-destroyed star were placed on the Walk of Fame on Hollywood Boulevard."
r.extract_keywords_from_text(sample_claim)

# Alternative entry point for text that is already split into sentences:
# r.extract_keywords_from_sentences(<list of sentences>)

# Keyword phrases only, ranked highest to lowest.
ranked_phrases = r.get_ranked_phrases()
print(ranked_phrases)

# The same ranking as (score, phrase) pairs.
scored_phrases = r.get_ranked_phrases_with_scores()
print(scored_phrases)

['president donald trump', 'hollywood boulevard', 'destroyed star', 'august 2018', 'adhesive replicas', 'walk', 'recently', 'placed', 'fame']
[(9.0, 'president donald trump'), (4.0, 'hollywood boulevard'), (4.0, 'destroyed star'), (4.0, 'august 2018'), (4.0, 'adhesive replicas'), (1.0, 'walk'), (1.0, 'recently'), (1.0, 'placed'), (1.0, 'fame')]


In [19]:
import numpy as np
# Number of consecutive claims to inspect per run.
SAMPLE_SIZE = 200

# Pick a random window start. The bound is clamped so that a dump with
# SAMPLE_SIZE lines or fewer starts at 0 instead of making randint raise
# ValueError on a non-positive upper bound; the "+ 1" lets the window reach
# the final line of the dump.
random_point = np.random.randint(max(len(fc_raw) - SAMPLE_SIZE, 0) + 1)
print(random_point)

for fc_line in fc_raw[random_point:random_point + SAMPLE_SIZE]:
    # Each line is a JSON record that may still carry its <script> wrapper;
    # remove the wrapper so the remainder parses as plain JSON.
    fc_json = (
        fc_line.strip("\n")
        .replace("</script>", "")
        .replace('<script type="application/ld+json">', "")
    )
    if not fc_json.strip():
        # A blank line holds no record and would make json.loads fail.
        continue
    fc = json.loads(fc_json)
    claim = fc["claimReviewed"]
#     date_published = fc["datePublished"]
    review_url = fc["url"]  # read but not used further in this cell

    print("Claim:", claim)

    # Keyword phrases with their RAKE scores.
    r.extract_keywords_from_text(claim)
    print(r.get_ranked_phrases_with_scores())

    # Named entities with character offsets and labels.
    doc = nlp(claim)
    print([(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents])

    # Verbs that are the head of a nominal subject.
    verbs = {
        token.head
        for token in doc
        if token.dep == nsubj and token.head.pos == VERB
    }
    print(verbs)
    print()

3006
Claim: Floyd Mayweather and Conor McGregor have agreed to fight a rematch, to be fought under modified mixed martial arts rules.
[(25.0, 'modified mixed martial arts rules'), (4.0, 'floyd mayweather'), (4.0, 'conor mcgregor'), (1.0, 'rematch'), (1.0, 'fought'), (1.0, 'fight'), (1.0, 'agreed')]
[('Floyd Mayweather', 0, 16, 'PERSON'), ('Conor McGregor', 21, 35, 'PERSON')]
{agreed}

Claim: Gay Canadian filmmaker Chris Ball was severely beaten and bloodied by emboldened Donald Trump supporters shortly after the 2016 election.
[(25.0, 'gay canadian filmmaker chris ball'), (25.0, 'emboldened donald trump supporters shortly'), (4.0, 'severely beaten'), (4.0, '2016 election'), (1.0, 'bloodied')]
[('Gay', 0, 3, 'GPE'), ('Canadian', 4, 12, 'NORP'), ('Chris Ball', 23, 33, 'PERSON'), ('Donald Trump', 81, 93, 'PERSON'), ('2016', 123, 127, 'DATE')]
set()

Claim: "Kanye (West) looks and he sees black unemployment at the lowest it’s been in the history of our country. He sees Hispanic unemploymen

[('California', 1, 11, 'GPE'), ('sixth', 30, 35, 'ORDINAL'), ('Washington', 90, 100, 'GPE'), ('Oregon', 105, 111, 'GPE'), ('California', 113, 123, 'GPE'), ('the United Kingdom', 138, 156, 'GPE'), ('fifth', 171, 176, 'ORDINAL')]
{add, is, surpass}

Claim: Says Ann Kirkpatrick “votes with her party nearly 90 percent of the time.”
[(25.0, 'says ann kirkpatrick “ votes'), (16.0, 'party nearly 90 percent'), (4.0, 'time .”')]
[('Ann Kirkpatrick', 5, 20, 'PERSON'), ('nearly 90 percent', 43, 60, 'PERCENT')]
{Says}

Claim: A kitten died after ingesting a chemical sprayed on a Home Depot Christmas tree.
[(16.0, 'home depot christmas tree'), (4.0, 'kitten died'), (4.0, 'chemical sprayed'), (1.0, 'ingesting')]
[]
{died}

Claim: President Trump tweeted that Native Americans should be deported back to India.
[(9.0, 'president trump tweeted'), (4.0, 'native americans'), (4.0, 'deported back'), (1.0, 'india')]
[('Trump', 10, 15, 'PERSON'), ('Native', 29, 35, 'NORP'), ('Americans', 36, 45, 'NORP'), ('I

[('a half a billion (dollars', 10, 35, 'CARDINAL'), ('EPA', 48, 51, 'ORG'), ('the last two years', 60, 78, 'DATE'), ('3,000', 90, 95, 'CARDINAL'), ('EPA', 112, 115, 'ORG')]
{cut, cut}

Claim: E-mail describes ruse intended to scare away littering Mexican construction workers.
[(36.0, 'scare away littering mexican construction workers'), (16.0, 'mail describes ruse intended'), (1.0, 'e')]
[('Mexican', 55, 62, 'NORP')]
{describes, intended}

Claim: “If that proposal goes through, it will empower countries like Russia, like China, like Iran to be able to censor speech on the Internet, your speech.”
[(14.666666666666666, 'empower countries like russia'), (4.666666666666666, 'like iran'), (4.666666666666666, 'like china'), (4.0, 'speech .”'), (4.0, 'proposal goes'), (4.0, 'censor speech'), (1.0, '“'), (1.0, 'internet'), (1.0, 'able')]
[('Russia', 63, 69, 'GPE'), ('China', 76, 81, 'GPE'), ('Iran', 88, 92, 'GPE')]
{goes, empower, be}

Claim: John F. Kennedy Jr. was a frontrunner for a U.S. Se

[('Trump', 10, 15, 'PERSON'), ('Obama', 55, 60, 'PERSON'), ('the White House', 100, 115, 'ORG')]
{found, stash}

Claim: ISIS called for the assassination of Barron Trump in November 2017.
[(4.0, 'november 2017'), (4.0, 'isis called'), (4.0, 'barron trump'), (1.0, 'assassination')]
[('Barron Trump', 37, 49, 'ORG'), ('November 2017', 53, 66, 'DATE')]
{called}

Claim: California has "the sixth largest economy on planet Earth."
[(9.0, 'sixth largest economy'), (9.0, 'planet earth ."'), (1.0, 'california')]
[('California', 0, 10, 'GPE'), ('sixth', 20, 25, 'ORDINAL'), ('Earth', 52, 57, 'LOC')]
{has}

Claim: “This is bigger than Watergate."
[(4.0, 'watergate ."'), (1.0, '“'), (1.0, 'bigger')]
[('Watergate', 21, 30, 'EVENT')]
{is}

Claim: An image depicts a grieving obstetrician after his patient gave birth to a long-awaited baby and died.
[(9.0, 'patient gave birth'), (4.0, 'image depicts'), (4.0, 'grieving obstetrician'), (4.0, 'awaited baby'), (1.0, 'long'), (1.0, 'died')]
[]
{gave, depicts

[('Iowa', 69, 73, 'GPE')]
{found, is}

Claim: Meryl Streep was fired by director Ron Howard from a from a "Happy Days" reboot project for lying about Donald Trump mocking a disabled reporter.
[(9.0, 'donald trump mocking'), (9.0, 'director ron howard'), (4.0, 'reboot project'), (4.0, 'meryl streep'), (4.0, 'happy days'), (4.0, 'disabled reporter'), (1.0, 'lying'), (1.0, 'fired')]
[('Meryl Streep', 0, 12, 'PERSON'), ('Ron Howard', 35, 45, 'PERSON'), ('"Happy Days"', 60, 72, 'WORK_OF_ART'), ('Donald Trump', 104, 116, 'PERSON')]
set()

Claim: "A witness will testify against Hillary Clinton."
[(9.0, 'hillary clinton ."'), (1.0, 'witness'), (1.0, 'testify')]
[('Hillary Clinton', 32, 47, 'PERSON')]
{testify}

Claim: Says Ted Cruz said, "There is no place for gays or atheists in my America. None. Our Constitution makes that clear."
[(16.0, 'says ted cruz said'), (4.0, 'constitution makes'), (4.0, 'clear ."'), (1.0, 'place'), (1.0, 'none'), (1.0, 'gays'), (1.0, 'atheists'), (1.0, 'america')]
[

[('Autism', 0, 6, 'ORG'), ('DTaP', 66, 70, 'PERSON'), ('Diphtheria', 72, 82, 'PERSON'), ('Tetanus', 84, 91, 'PERSON'), ('Pertussis', 107, 116, 'GPE')]
set()

Claim: A photograph shows a rhinoceros saving a zebra foal.
[(4.0, 'zebra foal'), (4.0, 'rhinoceros saving'), (4.0, 'photograph shows')]
[]
{shows}

Claim: Adam Saleh was ejected from a Delta flight for speaking Arabic.
[(4.0, 'speaking arabic'), (4.0, 'delta flight'), (4.0, 'adam saleh'), (1.0, 'ejected')]
[('Adam Saleh', 0, 10, 'PERSON'), ('Delta', 30, 35, 'ORG'), ('Arabic', 56, 62, 'NORP')]
set()

Claim: Marshalls employees in Puerto Rico are still being paid despite the damage to stores there caused by Hurricane Maria.
[(4.0, 'puerto rico'), (4.0, 'paid despite'), (4.0, 'marshalls employees'), (4.0, 'hurricane maria'), (1.0, 'stores'), (1.0, 'still'), (1.0, 'damage'), (1.0, 'caused')]
[('Puerto Rico', 23, 34, 'GPE'), ('Hurricane Maria', 101, 116, 'PERSON')]
set()

Claim: Hillary Clinton had a teleprompter hidden on her lectern

[('George W. Bush', 17, 31, 'PERSON'), ('Dallas', 48, 54, 'GPE')]
set()

Claim: Graduation rates in Buffalo’s school district, which were "around 48 percent are now up to 64 percent" since Say Yes Buffalo launched.
[(23.5, 'since say yes buffalo launched'), (8.5, 'around 48 percent'), (5.5, 'buffalo ’'), (4.5, '64 percent'), (4.0, 'school district'), (4.0, 'graduation rates')]
[('Buffalo', 20, 27, 'GPE'), ('around 48 percent', 59, 76, 'PERCENT'), ('up to 64 percent', 85, 101, 'PERCENT'), ('Say Yes Buffalo', 109, 124, 'ORG')]
{were, are, launched}

Claim: NASA had to relabel the size chart used for a condom-like urination contraption built into the Maximum Absorbency Garment space suit system because astronauts refused to choose the "small" size.
[(36.0, 'maximum absorbency garment space suit system'), (16.0, 'like urination contraption built'), (8.0, 'size chart used'), (4.0, 'astronauts refused'), (2.0, 'size'), (1.0, 'small'), (1.0, 'relabel'), (1.0, 'nasa'), (1.0, 'condom'), (1.0, '