# Data Enhancement

In [18]:
import json
from urllib.parse import quote

import requests
# Load the articles: every non-blank line of the file is parsed as one JSON document.
# The encoding is explicit so non-ASCII text (e.g. curly quotes in titles) is decoded
# the same way on every platform.
with open("some_data_json.txt", encoding="utf-8") as d:
    data = [json.loads(line) for line in d if line.strip()]

# NYT Credentials
# The API key is read from a local file; all surrounding whitespace (including a
# Windows "\r\n" line ending) is removed so it cannot leak into request URLs.
with open(".credentials", "r", encoding="utf-8") as f:
    key = f.read().strip()

In [13]:
def getSemantic(concept, concept_type, key = key):
    
    '''Query the New York Times Semantic API for a single concept.

    NOTE: a function with the same name is defined again further down in this
    notebook; once that cell has run, it replaces this definition.

    Parameters
    ----------
    concept : str
        Concept name as it appears in an NYT facet, e.g. 'Republican Party'.
    concept_type : str
        One of 'des', 'geo', 'org' or 'per'.
    key : str
        NYT API key. The default is the value the module-level `key` had when
        this function was defined.

    Returns
    -------
    dict or list
        The first entry of the response's 'results' list, or an empty list when
        `concept` is empty, the request does not return HTTP 200, or the API
        has no result for the concept.

    Raises
    ------
    ValueError
        If `concept_type` is not one of the supported types.
    '''
    
    types = {'des':'nytd_des', 'geo':'nytd_geo', 'org':'nytd_org', 'per':'nytd_per'}
    
    if concept_type not in types:
        raise ValueError(f"concept_type must be one of {types}")
        
    # nothing to look up
    if len(concept) == 0:
        return []
        
    # build query: percent-encode the concept so characters such as '/', '?' or '#'
    # stay inside the path segment; https keeps the API key out of clear text
    concept_path = quote(concept, safe='')
    url = f'https://api.nytimes.com/svc/semantic/v2/concept/name/{types[concept_type]}/{concept_path}.json'
    
    # query the API; requests encodes the query-string values (including the key)
    response = requests.get(url, params={'fields': 'all', 'api-key': key})
    if response.status_code != 200:
        print("Something went wrong...")
        # an error payload has no usable 'results'; report "no result" instead of crashing
        return []
    
    # parse the body once and guard against a missing or empty result list
    results = response.json().get('results')
    return results[0] if results else []

### Example 1

In [27]:
# Pick one article as the running example and display its title and abstract.
ex = data[52]
ex["title"], ex["abstract"]

('Mueller Findings Kick Off a Political Tug of War That’s Only Just Beginning',
 'Democrats appear to be gearing up, with Representative Jerrold Nadler of New York describing the report as a road map to holding the president accountable.')

In [21]:
def printtags(tweet):
    '''Print an article's title, abstract and the NYT tags attached to it.

    Parameters
    ----------
    tweet : dict
        Article record with 'title' and 'abstract' entries and, optionally,
        'des_facet', 'org_facet', 'per_facet' and 'geo_facet' entries.

    Returns
    -------
    None
        The three values are printed, one per line; the tags are printed as a
        single list in the order des, org, per, geo.

    Raises
    ------
    KeyError
        If 'title' or 'abstract' is missing.
    '''
    
    def hashtag(hasht):
        '''makes a list of potential hashtags from the NYT tags'''
        
        # Only a list carries tags; a string, None or any other value counts as "no tags".
        return list(hasht) if isinstance(hasht, list) else []
    
    title = tweet['title']
    abstract = tweet['abstract']
    
    # A missing facet is treated like an empty one.
    all_hashtags = [
        tag
        for facet in ('des_facet', 'org_facet', 'per_facet', 'geo_facet')
        for tag in hashtag(tweet.get(facet))
    ]
    
    print(title)
    print(abstract)
    print(all_hashtags)

In [28]:
# Print the example article's title, abstract and combined tag list.
printtags(ex)

Mueller Findings Kick Off a Political Tug of War That’s Only Just Beginning
Democrats appear to be gearing up, with Representative Jerrold Nadler of New York describing the report as a road map to holding the president accountable.
['United States Politics and Government', 'Russian Interference in 2016 US Elections and Ties to Trump Associates', 'Presidential Election of 2016', 'House of Representatives', 'Republican Party', 'Justice Department', 'Senate Committee on the Judiciary', 'Democratic Party', 'Trump, Donald J', 'Mueller, Robert S III', 'Russia']


In [44]:
# Look up one person, one organisation and one descriptor tag of the example
# article in the NYT Semantic API (each call issues an HTTP request).
semantic = getSemantic('Mueller, Robert S III', 'per')
semantic_2 = getSemantic('Republican Party', 'org')
semantic_3 = getSemantic('Russian Interference in 2016 US Elections and Ties to Trump Associates', 'des')

In [55]:
def yago_exists(semantic):
    '''Tell whether an NYT semantic concept carries a Wikipedia raw name.

    Looks through semantic["links"] for the first entry whose "link_type" is
    "wikipedia_raw_name". When one is found, its "link" value is printed after
    a short notice and True is returned; otherwise (including when there is no
    "links" entry at all) nothing is printed and False is returned.
    '''
    
    if "links" in semantic:
        # first link that is tagged as a Wikipedia raw name, if any
        wiki_link = next(
            (entry for entry in semantic["links"]
             if entry["link_type"] == "wikipedia_raw_name"),
            None,
        )
        if wiki_link is not None:
            print("You may be able to find this concept in Yago as:")
            print(wiki_link["link"])
            return True
        
    return False

In [56]:
# Check the person concept fetched above for a Wikipedia raw name.
yago_exists(semantic)

You may be able to find this concept in Yago as:
Robert_Mueller


True

In [57]:
# Same check for the organisation concept.
yago_exists(semantic_2)

You may be able to find this concept in Yago as:
Republican_Party_(United_States)


True

In [59]:
# Same check for the descriptor concept.
yago_exists(semantic_3)

False

## Strategy
- get a tweet from Kafka
- check the tweet's tags and whether anything can be found about them in Yago
- pick one of the matching tags at random
- retrieve information about it from Yago (with Spark)
- pick one fact at random and post the enhanced data to Kafka

In [118]:
def getTags(tweet):
    
    '''retrieve all tags about organisations, people, and places for a given article

    Parameters
    ----------
    tweet : dict
        Article record; its 'org_facet', 'per_facet' and 'geo_facet' entries
        are read.

    Returns
    -------
    dict
        {"org": [...], "per": [...], "geo": [...]}. Every value is a new list.
        A facet that is missing or is not a list (e.g. a string or None)
        yields an empty list, so callers can always iterate over the values.
    '''
    
    def tags(tag):   
        '''lists the NYT tags'''     
        # Only a list carries tags; anything else counts as "no tags".
        return list(tag) if isinstance(tag, list) else []

    all_tags = {kind: tags(tweet.get(f'{kind}_facet')) for kind in ('org', 'per', 'geo')}
    
    return all_tags

    
def getSemantic(concept, concept_type, key = key):
    
    '''Query the New York Times Semantic API for a single concept.

    Parameters
    ----------
    concept : str
        Concept name as it appears in an NYT facet, e.g. 'Republican Party'.
    concept_type : str
        One of 'des', 'geo', 'org' or 'per'.
    key : str
        NYT API key. The default is the value the module-level `key` had when
        this function was defined.

    Returns
    -------
    dict or list
        The first entry of the response's 'results' list, or an empty list when
        `concept` is empty, the request does not return HTTP 200, or the API
        has no result for the concept.

    Raises
    ------
    ValueError
        If `concept_type` is not one of the supported types.
    '''
    
    types = {'des':'nytd_des', 'geo':'nytd_geo', 'org':'nytd_org', 'per':'nytd_per'}
    
    if concept_type not in types:
        raise ValueError(f"concept_type must be one of {types}")
        
    # nothing to look up
    if len(concept) == 0:
        return []
        
    # build query: percent-encode the concept so characters such as '/', '?' or '#'
    # stay inside the path segment; https keeps the API key out of clear text
    concept_path = quote(concept, safe='')
    url = f'https://api.nytimes.com/svc/semantic/v2/concept/name/{types[concept_type]}/{concept_path}.json'
    
    # query the API; requests encodes the query-string values (including the key)
    response = requests.get(url, params={'fields': 'all', 'api-key': key})
    if response.status_code != 200:
        print("Something went wrong...")
        # an error payload has no usable 'results'; report "no result" instead of crashing
        return []
        
    # parse the body once and guard against a missing or empty result list
    results = response.json().get('results')
    return results[0] if results else []


def yagoExists(semantic):
    
    '''Return the Wikipedia raw name of an NYT semantic concept, if it has one.

       Yago concepts are named after their Wikipedia page, so a concept that
       carries a "wikipedia_raw_name" link can probably be found in Yago.
       The first such link is used: its Wikipedia URL is printed and its
       "link" value is returned. Without a "links" entry, or without a
       matching link, nothing is printed and False is returned.'''
    
    if "links" in semantic:
        # first link that is tagged as a Wikipedia raw name, if any
        wiki_link = next(
            (entry for entry in semantic["links"]
             if entry["link_type"] == "wikipedia_raw_name"),
            None,
        )
        if wiki_link is not None:
            print("You may be able to find this concept on wikipedia:")
            print("https://en.wikipedia.org/wiki/" + wiki_link["link"])
            return wiki_link["link"]
        
    return False

def extractYago(tags):
    '''Collect the Wikipedia/Yago names of all tags that have one.

    `tags` maps a concept type to a list of concept names. For each type, every
    name is first looked up with getSemantic, then each lookup result is
    passed to yagoExists. The truthy results are returned as one flat list,
    in the order the types and names were visited.
    '''
    found = []
    for concept_type, names in tags.items():
        # all lookups for this type run before any of them is checked
        lookups = [getSemantic(name, concept_type) for name in names]
        for lookup in lookups:
            wiki_name = yagoExists(lookup)
            if wiki_name:
                found.append(wiki_name)
    return found

## Example 2

In [119]:
# Collect the organisation, person and place tags of another article.
tags = getTags(data[89])

In [120]:
# Resolve those tags to Wikipedia/Yago names (queries the NYT API once per tag).
extractYago(tags)

You may be able to find this concept on wikipedia:
https://en.wikipedia.org/wiki/Gabrielle_Giffords
You may be able to find this concept on wikipedia:
https://en.wikipedia.org/wiki/Minnesota


['Gabrielle_Giffords', 'Minnesota']

## Example 3

In [123]:
# Same two steps for a third article: collect its tags, then resolve them.
tags = getTags(data[130])
extractYago(tags)

You may be able to find this concept on wikipedia:
https://en.wikipedia.org/wiki/London
You may be able to find this concept on wikipedia:
https://en.wikipedia.org/wiki/Great_Britain
You may be able to find this concept on wikipedia:
https://en.wikipedia.org/wiki/Germany


['London', 'Great_Britain', 'Germany']