# Option 2, 
### Identify the main people, organisations, and sources mentioned on the Volkswagen emissions scandal

In [2]:
import nltk
import glob
import gzip
import codecs
import re
import sys 
import os
import json
from pycorenlp import StanfordCoreNLP
from nltk.tree import Tree

## Identify entities and their relations using the CoreNLP tool

In this experiment, I use the Stanford CoreNLP tool to identify entities and their relations in the text. CoreNLP ships trained models for entity recognition and KBP relation extraction. To use it, first download CoreNLP from https://stanfordnlp.github.io/CoreNLP/download.html, then start the CoreNLP server in a terminal with the following command:

java -mx8g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators tokenize, ssplit, pos, lemma, parse, relation -port 9000 -timeout 30000


In the following block, I define three functions that extract named entities and their relations from text using the CoreNLP server.


In [6]:
# Shared CoreNLP client used by get_token_ner and get_kbp_relation below.
# It targets localhost:9000, the port given in the server start-up command above.
nlp = StanfordCoreNLP('http://localhost:9000')

def get_token_ner(text):
    """Tokenize *text* and collect its named-entity mentions via CoreNLP.

    The input is reduced to plain ASCII (anything else is dropped) and sent
    to the module-level ``nlp`` client with the ``ner, relation`` annotators
    and JSON output.

    Args:
        text: Byte string or text string to annotate.

    Returns:
        A ``(tokens, ner_list)`` pair. ``tokens`` is a flat list with the
        ``originalText`` of every token of every returned sentence.
        ``ner_list`` holds one ``(text, ner, tokenBegin, tokenEnd)`` tuple
        per entity mention, gathered across all returned sentences.
    """
    # Normalise to ASCII in a way that runs on Python 2 and Python 3.
    # On Python 2 this yields exactly what the earlier str/unicode branches
    # produced; on Python 3 it avoids the undefined `unicode` name and the
    # missing str.decode method.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'ignore')
    ascii_bytes = text.encode('ascii', 'ignore')
    # Hand the client the native `str` type of the running interpreter.
    text = ascii_bytes if isinstance(ascii_bytes, str) else ascii_bytes.decode('ascii')

    output = nlp.annotate(text, properties={ 'annotators': 'ner, relation', 'outputFormat': 'json'} )

    # Iterate over the sentence list itself. Looping over range(len(output))
    # counted the top-level keys of the response dict instead, which skipped
    # sentences (or raised IndexError) whenever those two numbers differed.
    sentences = output['sentences']

    tokens = [tok['originalText'] for sentence in sentences for tok in sentence['tokens']]

    # NOTE(review): tokenBegin/tokenEnd are copied verbatim from CoreNLP and
    # are presumably sentence-relative -- confirm before using them to index
    # the flat `tokens` list when several sentences are returned.
    ner_list = [
        (mention['text'], mention['ner'], mention['tokenBegin'], mention['tokenEnd'])
        for sentence in sentences
        for mention in sentence['entitymentions']
    ]

    return tokens, ner_list


def get_kbp_relation(text):
    """Extract KBP relation triples from *text* via CoreNLP.

    The input is reduced to plain ASCII (anything else is dropped) and sent
    to the module-level ``nlp`` client with the ``kbp`` annotator and JSON
    output.

    Args:
        text: Byte string or text string to annotate.

    Returns:
        A list of ``(subject, relation, object)`` tuples, one per KBP triple
        found in any sentence of the response. The list is empty when the
        response contains no sentences or no triples.
    """
    # Normalise to ASCII in a way that runs on Python 2 and Python 3.
    # On Python 2 this yields exactly what the earlier str/unicode branches
    # produced; on Python 3 it avoids the undefined `unicode` name and the
    # missing str.decode method.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'ignore')
    ascii_bytes = text.encode('ascii', 'ignore')
    # Hand the client the native `str` type of the running interpreter.
    text = ascii_bytes if isinstance(ascii_bytes, str) else ascii_bytes.decode('ascii')

    output = nlp.annotate(text, properties={"annotators": "kbp", "outputFormat": "json",  })

    # Read every sentence rather than only sentences[0]: CoreNLP does its own
    # sentence splitting, so the input may come back as zero sentences (which
    # used to raise IndexError) or as several (whose triples were dropped).
    return [
        (rel['subject'], rel['relation'], rel['object'])
        for sentence in output['sentences']
        for rel in sentence['kbp']
    ]



def get_relation(text, server_url="http://localhost:9001/"):
    """Extract OpenIE relation triples from *text* via CoreNLP.

    The input is reduced to plain ASCII (anything else is dropped) and sent
    to a CoreNLP server with the OpenIE pipeline
    (``tokenize,ssplit,pos,depparse,natlog,openie``) and JSON output.

    Args:
        text: Byte string or text string to annotate.
        server_url: Address of the CoreNLP server to query. The default is
            the address this function has always used; note that it is port
            9001, not the port 9000 used by the module-level ``nlp`` client.

    Returns:
        A list of ``(relation, subject, object)`` tuples, one per OpenIE
        triple found in any sentence of the response. The relation comes
        first here, unlike the subject-first tuples of ``get_kbp_relation``.
    """
    # Normalise to ASCII in a way that runs on Python 2 and Python 3.
    # On Python 2 this yields exactly what the earlier str/unicode branches
    # produced; on Python 3 it avoids the undefined `unicode` name and the
    # missing str.decode method.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'ignore')
    ascii_bytes = text.encode('ascii', 'ignore')
    # Hand the client the native `str` type of the running interpreter.
    text = ascii_bytes if isinstance(ascii_bytes, str) else ascii_bytes.decode('ascii')

    # A dedicated client is created per call, as before; it deliberately does
    # not reuse the module-level `nlp`, which points at a different port.
    client = StanfordCoreNLP(server_url)
    output = client.annotate(text, properties={"annotators":"tokenize,ssplit,pos,depparse,natlog,openie", "outputFormat": "json","openie.triple.strict":"true", "openie.max_entailments_per_clause":"1"})

    # Collect the triples of every sentence exactly once. The earlier
    # comprehension iterated over the keys of the response dict and re-read
    # sentences[0] each time, so it duplicated the first sentence's triples
    # and never looked at the remaining sentences.
    return [
        (rel['relation'], rel['subject'], rel['object'])
        for sentence in output["sentences"]
        for rel in sentence["openie"]
    ]


# Filter out the documents that are not relevant and extract entities and relations from the filtered texts

For this experiment, we consider all articles whose text contains 'Volkswagen' together with 'scandal' or 'emissions'. To identify the main people and organisations involved in the scandal, we run entity recognition on each sentence of the filtered articles; for every sentence that contains at least one person name and one organization name, we apply the relation extraction model to predict the relations in that sentence.


The following are a few examples of the relation instances we obtained in the experiment:

Source:Malay Mail	 Rel:Martin Winterkorn||per:employee_or_member_of||VW	
Source:Malay Mail	 Rel:VW||org:top_members_employees||Martin Winterkorn	
Source:Malay Mail	 Rel:Christian Stadler||per:employee_or_member_of||Volkswagen	
Source:Charlotte Observer	 Rel:Volkswagen||org:top_members_employees||Herbert Diess


Here, from the source 'Malay Mail', we obtained that 'Martin Winterkorn' is a high-ranking employee and 'Christian Stadler' is an employee of the Volkswagen organization.

Likewise, the source 'Charlotte Observer' reports that 'Herbert Diess' is an employee of the 'Volkswagen' organization.




In [None]:
# Scan the news corpus, keep the articles about the Volkswagen emissions
# scandal, and run KBP relation extraction on every sentence that mentions
# both a person and an organisation.

# Input file: one JSON document per line, optionally gzip-compressed.
in_f = 'signalmedia-1m.jsonl.gz'
f_reader = gzip.open(in_f, 'rb') if in_f.endswith('.gz') else codecs.open(in_f, 'r', 'utf-8')

# Every relation seen so far, stored in both directions (kept as a list so
# its contents and order stay as before), plus a set used only for the
# membership test, which avoids rescanning the whole list for each relation.
rel_res = []
seen_rels = set()

# Output files: the sentences handed to the relation extractor, and the
# extracted relations that involve Volkswagen. The `with` statement closes
# all three files even if processing fails part-way, so buffered output is
# flushed to disk instead of being lost.
with f_reader, \
        codecs.open('text_for_rel.txt', 'w', 'utf-8') as fw_sent, \
        codecs.open('kbp_results.txt', 'w', 'utf-8') as fw_rel:

    for line in f_reader:
        d = json.loads(line)

        # Collapse runs of spaces, split the body into sentences, and treat
        # the title as the first sentence.
        text = re.sub(' +', ' ', d['content'])
        sent_list = [d['title']] + [sent.strip() for sent in nltk.sent_tokenize(text)]
        text = ' '.join(sent_list)

        # Relevance filter: the article must name Volkswagen and mention the
        # scandal or emissions (only these exact spellings are matched).
        mentions_vw = 'Volkswagen' in text or 'volkswagen' in text
        mentions_topic = any(
            keyword in text
            for keyword in ('scandal', 'Scandal', 'emissions', 'Emissions')
        )
        if not (mentions_vw and mentions_topic):
            continue

        for sent in sent_list:
            # The sentence stays a text string: the extraction helpers do
            # their own ASCII normalisation, and encoding to UTF-8 bytes here
            # would make the str-pattern re.sub below fail on Python 3.
            sent = re.sub('\n', ' ', sent)

            # Skip sentences of 100 or more whitespace-separated tokens.
            if len(sent.split()) >= 100:
                continue

            token_list, ent_list = get_token_ner(sent)
            type_list = [ent[1] for ent in ent_list]

            # Only sentences with at least one organisation and one person
            # are worth sending to the relation extractor.
            if 'ORGANIZATION' not in type_list or 'PERSON' not in type_list:
                continue

            fw_sent.write('%s\n'%(' '.join(token_list)))

            # Get relations using the CoreNLP KBP extractor.
            for rel in get_kbp_relation(sent):
                if rel in seen_rels:
                    continue

                # Record the triple in both directions so the mirrored form
                # is also treated as already seen.
                forward = (rel[0], rel[1], rel[2])
                backward = (rel[2], rel[1], rel[0])
                rel_res.append(forward)
                rel_res.append(backward)
                seen_rels.add(forward)
                seen_rels.add(backward)

                # Report only relations whose subject, relation or object is
                # exactly 'Volkswagen' or 'VW'.
                if 'Volkswagen' in rel or 'VW' in rel:
                    fw_rel.write( '%s\t%s\t%s\n' %(rel[0], rel[1], rel[2] ) )
                    print ('Source:%s\t Rel:%s\t' %(d['source'], '||'.join(rel) ) )
                    
                    
                    
    
     

Source:Malay Mail	 Rel:Martin Winterkorn||per:employee_or_member_of||VW	
Source:Malay Mail	 Rel:VW||org:top_members_employees||Martin Winterkorn	
Source:Malay Mail	 Rel:Christian Stadler||per:employee_or_member_of||Volkswagen	
Source:Charlotte Observer	 Rel:Volkswagen||org:top_members_employees||Herbert Diess	
Source:Charlotte Observer	 Rel:Herbert Diess||per:employee_or_member_of||Volkswagen	
Source:Charlotte Observer	 Rel:Volkswagen||org:top_members_employees||Winfried Vahland	
Source:Charlotte Observer	 Rel:Volkswagen||org:top_members_employees||Matthias Mueller	
Source:Charlotte Observer	 Rel:Matthias Mueller||per:employee_or_member_of||Volkswagen	
Source:Charlotte Observer	 Rel:Volkswagen||org:country_of_headquarters||U.S.	
Source:Charlotte Observer	 Rel:Volkswagen||org:top_members_employees||Mueller	
Source:TheStar.com.my	 Rel:Martin Winterkorn||per:employee_or_member_of||Volkswagen	
Source:TheStar.com.my	 Rel:Volkswagen||org:top_members_employees||Martin Winterkorn	
Source:ADVFN