# Preliminary API

In [5]:
import glob
import json
import os

import TWB

## Load data

In [6]:
# Make sure that the tags are also Wikipedia articles!
# (The API class in this notebook passes the tags to TWB.Wikipedia.)

# JSON files are UTF-8; be explicit instead of relying on the platform default.
with open('data/tags.json', encoding='utf-8') as ifd:
    tags = json.load(ifd)
#ewith

# The API key is read from the environment so the credential is not stored in
# the notebook. NOTE(review): the key that used to be hardcoded here has been
# committed and should be considered leaked -- revoke it.
api_key = os.environ.get('NEWS_API_KEY')
if not api_key:
    raise RuntimeError('Set the NEWS_API_KEY environment variable to your news API key.')
#fi

## Load the XLIFF files and build the API

In [7]:
# NOTE: machine-specific absolute path -- adjust to where the challenge files live.
data_location = '/Users/thiesgehrmann/Downloads/TWB/documents/hackathon-for-good-2019_TWB-challenge_files/*.sdlxliff'

# glob returns matches in a filesystem-dependent order; sort them so that the
# [:100] subset (and any later positional lookup such as xliff[10]) is reproducible.
xliff_files = sorted(glob.glob(data_location))

# Only the first 100 files are parsed.
xliff = [ TWB.XLIFF(f) for f in xliff_files[:100] ]

In [8]:
# Build the packaged TWB API object from the loaded tags and the API key.
api = TWB.API(tags, api_key)

## Get data for xliffs from the API

In [9]:
# Run api.get on each of the first 20 loaded XLIFF objects.
X = list(map(api.get, xliff[:20]))

In [13]:
# Topics for which preload() is called on a TWB.Trends object.
trends_tags = [
    'disability',
    'gender',
    'genital mutilation',
    'humanitarian crisis',
    'natural disaster',
    'racism',
]

for topic in trends_tags:
    trend = TWB.Trends(None, topic=topic)
    # Print the topic before preloading so progress is visible.
    print(topic)
    trend.preload()
#efor

disability
gender
genital mutilation
humanitarian crisis
natural disaster
racism


In [35]:
import TWB

from scipy.spatial.distance import cdist

class API(object):
    """
    Annotate an XLIFF document with its closest topics and, per topic, the
    countries (from news and trends data) whose languages include the
    document's target language.
    """

    # Number of top-ranked trend entries considered per topic.
    MAX_TREND_COUNTRIES = 10

    def __init__(self, tags, news_api_key):
        """
        Build all helper objects needed for annotation.

        parameters:
        -----------
        tags: iterable of str
            Topics; passed to TWB.Wikipedia and TWB.News, and one TWB.Trends
            object is created per tag.

        news_api_key: str
            Key passed on to TWB.News.
        """
        self.lsc = TWB.LSC()
        self.W = TWB.Wikipedia(tags)

        article_texts = [ a.text() for a in self.W.articles ]

        self.trends = { t: TWB.Trends(None, topic=t) for t in tags }

        self.N  = TWB.News(tags, news_api_key)
        self.TC = self.N.topic_countries

        # The dictionary is built from the article texts, and the same texts are
        # annotated once so documents can later be compared against them.
        self.D = TWB.Dictionary(article_texts)
        self.A = self.D.annotate(article_texts)
    #edef

    def get(self, xliff, timestamp, max_topics=10):
        """
        Get information about an xliff object

        parameters:
        -----------
        xliff: XLIFF
            A single TWB.XLIFF object.

        timestamp:
            Passed unchanged to Trends.ranked_countries_per_topic
            (the notebook uses strings such as '2014-06').

        max_topics: Integer
            Maximum number of topics to report (sorted by prevalence)

        returns:
        dict with keys 'target_lsc', 'source_lsc' and 'topics'

        raises:
        ValueError if xliff is not a TWB.XLIFF instance
        """

        if not isinstance(xliff, TWB.XLIFF):
            raise ValueError('Expecting a single XLIFF.')
        #fi

        XA = { 'target_lsc' : self.lsc.detect(xliff.target_lang),
               'source_lsc' : self.lsc.detect(xliff.source_lang) }

        XA['topics'] = self._get_topics_languages(xliff, timestamp, max_topics)

        # Hook for further annotations, e.g.
        #XA['o_annot'] = self._other_annot(xliff)

        return XA
    #edef

    def _country_relevance(self, country_counts, target_lang, source_lang):
        """
        Keep the countries whose languages include the target language.

        parameters:
        -----------
        country_counts: iterable of (country, count) pairs

        target_lang, source_lang:
            Results of self.lsc.detect; only their .language.iso3 is read.

        returns:
        dict country -> {'frequency', 'target_rel', 'source_rel'}
        """
        relevance = {}
        for (country, count) in country_counts:
            # Look the country's languages up once instead of once per test.
            languages = self.lsc.country_languages(country)
            target_rel = target_lang.language.iso3 in languages
            if not target_rel:
                # Only target-language countries are reported.
                continue
            #fi
            relevance[country] = { 'frequency'  : count,
                                   'target_rel' : target_rel,
                                   'source_rel' : source_lang.language.iso3 in languages }
        #efor
        return relevance
    #edef

    def _get_topics_languages(self, xliff, timestamp, max_topics=10):
        """
        Get the topic and target language relevance for an xliff object

        parameters:
        -----------
        xliff: XLIFF
            A single xliff; its source segments are joined with spaces and
            compared against the annotated Wikipedia articles.

        timestamp:
            Passed unchanged to Trends.ranked_countries_per_topic.

        max_topics: Integer
            Maximum number of topics to report (sorted by prevalence)

        returns:
        dict topic -> { 'distance', 'news_country_languages',
                        'trends_country_languages' }
        """

        doc_text = [ ' '.join(xliff.source) ]

        # A single document was annotated, so only the first row of distances
        # (document vs. every article, cdist's default metric) is needed.
        dists = cdist(self.D.annotate(doc_text), self.A)[0]

        articles = self.W.articles

        # Closest articles first, each labelled with its topic.
        ranked = sorted(enumerate(dists), key=lambda x: x[1])
        ranked = [ (articles[i].topic, d) for (i, d) in ranked ]

        # A topic's distance is that of its closest article.
        grouped = TWB.common.group(ranked, key=lambda x: x[0], value=lambda x: x[1])
        closest = { t: min(v) for (t, v) in grouped.items() }
        topics  = dict(sorted(closest.items(), key=lambda x: x[1])[:max_topics])

        target_lang = self.lsc.detect(xliff.target_lang)
        source_lang = self.lsc.detect(xliff.source_lang)

        annot = {}
        for (t, d) in topics.items():
            # dict() collapses repeated countries among the top-ranked entries.
            top_trends = dict(self.trends[t].ranked_countries_per_topic(timestamp)[:self.MAX_TREND_COUNTRIES])
            annot[t] = {
                'distance' : d,
                'news_country_languages' :
                    self._country_relevance(self.TC[t], target_lang, source_lang),
                'trends_country_languages' :
                    self._country_relevance(top_trends.items(), target_lang, source_lang),
            }
        #efor
        return annot
    #edef
#eclass

In [27]:
# Rebind `api` to the API class defined in this notebook; this replaces the
# TWB.API instance created earlier under the same name.
api = API(tags, api_key)

In [34]:
# Full annotation example, kept disabled: api.get(xliff[10], '2014-06')
#
# For every tag, show the (country, value) pairs from the first 10 entries
# ranked for '2014-06'.
[
    list(dict(api.trends[t].ranked_countries_per_topic('2014-06')[:10]).items())
    for t in tags
]

[[('HND', 41.0),
  ('IRQ', 13.0),
  ('MMR', 4.0),
  ('SMR', 2.0),
  ('YEM', 0.0),
  ('CYM', 0.0),
  ('ESH', 0.0),
  ('GIB', 0.0),
  ('GRC', 0.0),
  ('GUY', 0.0)],
 [('IND', 64.0),
  ('SRB', 52.0),
  ('ZAF', 13.0),
  ('PHL', 11.0),
  ('NGA', 10.0),
  ('AUS', 7.0),
  ('MEX', 6.0),
  ('MMR', 6.0),
  ('JPN', 5.0),
  ('CAN', 4.0)],
 [('AFG', 0),
  ('ALA', 0),
  ('ALB', 0),
  ('DZA', 0),
  ('ASM', 0),
  ('AND', 0),
  ('AGO', 0),
  ('AIA', 0),
  ('ATA', 0),
  ('ATG', 0)],
 [('IND', 48.0),
  ('AUS', 48.0),
  ('CAN', 38.0),
  ('GEO', 36.0),
  ('SGP', 34.0),
  ('BRA', 32.0),
  ('IRL', 27.0),
  ('ZAF', 25.0),
  ('KEN', 24.0),
  ('PHL', 24.0)],
 [('IND', 45.0),
  ('KEN', 45.0),
  ('NGA', 45.0),
  ('BGD', 38.0),
  ('PAK', 34.0),
  ('PHL', 33.0),
  ('SGP', 28.0),
  ('GIN', 25.0),
  ('IDN', 25.0),
  ('AUS', 24.0)],
 [('SWE', 24.0),
  ('IND', 18.0),
  ('SOM', 10.0),
  ('NGA', 4.0),
  ('EGY', 3.0),
  ('SWZ', 0.0),
  ('SLB', 0.0),
  ('ISL', 0.0),
  ('IRQ', 0.0),
  ('IRN', 0.0)],
 [('BRA', 51.0),
  ('AUT

In [10]:
# Display the loaded tag list.
tags

['humanitarian crisis',
 'natural disaster',
 'environmental crisis',
 'disability',
 'gender',
 'genital mutilation',
 'racism',
 'genocide',
 'civil war',
 'terrorism',
 'infectious disease',
 'political revolution',
 'political prisoner',
 'amnesty',
 'corruption',
 'health',
 'gender inequality',
 'rape',
 'ebola',
 'aids',
 'hiv',
 'technology',
 'artificial intelligence',
 'military',
 'war',
 'climate change',
 'starvation',
 'food shortage',
 'dehydration',
 'water shortage',
 'attack',
 'aggression',
 'logistics',
 'nutrition',
 'protection',
 'shelter',
 'drinking water',
 'sanitation',
 'hygiene',
 'refugee camp',
 'education',
 'emergency communication system',
 'food security',
 'human rights',
 'children',
 'pregnancy',
 'old age',
 'justice',
 'law',
 'maintenance (technical)',
 'homelessness',
 'art',
 'culture',
 'indigenous peoples',
 'police brutality',
 'prisoner abuse',
 'cultural heritage',
 'sexual slavery',
 'child sexual abuse',
 'exploitation of labour',
 'hum

In [30]:
import pandas as pd

# Small two-column frame: x and y both run 0..9.
pd.DataFrame({'x': range(10), 'y': range(10)})

Unnamed: 0,x,y
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6
7,7,7
8,8,8
9,9,9
