In [1]:
import pickle
import threading
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from tqdm import tqdm

In [2]:
# Input/output locations, relative to the notebook's working directory.
# Raw articles, stored as CSV.
source = '../data/source/newsclust.csv'
# Gzip-compressed pickle of the DataFrame once the entity-sentiment column has been added.
df_file_name = '../data/source/df-entities-sentiment.pickle.gz'
# Stream of pickled (row_id, entity, error) tuples, one per processed article.
entities_file_name = '../data/entities.pickle'

In [3]:
# Load the raw articles (path comes from the configuration cell) and drop
# everything published by cbn.com.
df = pd.read_csv(source)
df = df.query("site != 'cbn.com'")
print(len(df))
# pandas names the CSV's header-less first column 'Unnamed: 0'; promote it to a
# unique index called 'id' (verify_integrity raises on duplicate ids).
df = df.set_index('Unnamed: 0', drop=True, verify_integrity=True)
df.index.names = ['id']
# Restrict the run to a very small sample so the paid API cell stays cheap.
# Comment out the next line to process the full dataset.
# .copy() yields an independent frame, so the column assignments made in later
# cells do not trigger SettingWithCopyWarning.
df = df.head(50).copy()

103262


In [4]:
from bias import SimplifiedBias

# Label every article with the simplified bias value of its publishing site.
# Only the 'site' column is needed, so map over that Series directly instead of
# a row-wise DataFrame.apply: no per-row Series is built, and an empty frame
# still yields a (empty) column rather than an empty DataFrame.
df['bias'] = df['site'].map(
    lambda site: SimplifiedBias.get_simplified_bias_for_domain(site).value
)
df.head(2)

Unnamed: 0_level_0,date,site,text,title,url,bias
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2015-01-29T23:14:00.000+02:00,washingtonexaminer.com,Class action filed over United’s ‘low fare gua...,Class action filed over United’s ‘low fare gua...,http://www.washingtonexaminer.com/class-action...,1
1,2015-01-23T02:00:00.000+02:00,nydailynews.com,Jupiterimages/Getty Images/Goodshoot RF Snuggl...,Portland pro cuddler hosts ‘Cuddle Con’ on Val...,http://www.nydailynews.com/news/national/portl...,1


In [None]:
from google.cloud.gapic.language.v1beta2 import enums
from google.cloud.gapic.language.v1beta2 import language_service_client
from google.cloud.proto.language.v1beta2 import language_service_pb2
import six
import sys
import traceback

# taken from https://cloud.google.com/natural-language/docs/beta#analyze-entity-sentiment
def entity_sentiment_text(text):
    """Run entity-sentiment analysis on a single piece of plain text.

    A fresh ``LanguageServiceClient`` is created on every call.

    Args:
        text: The text to analyse, either as a unicode string or as UTF-8
            encoded bytes.

    Returns:
        The response object returned by ``analyze_entity_sentiment``.
    """
    language_client = language_service_client.LanguageServiceClient()

    # Normalise byte input to unicode so the re-encoding below is uniform.
    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Narrow Python builds (maxunicode == 65535) get UTF16, everything else
    # UTF32.
    encoding = (
        enums.EncodingType.UTF16
        if sys.maxunicode == 65535
        else enums.EncodingType.UTF32
    )

    document = language_service_pb2.Document()
    document.type = enums.Document.Type.PLAIN_TEXT
    document.content = text.encode('utf-8')

    return language_client.analyze_entity_sentiment(document, encoding)

from time import sleep
# Guards every piece of state shared between worker threads: the output file,
# the result list, the error counter and the progress bar. Without it two
# threads can interleave their writes and leave an unreadable pickle stream.
_ENTITIES_WRITE_LOCK = threading.Lock()


def get_entities_and_save_to_disk(pbar, row, out, error, ents):
    """Analyse one article and record the outcome in memory and on disk.

    Exactly one ``(row_id, entity, error_info)`` tuple is produced per call.
    On success ``error_info`` is ``None``; on failure ``entity`` is ``None``
    and ``error_info`` is ``(exception_type, exception_instance)``.

    Args:
        pbar: Progress bar; ``update(1)`` is called once the row is finished
            and ``set_postfix`` is called when the row failed.
        row: ``(row_id, text)`` pair.
        out: Binary file object that the pickled tuple is appended to.
        error: Dict whose ``'count'`` entry is incremented on failure.
        ents: List that the tuple is appended to.

    Raises:
        Whatever ``pickle.dumps`` raises if the *failure* record itself cannot
        be pickled (for example an unpicklable exception instance).
    """
    row_id, text = row[0], row[1]
    try:
        # Execute Google's API call
        entity = entity_sentiment_text(text)
        # To dry-run without calling the API, replace the call above with:
        # entity = None; sleep(0.5)
        record = (row_id, entity, None)
        # Serialise before touching the file: an unpicklable result is then
        # recorded as a failure instead of leaving a partial record on disk.
        payload = pickle.dumps(record)
    except Exception as exc:
        # Best effort by design: a failing row must not abort the whole run.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through.
        record = (row_id, None, (type(exc), exc))
        payload = pickle.dumps(record)

    failed = record[2] is not None
    with _ENTITIES_WRITE_LOCK:
        ents.append(record)
        out.write(payload)
        # Tick only once the row is really done, so the bar reflects finished
        # work rather than started work.
        pbar.update(1)
        if failed:
            error['count'] += 1
            pbar.set_postfix(error_count=error['count'])

def parallel_get_and_save_entities(df, ents, max_workers=9):
    """Analyse every article in ``df`` concurrently, saving results as they arrive.

    The file named by ``entities_file_name`` is opened in ``'wb'`` mode, so
    any results from a previous run are overwritten.

    Args:
        df: DataFrame with a ``'text'`` column; its index values are used as
            row ids.
        ents: List that receives one ``(row_id, entity, error)`` tuple per row.
        max_workers: Number of worker threads issuing API calls in parallel.
    """
    error = {'count': 0}
    # Context managers exit right-to-left: the pool drains first, then the
    # progress bar closes, then the file is flushed and closed.
    with open(entities_file_name, 'wb') as out, \
            tqdm(total=len(df)) as pbar, \
            ThreadPoolExecutor(max_workers=max_workers) as pool:

        def process(row):
            return get_entities_and_save_to_disk(pbar, row, out, error, ents)

        id_and_text = zip(df.index.values, df['text'])
        # Draining the iterator waits for every task and re-raises any
        # exception that escaped a worker.
        list(pool.map(process, id_and_text))

In [None]:
# Shared list that the worker threads fill with (row_id, entity, error) tuples.
ents = []
# WARNING: on the full dataset this run costs roughly US$300 - US$400 in Google API charges.
parallel_get_and_save_entities(df, ents)
print(len(ents))

 98%|█████████▊| 49/50 [00:11<00:00,  4.60it/s, error_count=40]

In [None]:
def read_entities_from_disk(file_name=None):
    """Load every record from a file of back-to-back pickles.

    Args:
        file_name: Path of the file to read. Defaults to the module-level
            ``entities_file_name``.

    Returns:
        List of the unpickled objects in file order; empty for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    if file_name is None:
        file_name = entities_file_name

    records = []
    # SECURITY: unpickling can execute arbitrary code; only read files that
    # this notebook produced itself.
    with open(file_name, 'rb') as infile:
        while True:
            try:
                records.append(pickle.load(infile))
            except EOFError:
                # End of stream: everything has been read.
                break
    return records

In [None]:
# Reload every record written by the extraction run, report how many there are
# and display the last one as a sanity check.
entities = read_entities_from_disk()
print(len(entities))
entities[-1]

In [None]:
# Records were written in thread *completion* order, which need not match the
# row order of df, so align on the stored row id instead of on position.
# Rows without a record (or whose API call failed) get None.
entity_by_id = {row_id: entity for row_id, entity, _ in entities}
df['entities'] = [entity_by_id.get(row_id) for row_id in df.index]

In [None]:
# Preview the frame with the 'entities' column attached.
df.head()

In [None]:
# Persist the enriched frame as a gzip-compressed pickle, then read it straight
# back into df2 to check that the file round-trips.
df.to_pickle(df_file_name, compression="gzip")
df2 = pd.read_pickle(df_file_name)

In [None]:
# Spot-check the reloaded frame: name of the first entity stored for the row labelled 0.
# NOTE(review): the 'entities' value is None for rows whose API call failed, in
# which case this line raises AttributeError — confirm row 0 succeeded.
df2['entities'][0].entities[0].name