In [3]:
!pip3 install --upgrade google-cloud-language

Collecting google-cloud-language
  Downloading google_cloud_language-2.8.1-py2.py3-none-any.whl (88 kB)
     ---------------------------------------- 88.4/88.4 kB 1.0 MB/s eta 0:00:00
Installing collected packages: google-cloud-language
Successfully installed google-cloud-language-2.8.1


In [9]:
import os
import numpy as np
import pandas as pd
from google.cloud import language_v1
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Google Natural Language API

In [None]:
# Path to the service-account key file used to build the Natural Language API clients.
# Taken from the GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set, so the
# key location does not have to be edited into the notebook; otherwise the original local
# file name is used.
NL_SERVICE_ACCOUNT_JSON = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', 'celestial-digit-0000000000.json')

In [6]:
# Directory with the Olist dataset files, relative to this notebook.
PATH = '../../data/olist'

# Read the order-review table, report its dimensions and preview the first two rows.
df = pd.read_parquet(
    os.path.join(PATH, 'olist_order_reviews_dataset.parquet')
)
print(df.shape)
df.head(2)

(99224, 7)


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13


In [10]:
# Build the text that is sent to the API: "<title>. <message>".
# A missing title contributes "" (no ". " separator) and a missing message contributes "",
# so a review with neither ends up as the empty string.
# Whole-column operations are used instead of a row-by-row apply; the missing values are
# masked after the string conversion so they never leak in as "nan"/"None".
df['C'] = (
    (df['review_comment_title'].astype(str) + '. ').where(df['review_comment_title'].notna(), '')
    + df['review_comment_message'].astype(str).where(df['review_comment_message'].notna(), '')
)

  0%|          | 0/99224 [00:00<?, ?it/s]

In [11]:
# NOTE(review): 5000 looks like the free monthly allowance and 2.5 like the combined
# USD price per 1000 reviews for the three API calls made per review — confirm both
# against the current Natural Language API pricing.
FREE_REVIEWS = 5000
USD_PER_1000_REVIEWS = 2.5

# Only reviews with a non-empty text are counted as going to the API.
count_reviews = (df['C'] != "").sum()
print('Count reviews to send to API: ', count_reviews)
# Clamp at zero so a dataset smaller than the free allowance is forecast as 0, not as a
# negative cost.
print('Forecast cost of processing, usd: ', max(count_reviews - FREE_REVIEWS, 0) / 1000 * USD_PER_1000_REVIEWS)

Count reviews to send to API:  42706
Forecast cost of processing, usd:  94.26500000000001


In [None]:
def analyze_text_sentiment(text, client=None):
    """Get the document-level sentiment of ``text`` from the Natural Language API.

    Args:
        text: Plain-text content to analyze.
        client: Optional ready-made ``LanguageServiceClient``. When omitted, a client
            is built once from ``NL_SERVICE_ACCOUNT_JSON`` and reused by later calls,
            instead of being re-created for every single text.

    Returns:
        ``(score, magnitude)`` taken from the response's ``document_sentiment``, or
        ``('No', 'No')`` when creating the client or calling the API raised.
    """
    try:
        if client is None:
            # Cache the client on the function object: re-running the defining cell
            # creates a new function and therefore a fresh client.
            client = getattr(analyze_text_sentiment, '_client', None)
            if client is None:
                client = language_v1.LanguageServiceClient.from_service_account_json(NL_SERVICE_ACCOUNT_JSON)
                analyze_text_sentiment._client = client
        document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
        response = client.analyze_sentiment(document=document)
        sentiment = response.document_sentiment
        return sentiment.score, sentiment.magnitude
    except Exception as exc:
        # Best-effort by design: one failing review must not stop the whole run, but the
        # cause is reported so systematic failures (bad key, quota) are visible.
        print('Error in def sentiment:', repr(exc))
        return 'No', 'No'

In [None]:
def analyze_text_entities(text, client=None):
    """Extract the entities the Natural Language API finds in ``text``.

    Args:
        text: Plain-text content to analyze.
        client: Optional ready-made ``LanguageServiceClient``. When omitted, a client
            is built once from ``NL_SERVICE_ACCOUNT_JSON`` and reused by later calls,
            instead of being re-created for every single text.

    Returns:
        A list with one dict per entity (keys ``name``, ``type``, ``salience``,
        ``wikipedia_url``, ``mid``; the last two default to ``"-"`` when absent from the
        entity metadata), or the string ``'No'`` when creating the client or calling
        the API raised.
    """
    try:
        if client is None:
            # Cache the client on the function object: re-running the defining cell
            # creates a new function and therefore a fresh client.
            client = getattr(analyze_text_entities, '_client', None)
            if client is None:
                client = language_v1.LanguageServiceClient.from_service_account_json(NL_SERVICE_ACCOUNT_JSON)
                analyze_text_entities._client = client
        document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
        response = client.analyze_entities(document=document)
        return [
            dict(
                name=entity.name,
                type=entity.type_.name,
                salience=entity.salience,
                wikipedia_url=entity.metadata.get("wikipedia_url", "-"),
                mid=entity.metadata.get("mid", "-"),
            )
            for entity in response.entities
        ]
    except Exception as exc:
        # Best-effort by design: one failing review must not stop the whole run, but the
        # cause is reported so systematic failures (bad key, quota) are visible.
        print('Error in def entities:', repr(exc))
        return 'No'

In [None]:
def analyze_text_syntax(text, client=None):
    """Run the Natural Language API syntax analysis on ``text``.

    Args:
        text: Plain-text content to analyze.
        client: Optional ready-made ``LanguageServiceClient``. When omitted, a client
            is built once from ``NL_SERVICE_ACCOUNT_JSON`` and reused by later calls,
            instead of being re-created for every single text.

    Returns:
        ``(sent_count, token_count, sentlist, tokenlist)`` where ``sentlist`` holds the
        text of every sentence and ``tokenlist`` one dict per token (text, dependency
        label and head index, part-of-speech tag/gender/number/proper, lemma); or
        ``('No', 'No', 'No', 'No')`` when creating the client or calling the API raised.
    """
    try:
        if client is None:
            # Cache the client on the function object: re-running the defining cell
            # creates a new function and therefore a fresh client.
            client = getattr(analyze_text_syntax, '_client', None)
            if client is None:
                client = language_v1.LanguageServiceClient.from_service_account_json(NL_SERVICE_ACCOUNT_JSON)
                analyze_text_syntax._client = client
        document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)

        response = client.analyze_syntax(document=document)

        sent_count = len(response.sentences)
        token_count = len(response.tokens)
        sentlist = [sentence.text.content for sentence in response.sentences]
        tokenlist = [
            dict(
                token_text=token.text.content,
                token_label=token.dependency_edge.label.name,
                token_head_index=token.dependency_edge.head_token_index,
                token_tag=token.part_of_speech.tag.name,
                token_gender=token.part_of_speech.gender.name,
                token_number=token.part_of_speech.number.name,
                token_proper=token.part_of_speech.proper.name,
                token_lemma=token.lemma,
            )
            for token in response.tokens
        ]
        return sent_count, token_count, sentlist, tokenlist
    except Exception as exc:
        # Best-effort by design: one failing review must not stop the whole run, but the
        # cause is reported so systematic failures (bad key, quota) are visible.
        print('Error in def syntax:', repr(exc))
        return 'No', 'No', 'No', 'No'

In [None]:
def get_text_for_nlp(message):
    """Run the sentiment, entity and syntax analyses on one review text.

    Args:
        message: Review text. Anything that is not a string (e.g. ``None`` or a NaN
            float) is treated like an empty text instead of raising on ``len()``.

    Returns:
        ``0`` when ``message`` is not a string or is shorter than two characters (no
        API call is made); otherwise the 7-tuple ``(sent_score, sent_magnitude,
        entities_list, sent_count, token_count, sentlist, tokenlist)`` assembled from
        the three analysis functions.
    """
    if not isinstance(message, str) or len(message) < 2:
        return 0

    sent_score, sent_magnitude = analyze_text_sentiment(message)  # two values
    entities_list = analyze_text_entities(message)  # list of dicts
    sent_count, token_count, sentlist, tokenlist = analyze_text_syntax(message)
    return sent_score, sent_magnitude, entities_list, sent_count, token_count, sentlist, tokenlist

In [None]:
# The dataset is sent for processing in small chunks and every chunk is saved to its own
# file, so finished work is already on disk if a later chunk fails.
CHUNK_SIZE = 1000

# Object dtype from the start: the column stores tuples (or the 0 sentinel), not floats.
df['tmp'] = pd.Series(np.nan, index=df.index, dtype=object)
tmp_col = df.columns.get_loc('tmp')
n_rows = len(df)

# Step over the actual row count so the final, shorter chunk is processed as well
# (a fixed range(1, 100) stops at row 99000 and leaves the remaining rows untouched).
for i, start in enumerate(tqdm(range(0, n_rows, CHUNK_SIZE)), start=1):
    stop = min(start + CHUNK_SIZE, n_rows)
    print('from ', start, ' to ', stop)
    chunk_ids = df['review_id'].iloc[start:stop]
    # First and last review id of the chunk, plus the chunk size.
    print(chunk_ids.iloc[0], '---', chunk_ids.iloc[-1], '---', chunk_ids.shape)
    # Positional write on the frame itself: the chained form df['tmp'][a:b] = ... may
    # assign into a temporary copy and leave df unchanged.
    df.iloc[start:stop, tmp_col] = df['C'].iloc[start:stop].progress_apply(get_text_for_nlp).to_numpy()
    # df5 is kept up to date on every pass; it is what gets written out as the combined file.
    df5 = df[['review_id', 'order_id', 'tmp']]
    filename1 = 'tmp' + str(i) + '.csv'
    df5.iloc[start:stop].to_csv(os.path.join(PATH, filename1), sep=';', header=True, index=False)

In [None]:
# Write the ids together with the raw API results of all reviews into one combined CSV.
filename1 = 'tmp_all.csv'
df5.to_csv(
    os.path.join(PATH, filename1),
    sep=';',
    header=True,
    index=False,
)

In [None]:
# Split the 7-tuples stored in df['tmp'] into one column per value.
# Rows that hold no result tuple — skipped texts (the 0 sentinel) or rows never processed
# (NaN) — get NaN in every result column. Expanding the 0 sentinel with pd.Series would
# instead put 0 into sent_score, where it cannot be told apart from a real neutral score.
result_columns = ['sent_score', 'sent_magnitude', 'entities_list', 'sentences_count',
                  'token_count', 'sentlist', 'tokenlist']
empty_result = (np.nan,) * len(result_columns)
# One DataFrame built from a list of rows; avoids creating a Series object per review.
df3 = pd.DataFrame(
    [
        value if isinstance(value, tuple) and len(value) == len(result_columns) else empty_result
        for value in df['tmp']
    ],
    index=df.index,
    columns=result_columns,
)
df4 = pd.concat([df, df3], ignore_index=False, axis=1)