In [None]:
import pickle
from google.cloud import language
import pandas as pd

### USAGE NOTES
* Ensure comments.csv, read_csv, analyze_text, and to_csv all use UTF-8
* Delete memo.pickle if you modify the values returned by the function analyze_text

In [None]:
# Read the comments dataset and rename the original rating column in one chain
df = (
    pd.read_csv(
        'data/comments.csv',
        sep=',',
        header=0,
        encoding='utf-8',       # UTF-8 for FR
        keep_default_na=False,  # keep empty strings as '' instead of NaN
        nrows=5,                # only the first 5 rows are read
    )
    # Keep the user-supplied 'stars' as 'user_stars' so it stays distinct
    # from the new 'stars' column assigned from the API results
    .rename(columns={'stars': 'user_stars'})
)

In [None]:
# Preview the first rows of the loaded dataset
df.head()

In [None]:
# Load the memo dict persisted by a previous run so already-analysed
# survey_ids are not sent to the API again; start with an empty dict when
# no memo.pickle exists yet (first run, or after deleting it).
# NOTE: pickle.load can execute arbitrary code -- only load a memo.pickle
# that this notebook wrote itself.
try:
    with open('memo.pickle', 'rb') as f:
        memo_dict = pickle.load(f)
except FileNotFoundError:
    memo_dict = {}

In [None]:
# Instantiate client
# Created once at module level; analyze_text reuses it for every API call.
client = language.LanguageServiceClient()

def analyze_text(my_text, survey_id):
    """Return the (score, magnitude) sentiment tuple for one sample.

    Results are cached in the module-level 'memo_dict' under 'survey_id',
    so the API is only called for ids that have not been seen before.
    """
    # Cache hit: reuse the stored tuple without touching the API
    if survey_id in memo_dict:
        return memo_dict[survey_id]

    # Cache miss: wrap the text as a plain-text document and score it
    plain_text_doc = language.types.Document(
        content=my_text,
        type=language.enums.Document.Type.PLAIN_TEXT,
    )
    doc_sentiment = client.analyze_sentiment(document=plain_text_doc).document_sentiment

    # The raw score is stored as-is; rescaling from [-1, 1] to [1, 5]
    # via (score * 2) + 3 is currently disabled
    memo_dict[survey_id] = (doc_sentiment.score, doc_sentiment.magnitude)
    return memo_dict[survey_id]

In [None]:
# Score every row; analyze_text returns a 2-tuple per row
api_results = df.apply(
    lambda row: analyze_text(row['text_answer'], row['survey_id']),
    axis=1,                # apply the function to each row
    raw=False,             # rows arrive as Series, so columns can be read by label
    result_type='expand',  # spread each tuple into columns -> DataFrame
)

In [None]:
# Attach the API results: column 0 holds the sentiment score, column 1 the magnitude
df = df.assign(stars=api_results[0], magnitude=api_results[1])

In [None]:
# Preview the rows again, now including the 'stars' and 'magnitude' columns
df.head()

In [None]:
# Persist the memo dict to disk so the cached API results can be reused later
with open('memo.pickle', mode='wb') as memo_file:
    pickle.dump(memo_dict, memo_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Write the augmented frame to CSV (UTF-8, comma-separated, no index column)
df.to_csv(
    'data/comments_ML.csv',
    sep=',',
    index=False,
    encoding='utf-8',
)

In [None]:
# TODO: fix the #VALUE! and #NAME! errors that appear in Excel
# TODO: decide how to round to the nearest whole number
# TODO: ensure UTF-8 is passed to the API
# TODO: ENSURE THE PROJECT IS SHUT DOWN at the end, so it is not still billing

In [None]:
# Sample request body kept for reference (note the explicit "UTF8"
# encodingType). Written as a Python dict -- True instead of JSON's true --
# so the cell evaluates instead of raising NameError on a full run.
sample_request = {
    "document": {
        "type": "PLAIN_TEXT",
        "content": "'Lawrence of Arabia' is a highly rated film.",
    },
    "features": {
        "extractDocumentSentiment": True,
    },
    "encodingType": "UTF8",
}
sample_request