In [None]:
from decimal import Decimal
import pickle
from google.cloud import language
import pandas as pd

### USAGE NOTES
* Ensure that `comments.csv`, `read_csv`, and `to_csv` all use UTF-8.
* Set the constant `LOAD_FROM_PICKLE` (use `False` when, e.g., what the functions return has changed, so that stale memoized results are not reused).

In [None]:
# When True, previously memoized results are read back from 'memo.pickle'
LOAD_FROM_PICKLE = False

# Categorical questions share the comments table; they are skipped, not scored
IGNORE_LIST = [
    'GCcampus Tools Used',
    'OL Available',
    'Prep',
    'Technical Issues',
]

In [None]:
# Read the raw comments extract into a DataFrame
df = pd.read_csv(
    'data/comments.csv',
    encoding='utf-8',        # UTF-8 so French text survives the round trip
    keep_default_na=False,   # Keep empty strings as '' instead of NaN
    header=0,                # First row holds the column names
    sep=',',
)

In [None]:
# df.head()

In [None]:
# Repair table for French characters whose UTF-8 bytes were decoded as
# Windows-1252 ("ANSI"): each accented letter then shows up as 'Ã' followed by
# a second character. Keys are the corrupted pairs, values the intended letter.
#
# Two of the second characters are invisible in most editors, so they are
# written as escapes to keep them from being lost or mistaken for something
# else. If either were stripped, the key would collapse to a bare 'Ã' and the
# regex-based replace below would clobber the first half of every other pair.
char_dict = {
    'Ã§': 'ç',
    'Ã‡': 'Ç',
    'Ã©': 'é',
    'Ã‰': 'É',
    'Ã\u00a0': 'à', # Second char is a no-break space (U+00A0), not a plain space
    'Ã¨': 'è',
    'Ã¬': 'ì',
    'Ã²': 'ò',
    'Ã¹': 'ù',
    'Ã€': 'À',
    'Ãˆ': 'È',
    'ÃŒ': 'Ì',
    'Ã’': 'Ò',
    'Ã™': 'Ù',
    'Ã¢': 'â',
    'Ãª': 'ê',
    'Ã®': 'î',
    'Ã´': 'ô',
    'Ã»': 'û',
    'Ã‚': 'Â',
    'ÃŠ': 'Ê',
    'ÃŽ': 'Î',
    'Ã”': 'Ô',
    'Ã›': 'Û',
    'Ã«': 'ë',
    'Ã¯': 'ï',
    'Ã¼': 'ü',
    'Ã‹': 'Ë',
    'Ã\u008f': 'Ï', # Second char is the non-displaying control char U+008F
    'Ãœ': 'Ü'
}
# Shouldn't be required for extracts from Cognos
# df.replace(to_replace=char_dict, value=None, inplace=True, regex=True)

In [None]:
# df.head()

In [None]:
# Set up the memo of already-scored comments: start empty, or restore the
# dict saved by a previous run.
# NOTE: pickle.load can execute arbitrary code; only load a memo.pickle that
# this notebook itself produced.
if not LOAD_FROM_PICKLE:
    memo_dict = {}
else:
    with open('memo.pickle', 'rb') as memo_file:
        memo_dict = pickle.load(memo_file)

In [None]:
# Instantiate the language service client once at module level;
# analyze_text() reuses this single instance for every API call.
client = language.LanguageServiceClient()

# Counter to track progress with large datasets; analyze_text() increments it
# on every call (via `global ctr`) and prints a message every 1000 calls.
ctr = 0

def analyze_text(survey_id, short_question, my_text, overall_satisfaction):
    """Score one comment via the API and return a (stars, magnitude) tuple.

    Results are memoized in the module-level 'memo_dict' under the composite
    key '<survey_id>.<short_question>', so a comment that was already scored
    is not sent to the API again.

    Args:
        survey_id: Identifier of the survey response; first half of the key.
        short_question: Question label; second half of the key. Questions
            listed in IGNORE_LIST are not scored.
        my_text: Comment text passed to the API as plain text.
        overall_satisfaction: Fallback score used when the API call fails.

    Returns:
        A 2-tuple (stars, magnitude):
        * ('\\N', '\\N') for questions in IGNORE_LIST (NULL for MySQL);
        * (int, magnitude) when the API call succeeds, where the int is the
          sentiment score rescaled as score * 2 + 3 and rounded;
        * (float(overall_satisfaction), '\\N') when the API call fails, or
          ('\\N', '\\N') if overall_satisfaction is not numeric either.
    """
    # Print ctr for every 1000 comments
    global ctr
    ctr += 1
    if ctr % 1000 == 0:
        print('Finished {0} comments!'.format(ctr))

    # Use composite key of survey_id.short_question
    pkey = '{0}.{1}'.format(survey_id, short_question)

    if short_question in IGNORE_LIST:
        # No need to memoize as no expensive computation performed
        return ('\\N', '\\N') # i.e. NULL for MySQL

    # If already processed, return memoized result to save compute
    if pkey in memo_dict:
        return memo_dict[pkey]

    # Otherwise, pass to API
    try:
        document = language.types.Document(content=my_text,
                                           type=language.enums.Document.Type.PLAIN_TEXT)
        sentiment = client.analyze_sentiment(document=document).document_sentiment
        # Adjust interval from [-1, 1] to [1, 5]
        # Cast to Decimal then back to int to prevent floating point rounding errors
        sentiment_score = int(round(Decimal(str((sentiment.score * 2) + 3))))
        magnitude = sentiment.magnitude
        result = (sentiment_score, magnitude)
    # Comments occasionally so badly written the API can't identify the language
    except Exception as e:
        print('Error {0} occurred on sample {1}'.format(e, ctr))
        # If can't process, use overall_satisfaction from elsewhere in survey.
        # That value may itself be unusable (e.g. an empty string); converting
        # it unguarded would raise from inside this handler and abort the
        # whole run, so fall back to NULL instead.
        try:
            fallback_score = float(overall_satisfaction)
        except (TypeError, ValueError):
            fallback_score = '\\N'
        result = (fallback_score, '\\N')
    # Memoize and return result
    memo_dict[pkey] = result
    return result

In [None]:
# df.head()

In [None]:
%%time
api_results = df.apply(
    lambda row: analyze_text(
        row['survey_id'],
        row['short_question'],
        row['text_answer'],
        row['overall_satisfaction'],
    ),
    axis=1,                # Call once per row
    raw=False,             # Hand each row over as a Series, not a bare ndarray
    result_type='expand',  # Spread each returned tuple across DataFrame columns
)

In [None]:
# Column 0 of the expanded results is the star score, column 1 the magnitude
df['stars'], df['magnitude'] = api_results[0], api_results[1]

In [None]:
# df.head()

In [None]:
# Persist memo_dict to disk so a later run can reload it
with open('memo.pickle', 'wb') as memo_file:
    pickle.dump(memo_dict, memo_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Write the scored comments to a local CSV (writing straight to USB is slow)
df.to_csv('data/comments_ML.csv', encoding='utf-8', index=False, sep=',')