In [None]:
import os
import pandas as pd
import numpy as np
import datetime
import yaml
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import datetime
from datetime import datetime
from datetime import date

import nltk
from nltk import word_tokenize

# Google Cloud Language Translation API
# We're using the basic version here == "v2" 
from google.cloud import translate_v2

import timeit

In [None]:
# Tell the Google Cloud client which credentials JSON file to use.
# Walkthrough that was amazingly helpful for this setup: https://www.youtube.com/watch?v=YapTts_An9A
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': r'tt-translate-2023396507e3.json'})

# 1. Create function for performing the translations

In [None]:
def google_translate_messages(one_row, translate_client=None, target='en'):
    '''
    Translate the message held in one dataframe row with the Google Cloud
    Translation API (basic edition, "v2").

    Parameters
    ----------
    one_row : a df row (as handed over by ``df.apply(..., axis=1)``).
        The message is read from its 'content' entry.
    translate_client : optional object exposing
        ``translate(text, format_=..., target_language=...)``.
        When omitted, a ``translate_v2.Client`` is created on the first call
        and reused for every following row instead of being rebuilt per row.
    target : language code to translate into (default 'en').

    Returns
    -------
    list with exactly three items, always in this order:
        [translated text, detected source language, text that was sent].
    If the API call fails, all three items are the string 'translation_error'.
    '''
    one_message = one_row.loc['content']

    # Reuse a single client across rows; building one per message is wasted work.
    if translate_client is None:
        translate_client = getattr(google_translate_messages, '_shared_client', None)
        if translate_client is None:
            translate_client = translate_v2.Client()
            google_translate_messages._shared_client = translate_client

    try:
        output = translate_client.translate(one_message,
                                            format_='html',
                                            target_language=target)
        # Pick the fields by name: relying on the order (or number) of keys in
        # the response would silently scramble the columns built by the caller.
        return [output['translatedText'],
                output.get('detectedSourceLanguage'),
                output.get('input', one_message)]
    except Exception:
        # Best effort: one bad message must not abort a long translation run.
        # `Exception` (not a bare except) still lets Ctrl-C stop the loop.
        return ['translation_error'] * 3

# 2. Prepare dataset for translation

In [None]:
# Load every parent-school message (with demographics).
# The pickle is the output of Script 06_no_ra_status_osse_merge
msgs_pickle_path = '../data/analysis_data/messages_w_demographics_osse6_schools_pickle.pkl'
all_msgs = pd.read_pickle(msgs_pickle_path)
all_msgs.shape

## 2.1. Create id for deduplicated content (without removing personalization) 

In [None]:
# Give every distinct message text its own id ("id_1", "id_2", ...)
new_id_creation = all_msgs.loc[:, ['content']].drop_duplicates()
new_id_creation.shape

new_id_creation['id_content_deduped_no_personalization'] = [
    "id_{}".format(position) for position in range(1, len(new_id_creation) + 1)
]

# Attach the id to every message; rows with identical content share one id
all_msgs_new_id = all_msgs.merge(new_id_creation, how="left", on="content")

## 2.2. Cut down the number of characters we'll run through the translation function

We care about this because the Google Cloud pricing structure is the same for language detection and for translation itself. Since translation also returns the detected language, we only want to run each message through the API once.

In [None]:
# Number of characters in each text message.
# This matters because the Google Cloud pricing structure is based on N chars
all_msgs_new_id['content_len'] = all_msgs_new_id['content'].str.len()

In [None]:
# One row per distinct message: id, text and character count
dedup_cols = ['id_content_deduped_no_personalization', 'content', 'content_len']
deduped_msgs = all_msgs_new_id.loc[:, dedup_cols].drop_duplicates()

deduped_msgs.shape

# Total number of characters across the distinct messages.
# 16.5million characters might be too much, so we'll cut it down
deduped_msgs['content_len'].sum()

In [None]:
# Phrases that mark a message as English, used to cut down on the messages
# we send to the translator. Capitalisation variants are listed separately.
# Note: 'parent/teacher' appears twice.
common_phrases = [
    'no problem',
    'No problem',
    'Hello',
    'thank you',
    'Thank you',
    'Thank You',
    'Thanks',
    'thanks',
    "You're welcome",
    "You are very welcome",
    'English teacher',
    'attendance',
    'was absent from',
    'was not in class',
    'was late to',
    'I will let you know',
    'Please reply to this message',
    'Good Afternoon',
    'Good afternoon',
    'good afternoon',
    'Good Morning',
    'Good morning',
    'good morning',
    'Please provide',
    'Please respond',
    'please check out',
    'please contact me',
    'assignment',
    'detention',
    'suspended',
    'Good Evening',
    'Good evening',
    'Please make sure',
    'Please be sure',
    'Please send',
    'Please Join',
    "That's great",
    'hall sweep',
    'The message was',
    'Parent teacher',
    'parent teacher',
    'parent-teacher',
    'parent/teacher',
    'parent/teacher',
    'conferences',
    'Conferences',
    "That's awesome",
    'failing',
    'fail',
    'I am sending this message to inform ',
    'This message is from',
    'I want to say thank you for your support this first week of distance-learning',
    'did not turn in',
    'First, I would like to say we made it',
    'Early Release Day',
    'Uber',
    'Idgaf',
    'presentation',
    'Let us have a great week',
    'Wishing you',
    'Dear Parent',
]

In [None]:
# Join the common phrases into one regex alternation.
# re.escape keeps any regex metacharacter inside a phrase (e.g. '.', '?', '(')
# from being interpreted as pattern syntax if the list is extended later.
common_phrase_pattern = '|'.join(re.escape(phrase) for phrase in common_phrases)

# If a phrase is in content, give the message a '0', else leave blank ('').
# - The flag is written as the *string* '0' on purpose: later cells compare
#   against '0', and mixing the int 0 with '' would only become '0' through
#   numpy's implicit int->str coercion.
# - na=True: a missing message is flagged '0' as well (this is what happened
#   before, because NaN counts as truthy), so it is never sent to the translator.
deduped_msgs['non_english'] = np.where(
    deduped_msgs.content.str.contains(common_phrase_pattern, regex=True, na=True),
    '0',
    '')

deduped_msgs.shape

deduped_msgs.non_english.value_counts()

In [None]:
# There is one parent that uses Spanish primarily but includes "Thanks" in text, so false positive.
# Bring StudentID / broad_type back in so that parent's messages can be found,
# then blank their flag again so they do go through the translator.
sender_lookup = all_msgs_new_id[['id_content_deduped_no_personalization',
                                 'StudentID',
                                 'broad_type']]

deduped_msgs_parent_check = deduped_msgs.merge(sender_lookup,
                                               on = 'id_content_deduped_no_personalization',
                                               how = 'left')

sent_by_spanish_parent = (
    (deduped_msgs_parent_check['StudentID'] == 9209061)
    & (deduped_msgs_parent_check['broad_type'] == 'parent_sent')
)

deduped_msgs_parent_check['non_english'] = np.where(sent_by_spanish_parent,
                                                    '',
                                                    deduped_msgs_parent_check['non_english'])

deduped_msgs_parent_check.shape

In [None]:
# The merge above repeats each message once per matching row in all_msgs_new_id
# (wrong N rows), so keep only the message-level columns and de-duplicate again.
# NOTE(review): an id can still appear twice here if the same text ended up
# with both flags ('' and '0') -- worth asserting id uniqueness.
message_level_cols = ['id_content_deduped_no_personalization',
                      'content', 'content_len', 'non_english']
deduped_msgs_rm_enes = deduped_msgs_parent_check[message_level_cols].drop_duplicates()

deduped_msgs_rm_enes.shape

deduped_msgs_rm_enes['non_english'].value_counts()

needs_translation = deduped_msgs_rm_enes['non_english'] != '0'
print('N chars to run through translator:',
      deduped_msgs_rm_enes[needs_translation]['content_len'].sum())

# 3. Run the messages that did not have common phrases above through the translation function

In [None]:
# Subset df to the ones that are non_english. English == '0'
df_to_translate = deduped_msgs_rm_enes[deduped_msgs_rm_enes.non_english != '0'].copy()
df_to_translate.shape

# Split df into 20 dataframes, so that we can revisit if code breaks + internet crashes.
# We split the row *positions* and index with .iloc instead of handing the
# DataFrame itself to np.array_split: that path goes through
# DataFrame.swapaxes, which recent pandas versions deprecate.
# Chunk sizes are the same as before. Each chunk is an independent copy, so
# adding columns to it later does not raise SettingWithCopyWarning.
n_splits = 20
row_position_chunks = np.array_split(np.arange(len(df_to_translate)), n_splits)
split_df = [df_to_translate.iloc[positions].copy() for positions in row_position_chunks]

# N rows/columns per df.
for chunk_number, chunk in enumerate(split_df, start=1):
    print('df', chunk_number, ':',
          chunk.shape)

In [None]:
# Set up to run translation

# output path
path = '../data/gcloud_translation_results/'
filename = 'translated_msgs_'
ext = '.pkl'

# columns the three-item list returned by google_translate_messages is unpacked into
translation_cols = ['translatedText', 'detectedSourceLanguage', 'orig_content']

# make sure the output folder exists before the first (paid) chunk finishes
os.makedirs(path, exist_ok=True)

# For every dataframe in split_df,
# run through the translation, unpack the results,
# and save as pickles (files are numbered from 0, messages from 1)

for i in range(len(split_df)):
    chunk_file = path + filename + str(i) + ext

    # Resume support: a chunk that was already translated and saved is not
    # sent (and billed) again after a crash / restart.
    # Delete the pickle to force a chunk to be re-translated, e.g. after the
    # input data changed.
    if os.path.exists(chunk_file):
        print("found existing results for df ", i + 1, "- skipping")
        continue

    # work on a copy so the chunks in split_df stay untouched and the cell
    # can be re-run safely
    one_df = split_df[i].copy()

    start_translation_time = timeit.default_timer() #time start

    # run translation
    one_df['output_list'] = one_df.apply(google_translate_messages, axis = 1)

    stop_translation_time = timeit.default_timer() #time end

    time_lapse = stop_translation_time - start_translation_time
    print("took " + str(time_lapse) + " seconds to run")

    # unpack the translation results into their own columns
    one_df[translation_cols] = pd.DataFrame(one_df.output_list.to_list(),
                                            index = one_df.index,
                                            columns = translation_cols)

    one_df.to_pickle(chunk_file)

    print("wrote results for df ", i + 1)

# 4. Combine msgs back together

## 4.1. Read in the translated pickles and stitch back together

In [None]:
# Where the translated chunks were written (same naming as in the translation step)
path = '../data/gcloud_translation_results/'
filename = 'translated_msgs_'
ext = '.pkl'

# Read the first chunk (file number 0); the remaining chunks are read in the next cell
translated_msgs_init = pd.read_pickle(''.join([path, filename, '0', ext]))
print(translated_msgs_init.shape)

In [None]:
# Collect the remaining chunks (file numbers 1 .. len(split_df) - 1);
# chunk 0 was already read separately
pickles = []

for chunk_idx in range(1, len(split_df)):
    df = pd.read_pickle(''.join([path, filename, str(chunk_idx), ext]))
    print(df.shape)
    pickles.append(df)

In [None]:
# Stack chunk 0 and all the other chunks back into a single dataframe
translated_msgs = pd.concat([translated_msgs_init] + pickles)

# Sanity check: no rows gained or lost during translation
print('Does the shape of the new dataframe match the one pre-translation? ')
len(translated_msgs) == len(df_to_translate)

In [None]:
# How many messages were detected per language
translated_msgs['detectedSourceLanguage'].value_counts()

In [None]:
# The distinct values that ended up in the detected-language column
translated_msgs['detectedSourceLanguage'].unique()

In [None]:
# Explore a random handful of the messages detected as Spanish
detected_spanish = translated_msgs['detectedSourceLanguage'] == 'es'
translated_msgs.loc[detected_spanish].sample(n=10)

## 4.2. Add the translated text back to the non-translated deduped messages

In [None]:
# Grab the english ones in our original based on the common phrases
deduped_msgs_en = deduped_msgs_rm_enes[deduped_msgs_rm_enes.non_english=='0'].copy()

# Stack the untranslated English messages on top of the translated ones.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0; with ignore_index=True the result is the same.
# Columns that exist only for translated rows are NaN for the English rows.
deduped_msgs_w_translation = pd.concat([deduped_msgs_en, translated_msgs],
                                       ignore_index = True)

# In the old script (non-Google Cloud API), if we translated an English text with incorrect spelling, the
# translation would do spell-check/correct. Looks like this isn't the case here
deduped_msgs_w_translation[(deduped_msgs_w_translation.detectedSourceLanguage == 'en') &
                           (deduped_msgs_w_translation.orig_content != deduped_msgs_w_translation.translatedText)]

## 4.3. Clean up deduped messages

In [None]:
# Drop the helper columns that are no longer needed
unneeded_cols = ['content_len', 'output_list', 'orig_content']
deduped_msgs_w_translation = deduped_msgs_w_translation.drop(columns = unneeded_cols)

In [None]:
import html2text

def fix_html(one_row):
    '''
    Take in a row and turn the HTML in its translated message (tags and
    character entities) into plain text.

    Example input:  Hello, I am Ms. Johnson. Vicky&#39;s teacher
    Example output: Hello, I am Ms. Johnson. Vicky's teacher

    Parameters
    ----------
    one_row : a df row; the message is read from its 'translatedText' entry.

    Returns
    -------
    The converted text. The value is returned unchanged when it is not a
    string (e.g. NaN for messages that were never run through the translator)
    or when the conversion fails. Converted text ends with trailing newlines
    added by html2text, which the caller strips.
    '''

    message = one_row.loc['translatedText']

    # Nothing to convert for rows without a translated text (NaN)
    if not isinstance(message, str):
        return message

    html_conv = html2text.HTML2Text()
    # html2text hard-wraps its output at 78 characters by default, which would
    # insert line breaks in the middle of longer messages; 0 turns wrapping off.
    html_conv.body_width = 0

    try:
        return html_conv.handle(message)
    except Exception:
        # if errors, e.g. with phone numbers, just go with the original message
        return message

In [None]:
# Convert the HTML in every translated message to plain text
deduped_msgs_w_translation['translatedText_rm_html_init'] = \
    deduped_msgs_w_translation.apply(fix_html, axis = 1)

# fix_html leaves '\n\n' around the text, so strip the newlines off both ends
# (str.strip treats its argument as a set of characters to remove)
converted_text = deduped_msgs_w_translation['translatedText_rm_html_init']
deduped_msgs_w_translation['translatedText_rm_html'] = converted_text.str.strip('\n\n')

# check results on the rows that have a translation
deduped_msgs_w_translation[deduped_msgs_w_translation['translatedText'].notna()].head()

# the intermediate column has served its purpose
deduped_msgs_w_translation = deduped_msgs_w_translation.drop(columns = ['translatedText_rm_html_init'])

In [None]:
# flag for whether the message was translated or not
# - for the english ones that were run through the translator, we'll keep that as no.
# - messages whose API call failed carry the sentinel 'translation_error' in
#   both translatedText and detectedSourceLanguage. They count as NOT
#   translated, so their original content is kept instead of being replaced
#   by the literal text 'translation_error'.
has_translation_output = deduped_msgs_w_translation.translatedText.notna()
detected_english = deduped_msgs_w_translation.detectedSourceLanguage == 'en'
translation_failed = deduped_msgs_w_translation.detectedSourceLanguage == 'translation_error'

deduped_msgs_w_translation['translated'] = np.where(has_translation_output
                                                    & ~detected_english
                                                    & ~translation_failed,
                                                    1, 0
                                                    )

# new column with either the original content or the translated content
deduped_msgs_w_translation['content_w_translation'] = np.where(deduped_msgs_w_translation.translated == 0,
                                                               deduped_msgs_w_translation.content,
                                                               deduped_msgs_w_translation.translatedText_rm_html)

# 1 if the message was sent to the translator at all (i.e. it did not contain
# a common English phrase), 0 otherwise
deduped_msgs_w_translation['run_thru_translate'] = np.where(deduped_msgs_w_translation.non_english == '0',
                                                            0, 1)

In [None]:
# Spot-check a random sample of 10 messages flagged as translated
deduped_msgs_w_translation[deduped_msgs_w_translation.translated==1].sample(n=10) 

## 4.4. Add back to the original set of messages w demographics

In [None]:
# Row/column count of the full message set, for comparison with the shape after the merge
all_msgs_new_id.shape

In [None]:
# Peek at the first rows of the full message set
all_msgs_new_id.head()

In [None]:
# Bring the translation columns back onto every message via the content id.
# 'content' and 'non_english' are left out of the right-hand side
# ('content' is already present in all_msgs_new_id).
translation_lookup = deduped_msgs_w_translation.drop(columns = ['content', 'non_english'])

all_msgs_wtranslation = all_msgs_new_id.merge(translation_lookup,
                                              on = 'id_content_deduped_no_personalization',
                                              how = 'left')

all_msgs_wtranslation.shape

In [None]:
# Peek at the first rows of the merged result
all_msgs_wtranslation.head()

In [None]:
# Save the messages with demographics + translations.
# Earlier export, kept for reference: '../data/analysis_data/msgs_wdem_wtrans_1124.pkl'
output_pickle_path = '../data/analysis_data/msgs_wdem_wtrans_1203.pkl'
all_msgs_wtranslation.to_pickle(output_pickle_path)