# <center> LASER EMBEDDINGS WITH API v1.2</center>
---

###### <center>+ LangDetect and LangId for sentence-level language detection and NLTK and Stanza for language-specific sentence tokenization.</center>
---
<center><b>Created by:</b></center></br>
<center>Kevan White, Sr. Data Scientist (thyripian)</center></br>
<center>Release Date: 22 JUN 2023</center></br>

---
#### <center>Notes:</center>
This python 'module' relies on the LASER_embeddings repository and associated Docker Image to run. Failure to clone the repository and build the Docker Image will result in failed attempts to process the data.</br>
If you have cloned the repository, but are unaware of how to build the Docker Image:</br>
- Install Docker Desktop (will require a system restart or log-out)
- Using a Bash terminal (such as GitBASH), navigate to the directory of the cloned repository containing the .dockerfile
- Run the command:   docker build -t laser_embeddings -f LASER_embeddings.dockerfile .   (Docker image names must be lowercase)
- Once the Image is built, you may need to restart the computer and Docker Desktop.
- Reopen the Bash terminal and navigate back to the same directory.
- Run the command:   docker run -it -p 8080:80 --gpus 0 laser_embeddings
- Once this last command is run, you can start and stop the Docker Image in the Docker Desktop GUI for all future use.

---
### <center> Imports and Setup</center>
---

In [None]:
import pandas as pd
import pickle
import re
import time
import unicodedata
from tqdm import tqdm
import stanza
import nltk
import requests
import json
import numpy as np 
from IPython.display import Audio
from collections import Counter
from langdetect import detect
from langid import classify
import torch
import math

In [None]:
# Report GPU availability. The device queries further down raise when no
# CUDA device is present, so they are only issued after the availability check.
cuda_available = torch.cuda.is_available()
print(cuda_available)

if cuda_available:
    # Get the current GPU device
    current_gpu = torch.cuda.current_device()
    print(current_gpu)

    # Get the name of the current GPU
    print(torch.cuda.get_device_name(current_gpu))

    # Get the GPU memory usage
    print(torch.cuda.memory_allocated())
else:
    print('CUDA is not available; skipping GPU device queries.')

# API call to use LASER3 with Ed's API.
def LASER_api_call(payload, url='http://localhost:8080/vectorize', timeout=None):
    """Send one payload to the LASER vectorize endpoint and return the parsed reply.

    Args:
        payload: JSON-serialisable object sent as the request body.
        url: Endpoint to query; defaults to the local Docker container.
        timeout: Optional timeout in seconds handed to ``requests``; ``None``
            (the default) waits indefinitely, as before.

    Returns:
        The decoded JSON response on HTTP 200. On any failure a string that
        starts with ``'Error'`` is returned instead of raising, so one bad
        item does not abort a long batch run.
    """
    headers = {
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,br',
        'Connection': 'keep-alive',
    }

    # The payload travels as a JSON body on a GET request.
    try:
        response = requests.get(url, headers=headers, data=json.dumps(payload), timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Connection refused, timeout, etc. -- report it like any other error.
        return f'Error: request failed ({exc})'

    # Anything but a good HTTP return is reported by status code.
    if response.status_code != 200:
        return f'Error: {response.status_code}'

    try:
        return json.loads(response.content)
    except ValueError as exc:
        # Covers json.JSONDecodeError and undecodable bytes.
        return f'Error: invalid JSON in response ({exc})'
        

In [None]:
# Load the source dataframe to process (adjust the path to the local data directory)
source_csv_path = 'D:\\data\\4025_METIS_embeddings\\source_data\\gdelt_20230616_bn_tl_id_20.csv'
monday_df = pd.read_csv(source_csv_path)

# Insert \n after every sentence (may or may not be redundant at this point in time)
def insert_newline(text):
    """Return *text* with a newline placed after each '. ' sentence break.

    The period is kept: only the space that follows it is replaced by the
    newline, so the sentence terminator is not lost.
    """
    return text.replace('. ', '.\n')

# Put every sentence of each article body on its own line
monday_df.loc[:, 'norm_body'] = monday_df['norm_body'].apply(insert_newline)

---
### <center>Auto-Detection of Languages to Download</center>
###### <center>Based on the language IDs listed in the dataset.</center>
---

In [None]:
# Distinct values of the language column, in order of first appearance,
# with missing entries (NaN / None) left out
lang_codes = [
    code
    for code in monday_df['meta_body_language'].unique().tolist()
    if not pd.isna(code)
]

In [None]:
# Check output: display the list of language codes collected from the dataset
lang_codes

---
### <center> Model Loading and Pipeline Generation</center>
---

In [None]:
# Download the Stanza models for each language code found in the dataset

for lang in lang_codes:
    try:
        stanza.download(lang)
    except Exception as exc:
        # Best effort: keep going with the remaining languages, but say which
        # one failed and why. `Exception` (rather than a bare except) leaves
        # Ctrl-C able to interrupt the downloads.
        print(f'Language not found in Stanza holdings: {lang} ({exc})')

In [None]:
# Build one Stanza pipeline per language code, keyed by that code

stanza_pipelines={}
for lang in lang_codes:
    try:
        # Only sentence boundaries are read from these pipelines
        # (`doc.sentences`), so load just the tokenizer, which is the Stanza
        # processor that performs sentence segmentation.
        stanza_pipelines[lang] = stanza.Pipeline(lang, processors='tokenize', use_gpu=True)
    except Exception as exc:
        # Best effort: languages without a usable model simply get no entry.
        print(f'Unable to generate pipeline for {lang} ({exc})')

In [None]:
# Uncomment and run if not already downloaded
#nltk.download('punkt')

# Set pipeline for english articles.
# The name is bound to the `nltk.tokenize` module itself; the processing
# loop calls `en_nlp_pipe.sent_tokenize(text)` on it.
en_nlp_pipe = nltk.tokenize

In [None]:
# spaCy is used in this cell but is not part of the notebook's import cell,
# so it is imported here to keep the cell runnable.
import spacy

# Load the multi-language model in SpaCy and add rule-based sentence splitting
nlp = spacy.load("xx_ent_wiki_sm")
nlp.add_pipe('sentencizer')

# Load the English model in SpaCy and add the same sentence splitter
nlp_en = spacy.load("en_core_web_sm")
nlp_en.add_pipe('sentencizer')

---
### <center>Data Cleaning</center>
---


In [None]:
# Define precleaning functions

def remove_patterns_and_text(text):
    """Strip known boilerplate (site menus, datelines, credits) from *text*.

    The regex patterns are removed first, then the exact strings; the result
    is returned with surrounding whitespace trimmed.
    """
    # List of exact text to remove
    texts_to_remove = [
        '- home- news- ecns wire- business- travel- photo- video- voices',
        '- homeopinionpd voicepoliticsforeign affairsbusinessworldwe are chinasocietyculturesci-techvideophotosportstravelmilitarylifeexclusivespecialslanguages- chinese- japanese- french- spanish- russian- arabic- korean- german- portuguese- kiswahili- italian- kazakh- thai- malay- greekarchive',
        '- portada- china- economÃ­a- mundo- iberoamÃ©rica- opiniÃ³n- ciencia-tecnologÃ­a- deportes- cultura- sociedad- viaje- fotosidiomas- chino- inglÃ©s- francÃ©s- ruso- espaÃ±ol- japonÃ©s- coreano- Ã¡rabe- alemÃ¡n- portuguÃ©s- italiano- kazajo- suajili- tailandÃ©s- malayo- griego- mÃ¡sdeportesfoto',
        'globalink |',
        # The comma after this entry matters: without it Python joins this
        # string with the next one and neither is ever removed on its own.
        'illustration: ',
        '- HomeOpinionPD VoicePoliticsForeign AffairsBusinessWorldWe Are ChinaSocietyCultureSci-TechVideoPhotoSportsTravelMilitaryLifeExclusiveSpecialsLanguages\n- Chinese\n- Japanese\n- French\n- Spanish\n- Russian\n- Arabic\n- Korean\n- German\n- Portuguese\n- Kiswahili\n- Italian\n- Kazakh\n- Thai\n- Malay\n- Greek\nArchive',
        # NOTE(review): the backslash continuations below put each line's
        # leading indentation inside the string; confirm this is the text
        # that actually occurs in the data.
        '- Home \
        - News\
        - Ecns Wire\
        - Business\
        - Travel\
        - Photo\
        - Video\
        - Voices',
    ]

    # List of regex patterns to remove (raw strings, so the backslashes reach
    # the regex engine unchanged)
    patterns = [
        r'Xinhua \| Updated: \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
        r'Source: Xinhua\nEditor: huaxia\n\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
        r'\b\w+,\s\w+\s\d+\s\(\w+\)\s--',
        r'.*china.org.cn\s\|\s.*à jour le\s\d{2}-\d{2}-\d{4}',
        r'.*\|\supdated:\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}',
        r'source:\sxinhuaeditor:\shuaxia\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}',
        r'^\w+,\s\w+\s\d+,\s\d+',
        r'^feature:\s',
        r"^Feature:\s",
        r"^Editor's Note:\s",
        r'Autor:\s',
        r'\(ecns\) --',
        r'\(foto: vcg\)',
        r'\(Foto: VCG\)',
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text)
    for removal_text in texts_to_remove:
        text = text.replace(removal_text, '')
    return text.strip()
    
def remove_before_dashes(text):
    """Drop everything up to and including the first '--' (or '---') in *text*.

    Only the first dash run is treated as the separator; the remainder is
    kept in full even when it contains further '--'. Text without any '--'
    is returned unchanged apart from trimming.
    """
    text = text.replace('---', '--')  # Treat '---' the same as '--'
    _, separator, remainder = text.partition('--')
    if separator:
        return remainder.strip()  # Everything after the first '--'
    return text.strip()  # No '--' present
    
def remove_before_datetime(text):
    """Return the text following the first 'YYYY-MM-DD HH:MM:SS' stamp, trimmed.

    When no such stamp occurs, the whole text is returned trimmed.
    """
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', text)
    remainder = text if stamp is None else text[stamp.end():]
    return remainder.strip()
    
def remove_initial_caps_words(text):
    """Remove a leading run of 3+ capital letters (plus one optional ':', '|' or ',')."""
    leading_caps = re.compile(r'^[A-Z]{3,}[:|,]?')
    without_prefix = leading_caps.sub('', text)
    return without_prefix.strip()

def remove_by_author(text):
    """Remove a leading 'By <author> -' byline from *text* and trim the result.

    The word boundary after 'By'/'by' keeps words that merely start with
    those letters (e.g. 'Bystanders') from being mistaken for a byline.
    """
    return re.sub(r'^(?:By|by)\b[^-]*-', '', text).strip()

def remove_figure_captions(text):
    """Delete every 'N/M(...)' gallery caption from *text* and trim the result."""
    caption = re.compile(r'\d+/\d+\(.*?\)')
    cleaned = caption.sub('', text)
    return cleaned.strip()

def remove_before_keywords(text):
    """Cut everything on a line up to and including the site-menu tail marker."""
    menu_tail = r'.*(- MÃ¡sDeportesFotos|-másdeportesfotos)'
    trimmed = re.sub(menu_tail, '', text)
    return trimmed.strip()

In [None]:
# Run every pre-cleaning function over the article bodies, in this order
cleaning_steps = (
    remove_patterns_and_text,
    remove_before_dashes,
    remove_before_datetime,
    remove_initial_caps_words,
    remove_by_author,
    remove_figure_captions,
    remove_before_keywords,
)
cleaned_body = monday_df['norm_body']
for cleaning_step in cleaning_steps:
    cleaned_body = cleaned_body.apply(cleaning_step)
monday_df['norm_body'] = cleaned_body

In [None]:
# Add a space after every period. `regex=False` makes the '.' a literal
# period regardless of the installed pandas version's default for `regex`.
monday_df['norm_body'] = monday_df['norm_body'].str.replace('.', '. ', regex=False)

In [None]:
# Reduce dataframe to only necessary columns.
# An explicit copy is taken because later cells assign into this frame
# through `.loc`; this makes it an independent object rather than a slice
# of `monday_df`.
selected_columns = ['uid','meta_body_language','norm_body']
stripped_monday_df = monday_df[selected_columns].copy()

# <center>*******************************************************************************************</center>
# <center>DATA PROCESSING</center>
# <center>*******************************************************************************************</center>
# Processing Option # 1

---
### <center>Sentence Tokenization, Sans Embeddings</center>
---


In [None]:
# Work on a copy of the cleaned dataframe
df = monday_df.copy()

# Number of articles, used to size the progress bar
max_iter = len(df)

with tqdm(total=max_iter, ncols=80) as pbar:

    # One record per sentence, tagged with the uid of the article it came from
    new_rows = []

    for _, article in df.iterrows():
        # English articles go through the English spaCy model, everything
        # else through the multi-language one
        tokenizer = nlp_en if article['meta_body_language'] == 'en' else nlp
        doc = tokenizer(article['norm_body'])

        # Split the article into sentences and record each with its uid
        new_rows.extend(
            {'uid': article['uid'], 'sentence': sent.text} for sent in doc.sents
        )

        pbar.update(1)

# Sentence-level dataframe built from the collected records
sentence_df = pd.DataFrame(new_rows)

# <center>*******************************************************************************************</center>
# Processing Option # 2

---
### <center>Sentence Tokenization WITH Embeddings & Attempted Language Identification</center>
---

In [None]:
# Run embeddings

error_dict = {}
embed_dict = {}


def send_sentences(sentences,embed_dict,lang_code):
    """Embed each sentence through the LASER API and file the results by uid.

    NOTE: the article id is read from the module-level name ``uid``, which
    the calling loop sets before each call; it is not a parameter.

    Args:
        sentences: Iterable of sentences belonging to one article.
        embed_dict: Mapping of uid -> {sentence: embedding}; updated in place.
        lang_code: Language code from the dataset, used as one vote and as
            the fallback when a detector fails.

    Returns:
        The same ``embed_dict`` object that was passed in.

    Side effects:
        Failed API calls are recorded in the module-level ``error_dict``
        under the current uid (a later failure overwrites an earlier one).
    """
    # Sentence -> embedding for the article currently being processed
    sent_embed_dict = {}

    for sentence in sentences:
        # Only strings can be language-detected and sent to the API.
        if isinstance(sentence, str):
            try:
                lang_detect = detect(sentence)
            except Exception:
                # LangDetect could not identify the language
                lang_detect = lang_code

            try:
                # `classify` is the function imported from langid; the bare
                # module name `langid` is not in scope in this notebook.
                lang_id, _ = classify(sentence)
            except Exception:
                # Langid could not identify the language
                lang_id = lang_code

            # Majority vote; on a three-way tie the first entry (the
            # dataset's own code) wins because ties keep insertion order.
            final_lang = Counter([lang_code,lang_detect,lang_id]).most_common(1)[0][0]

            # Declare the payload for the API call. Send one sentence at a time.
            payload = {
                'content':sentence,
                'lang':final_lang
            }

            # API CALL
            LASER_gen = LASER_api_call(payload)

            # Error response logging.
            if isinstance(LASER_gen, str) and LASER_gen.startswith('Error'):
                error_dict[uid] = LASER_gen
                continue

            # Not an error response: store in the sentence embedding dictionary.
            sent_embed_dict[sentence] = LASER_gen['embedding']

        embed_dict[uid] = sent_embed_dict

    return embed_dict

# Note the wall-clock start so the run time can be reported at the end;
# the process can be quite lengthy.
start_time = time.time()

started_at = time.localtime(start_time)
start_date_str = time.strftime('%Y-%m-%d', started_at)
start_time_str = time.strftime('%H:%M:%S', started_at)

print(f'STARTED ON {start_date_str} @ {start_time_str}')

#### ADDITIONAL DATA PRE-PROCESSING ####

def _strip_control_chars(text):
    """Drop non-printable (Unicode category C*) characters from *text*.

    Control characters that are whitespace (newline, tab, ...) become a
    single space instead of vanishing, so the words on either side of them
    are not glued together.
    """
    kept = []
    for ch in text:
        if unicodedata.category(ch)[0] != 'C':
            kept.append(ch)
        elif ch.isspace():
            kept.append(' ')
    return ''.join(kept)

# removes non-printable unicode characters
stripped_monday_df.loc[:,'norm_body'] = stripped_monday_df['norm_body'].apply(_strip_control_chars)

# remove html artifacts (case-insensitive: this runs before lowercasing, so
# entities such as &Eacute; or &#X1F; must match too)
clean_re = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)
stripped_monday_df.loc[:,'norm_body'] = stripped_monday_df['norm_body'].apply(lambda text: clean_re.sub('', text))

# all text should be lowercase before being tokenized
stripped_monday_df.loc[:,'norm_body'] = stripped_monday_df['norm_body'].apply(lambda text: text.lower())


############################

# Set max iteration for progress bar.
max_iter = len(stripped_monday_df)

with tqdm(total=max_iter, ncols=80) as pbar:

    # Iterate through the dataframe
    for _, row in stripped_monday_df.iterrows():
        # Set variables for each cell of the current row iteration.
        text = row['norm_body']
        lang_code = row['meta_body_language']
        # NOTE: `uid` has to stay a module-level name; send_sentences reads it.
        uid = row['uid']

        # Pick the sentence tokenizer based on the language code.
        if lang_code not in lang_codes:
            # Missing/unknown language: best effort with the English tokenizer.
            try:
                sentences = en_nlp_pipe.sent_tokenize(text)
                embed_dict = send_sentences(sentences,embed_dict,lang_code)
            except Exception as exc:
                # Keep going, but leave a trace instead of dropping the row silently.
                error_dict[uid] = f'Error: fallback tokenization/embedding failed ({exc})'

        elif lang_code == 'en':
            sentences = en_nlp_pipe.sent_tokenize(text)
            embed_dict = send_sentences(sentences,embed_dict,lang_code)

        elif lang_code in stanza_pipelines:
            Stanza_nlp_pipe = stanza_pipelines[lang_code]
            doc = Stanza_nlp_pipe(text)
            sentences = [sentence.text for sentence in doc.sentences]
            embed_dict = send_sentences(sentences,embed_dict,lang_code)

        else:
            # Known language code, but no Stanza pipeline could be built for
            # it: the row is skipped, and that is recorded.
            error_dict[uid] = f'Error: no Stanza pipeline available for language {lang_code!r}'

        pbar.update(1)

# Calculate total amount of time it took to process the dataframe.
elapsed_seconds = time.time() - start_time
total_time_minutes, total_time_seconds = divmod(elapsed_seconds, 60)
total_time_hours, total_time_minutes = divmod(total_time_minutes, 60)

# Read the clock once so the date and the time always belong together.
finished_at = time.localtime()
current_time_str = time.strftime('%H:%M:%S', finished_at)
current_date_str = time.strftime('%Y-%m-%d', finished_at)

# Display for reference. Seconds are zero-padded like hours and minutes.
print(f'FINISHED ON {current_date_str} @ {current_time_str}\n')
print(f'*** TOTAL PROCESSING TIME: {int(total_time_hours):02d}:{int(total_time_minutes):02d}:{total_time_seconds:05.2f} ***')


##### Play audio alert when done processing (if tab is actively selected) #####

def audio_alert(frequency1=880, frequency2=884, play_time_seconds=1, framerate=44100):
    """Build an auto-playing two-tone alert for the notebook.

    Args:
        frequency1: First tone in Hz (raise for a higher pitch; was 220).
        frequency2: Second tone in Hz (raise for a higher pitch; was 224).
        play_time_seconds: Length of the alert in seconds.
        framerate: Samples per second.

    Returns:
        An ``IPython.display.Audio`` object; it plays when the notebook
        displays it, e.g. as the last expression of a cell.
    """
    t = np.linspace(0, play_time_seconds, int(framerate * play_time_seconds))
    audio_data = np.sin(2*np.pi*frequency1*t) + np.sin(2*np.pi*frequency2*t)
    return Audio(audio_data, rate=framerate, autoplay=True)

# Keep this as the last expression of the cell so the widget is displayed.
audio_alert()

# <center>*******************************************************************************************</center>
# <center>END OF DATA PROCESSING</center>
# <center>*******************************************************************************************</center>
---
### <center>Check Unique Values from Processing</center>
---

In [None]:
unique_rows = new_df.drop_duplicates().shape[0]

print(f"Length of dataframe: {len(new_df['sentence'])}")
print(f'Number of unique translations: {unique_rows}')

---
### <center>Export Data</center>
---

In [None]:
# Export the spaCy sentence-level dataframe (`sentence_df`, built by
# Processing Option #1) as a pickle. (Set to appropriate local directory.)
with open('D:\\exports\\LASER\\spacy_sent_tokenization.pickle','wb') as file:
    pickle.dump(sentence_df,file,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import torch
# Ask PyTorch to release the GPU memory it is holding in its cache
torch.cuda.empty_cache() 

In [None]:
# Write the embedding dictionary to disk as a pickle.
# (Point the path at an appropriate local directory.)
embed_pickle_path = 'D:\\exports\\LASER\\cleaner_from_thurs_monday-6700_embed_dict_NLTK-Stanza-GPU_langDetect_spaces_added.pickle'
with open(embed_pickle_path, 'wb') as pickle_file:
    pickle.dump(embed_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

---
### <center>Reconstruct DataFrame for Data Validation</center>
---

In [None]:
# Flatten the nested {uid: {sentence: embedding}} dictionary into one
# record per sentence, then build a user-friendly dataframe from it
data = [
    {'uid': article_uid, 'sentence': sentence_text, 'embed': embedding}
    for article_uid, sentence_map in embed_dict.items()
    for sentence_text, embedding in sentence_map.items()
]

reconstructed_df = pd.DataFrame(data)

In [None]:
# Run embeddings

error_dict = {}
embed_dict = {}


def send_sentences(sentences,embed_dict,lang_code):
    """Embed each sentence through the LASER API and file the results by uid.

    NOTE: the article id is read from the module-level name ``uid``, which
    the calling loop sets before each call; it is not a parameter.

    Args:
        sentences: Iterable of sentences belonging to one article.
        embed_dict: Mapping of uid -> {sentence: embedding}; updated in place.
        lang_code: Language code from the dataset, used as one vote and as
            the fallback when a detector fails.

    Returns:
        The same ``embed_dict`` object that was passed in.

    Side effects:
        Failed API calls are recorded in the module-level ``error_dict``
        under the current uid (a later failure overwrites an earlier one).
    """
    # Sentence -> embedding for the article currently being processed
    sent_embed_dict = {}

    for sentence in sentences:
        # Only strings can be language-detected and sent to the API.
        if isinstance(sentence, str):
            try:
                lang_detect = detect(sentence)
            except Exception:
                # LangDetect could not identify the language
                lang_detect = lang_code

            try:
                # `classify` is the function imported from langid; the bare
                # module name `langid` is not in scope in this notebook.
                lang_id, _ = classify(sentence)
            except Exception:
                # Langid could not identify the language
                lang_id = lang_code

            # Majority vote; on a three-way tie the first entry (the
            # dataset's own code) wins because ties keep insertion order.
            final_lang = Counter([lang_code,lang_detect,lang_id]).most_common(1)[0][0]

            # Declare the payload for the API call. Send one sentence at a time.
            payload = {
                'content':sentence,
                'lang':final_lang
            }

            # API CALL
            LASER_gen = LASER_api_call(payload)

            # Error response logging.
            if isinstance(LASER_gen, str) and LASER_gen.startswith('Error'):
                error_dict[uid] = LASER_gen
                continue

            # Not an error response: store in the sentence embedding dictionary.
            sent_embed_dict[sentence] = LASER_gen['embedding']

        embed_dict[uid] = sent_embed_dict

    return embed_dict

# Note the wall-clock start so the run time can be reported at the end;
# the process can be quite lengthy.
start_time = time.time()

started_at = time.localtime(start_time)
start_date_str = time.strftime('%Y-%m-%d', started_at)
start_time_str = time.strftime('%H:%M:%S', started_at)

print(f'STARTED ON {start_date_str} @ {start_time_str}')

#### DATA PRE-PROCESSING ####
     # Courtesy of Ed #

def _strip_control_chars(text):
    """Drop non-printable (Unicode category C*) characters from *text*.

    Control characters that are whitespace (newline, tab, ...) become a
    single space instead of vanishing, so the words on either side of them
    are not glued together.
    """
    kept = []
    for ch in text:
        if unicodedata.category(ch)[0] != 'C':
            kept.append(ch)
        elif ch.isspace():
            kept.append(' ')
    return ''.join(kept)

# removes non-printable unicode characters
stripped_monday_df.loc[:,'norm_body'] = stripped_monday_df['norm_body'].apply(_strip_control_chars)

# remove html artifacts (case-insensitive, so entities such as &Eacute; or
# &#X1F; match too)
clean_re = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)
stripped_monday_df.loc[:,'norm_body'] = stripped_monday_df['norm_body'].apply(lambda text: clean_re.sub('', text))

# all text should be lowercase before being tokenized
stripped_monday_df.loc[:,'norm_body'] = stripped_monday_df['norm_body'].apply(lambda text: text.lower())


############################

# Set max iteration for progress bar.
max_iter = len(stripped_monday_df)

with tqdm(total=max_iter, ncols=80) as pbar:

    # Iterate through the dataframe
    for _, row in stripped_monday_df.iterrows():
        # Set variables for each cell of the current row iteration.
        text = row['norm_body']
        lang_code = row['meta_body_language']
        # NOTE: `uid` has to stay a module-level name; send_sentences reads it.
        uid = row['uid']

        # Pick the sentence tokenizer based on the language code.
        if lang_code not in lang_codes:
            # Missing/unknown language: best effort with the English tokenizer.
            try:
                sentences = en_nlp_pipe.sent_tokenize(text)
                embed_dict = send_sentences(sentences,embed_dict,lang_code)
            except Exception as exc:
                # Keep going, but leave a trace instead of dropping the row silently.
                error_dict[uid] = f'Error: fallback tokenization/embedding failed ({exc})'

        elif lang_code == 'en':
            sentences = en_nlp_pipe.sent_tokenize(text)
            embed_dict = send_sentences(sentences,embed_dict,lang_code)

        elif lang_code in stanza_pipelines:
            Stanza_nlp_pipe = stanza_pipelines[lang_code]
            doc = Stanza_nlp_pipe(text)
            sentences = [sentence.text for sentence in doc.sentences]
            embed_dict = send_sentences(sentences,embed_dict,lang_code)

        else:
            # Known language code, but no Stanza pipeline could be built for
            # it: the row is skipped, and that is recorded.
            error_dict[uid] = f'Error: no Stanza pipeline available for language {lang_code!r}'

        pbar.update(1)

# Calculate total amount of time it took to process the dataframe.
elapsed_seconds = time.time() - start_time
total_time_minutes, total_time_seconds = divmod(elapsed_seconds, 60)
total_time_hours, total_time_minutes = divmod(total_time_minutes, 60)

# Read the clock once so the date and the time always belong together.
finished_at = time.localtime()
current_time_str = time.strftime('%H:%M:%S', finished_at)
current_date_str = time.strftime('%Y-%m-%d', finished_at)

# Display for reference. Seconds are zero-padded like hours and minutes.
print(f'FINISHED ON {current_date_str} @ {current_time_str}\n')
print(f'*** TOTAL PROCESSING TIME: {int(total_time_hours):02d}:{int(total_time_minutes):02d}:{total_time_seconds:05.2f} ***')


##### Play audio alert when done processing (if tab is actively selected) #####

def audio_alert(frequency1=880, frequency2=884, play_time_seconds=1, framerate=44100):
    """Build an auto-playing two-tone alert for the notebook.

    Args:
        frequency1: First tone in Hz (raise for a higher pitch; was 220).
        frequency2: Second tone in Hz (raise for a higher pitch; was 224).
        play_time_seconds: Length of the alert in seconds.
        framerate: Samples per second.

    Returns:
        An ``IPython.display.Audio`` object; it plays when the notebook
        displays it, e.g. as the last expression of a cell.
    """
    t = np.linspace(0, play_time_seconds, int(framerate * play_time_seconds))
    audio_data = np.sin(2*np.pi*frequency1*t) + np.sin(2*np.pi*frequency2*t)
    return Audio(audio_data, rate=framerate, autoplay=True)

# Keep this as the last expression of the cell so the widget is displayed.
audio_alert()

In [None]:
# Display the reconstructed sentence/embedding dataframe for a visual check
reconstructed_df

In [None]:
# Write the reconstructed dataframe to CSV without the index column.
# (Point the path at an appropriate local directory.)
reconstructed_csv_path = 'D:\\exports\\LASER\\cleaner_from_thurs_sentence_level_langDetect_and_indiv_tokenization_models.csv'
reconstructed_df.to_csv(reconstructed_csv_path, index=False)