# <center> Multi-Lingual Translations v1.1</center>
---

###### <center>+ Summary & Comparisons </center>
---
<center><b>v1.0 Created by:</b></center><br>
<center>Brandon Gromala, Sr. Data Scientist (bgromala555)</center><br>
<center><b>v1.1 Created / Modified by:</b></center><br>
<center>Kevan White, Sr. Data Scientist (thyripian)</center><br>

<center>Release Date: 22 JUN 2023</center><br>

---
### <center> Imports and Setup</center>
---

In [None]:
import pickle
import re
import time

import nltk
import numpy as np
import pandas as pd
import torch
from IPython.display import Audio
from langdetect import detect
from nltk import ngrams
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import BartTokenizer, BartForConditionalGeneration

In [None]:
# Read the pickled DataFrame from disk
# NOTE: unpickling can execute arbitrary code; only load files you trust
with open('D:\\exports\\LASER\\spacy_sent_tokenization.pickle','rb') as pickle_handle:
    reconstructed_df = pickle.load(pickle_handle)

# Read the CSV source data into its own DataFrame
csv_df = pd.read_csv(r"D:\data\embeddings\source_data\gdelt_20230616_bn_tl_id_20.csv")

# Cast the join key to string on both sides so the merge compares like with like
for frame in (reconstructed_df, csv_df):
    frame['uid'] = frame['uid'].astype(str)

# Inner join: keep only the rows whose 'uid' appears in both DataFrames
reconstructed_df = reconstructed_df.merge(csv_df, how='inner', left_on='uid', right_on='uid')

In [None]:
# Verify DataFrame was built correctly: display the merged DataFrame
# (a bare expression on the last line of a notebook cell is shown as output)
reconstructed_df

In [None]:
def fetch_data_by_languages(df, languages):
    """
    Fetch the rows of a DataFrame whose 'meta_body_language' value is one of
    the specified languages.

    Every matching row is kept (there is no down-sampling). Rows are grouped
    in the order the languages are listed, and the rows of each language are
    shuffled reproducibly (random_state=1). Original index labels are kept.

    Args:
        df: DataFrame containing a 'meta_body_language' column.
        languages: Iterable of language codes to select.

    Returns:
        DataFrame holding the selected rows, or an empty DataFrame (without
        columns) when no row matches any of the languages.
    """
    per_language_frames = []

    for lang in languages:
        lang_df = df[df['meta_body_language'] == lang]
        num_rows = len(lang_df)
        if num_rows > 0:
            # Sampling all rows without replacement is a seeded shuffle
            per_language_frames.append(lang_df.sample(n=num_rows, random_state=1))

    if not per_language_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the growing result per language
    return pd.concat(per_language_frames)

# Distinct values of the language column, with NaN entries mapped to None
lang_codes = [
    code if not pd.isna(code) else None
    for code in reconstructed_df['meta_body_language'].unique().tolist()
]

# Keep only the real language codes (drop the None placeholders)
languages = list(filter(lambda code: code is not None, lang_codes))

# Collect the rows for every detected language
fetched_df = fetch_data_by_languages(reconstructed_df, languages)


In [None]:
# Verify new DataFrame generation: display the per-language selection
# (a bare expression on the last line of a notebook cell is shown as output)
fetched_df

# <center>*******************************************************************************************</center>
# <center>TRANSLATIONS</center>
# <center>*******************************************************************************************</center>
# Processing Option # 1 

---
### <center>Articles NOT Reconstructed</center>
---


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
model = model.to(device)

# Id of the target-language token that generation is forced to start with.
# Looked up once, via convert_tokens_to_ids, because
# `tokenizer.lang_code_to_id` is deprecated in newer transformers releases.
english_bos_id = tokenizer.convert_tokens_to_ids("eng_Latn")

# NOTE(review): the tokenizer's source language is never set here, so its
# default is used for every row - presumably "eng_Latn"; confirm, and consider
# setting `tokenizer.src_lang` from 'meta_body_language' per row.

start = time.time()

with tqdm(total=len(fetched_df),ncols=80) as pbar:
    # Translate each sentence and add it to the 'translated_sentences' column
    for i, row in fetched_df.iterrows():
        sentence = row['sentence']

        # Non-string cells (e.g. NaN) are skipped; their translation stays empty
        if isinstance(sentence, str):
            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
            inputs = inputs.to(device)
            translated_tokens = model.generate(**inputs, forced_bos_token_id=english_bos_id)
            translated_sentence = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

            fetched_df.loc[i, 'translated_sentences'] = translated_sentence
        pbar.update(1)

print(f"Translation finished in {time.time() - start:.1f} seconds")
print(fetched_df)


# <center>*******************************************************************************************</center>
# Processing Option # 2

---
### <center>Articles Reconstructed</center>
---

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B")
model = model.to(device)

# Id of the target-language token that generation is forced to start with.
# Looked up once, via convert_tokens_to_ids, because
# `tokenizer.lang_code_to_id` is deprecated in newer transformers releases.
english_bos_id = tokenizer.convert_tokens_to_ids("eng_Latn")

# NOTE(review): the tokenizer's source language is never set here, so its
# default is used for every row - presumably "eng_Latn"; confirm, and consider
# setting `tokenizer.src_lang` from 'meta_body_language' per row.

start = time.time()

with tqdm(total=len(fetched_df),ncols=80) as pbar:
    # Translate each article sentence by sentence and add the result to the
    # 'translated_articles' column
    for i, row in fetched_df.iterrows():
        article = row['sentence']

        # Non-string cells (e.g. NaN) are skipped; their translation stays empty
        if isinstance(article, str):
            translated_parts = []

            for sentence in nltk.sent_tokenize(article):
                inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
                inputs = inputs.to(device)
                translated_tokens = model.generate(**inputs, forced_bos_token_id=english_bos_id)
                translated_parts.append(tokenizer.decode(translated_tokens[0], skip_special_tokens=True))

            # Each translated sentence is followed by one space, including the last
            translated_article = ''.join(f"{part} " for part in translated_parts)

            fetched_df.loc[i, 'translated_articles'] = translated_article
        pbar.update(1)

print(f"Translation finished in {time.time() - start:.1f} seconds")
print(fetched_df)


In [None]:
##### Play audio alert when done processing (if tab is actively selected) #####

framerate = 44100
play_time_seconds = 1

# Raise these for a higher pitch
frequency1 = 880  # was 220
frequency2 = 884  # was 224

# Evenly spaced sample times, then one sine wave per frequency, summed
t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
tone1 = np.sin(2*np.pi*frequency1*t)
tone2 = np.sin(2*np.pi*frequency2*t)
audio_data = tone1 + tone2
Audio(audio_data, rate=framerate, autoplay=True)

# <center>*******************************************************************************************</center>
# <center>END OF TRANSLATIONS</center>
# <center>*******************************************************************************************</center>
---
### <center>Summary Generation</center>
---

In [None]:
# Load the BART tokenizer and model from the same checkpoint.
# This rebinds the `tokenizer` and `model` names used by the translation cells.
bart_checkpoint = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

def hierarchical_summarization(text):
    """
    Summarize a text paragraph by paragraph and join the partial summaries.

    Uses the module-level BART `tokenizer` and `model`.

    Args:
        text: Text to summarize. Paragraphs are separated by two consecutive
            newlines; adjust the separator to match your text format.

    Returns:
        The paragraph summaries joined with single spaces, or an empty string
        when the text contains no non-blank paragraph.
    """
    paragraph_summaries = []

    for paragraph in text.split("\n\n"):
        # Consecutive or trailing separators produce blank paragraphs; there
        # is nothing to summarize in them, so do not send them to the model
        if not paragraph.strip():
            continue

        # Tokenize the paragraph, truncating it to at most 1024 tokens
        inputs = tokenizer([paragraph], max_length=1024, truncation=True, return_tensors="pt")

        # Generate the summary with beam search, capped at 150 tokens
        summary_ids = model.generate(inputs["input_ids"], num_beams=4, length_penalty=2.0, max_length=150)

        # One paragraph in, one sequence out: decode the first (only) sequence
        paragraph_summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # The document-level summary is the paragraph-level summaries combined
    return " ".join(paragraph_summaries)

# Summarize every translated article and store the result in 'summary'
for i, row in fetched_df.iterrows():
    # Get the translated article
    translated_article = row['translated_articles']

    # Rows whose source text was not a string were never translated, so this
    # cell is not a string (NaN); the summarizer would fail on it, so skip it
    if not isinstance(translated_article, str):
        continue

    # Generate the hierarchical summary and store it in the DataFrame
    fetched_df.loc[i, 'summary'] = hierarchical_summarization(translated_article)

print(fetched_df)


---
### <center>Check Unique Values from Processing</center>
---

In [None]:
# Compare the number of rows with the number of distinct translations
translated_column = fetched_df['translated_sentences']
unique_items = translated_column.unique()

print(f"Length of dataframe: {len(translated_column)}")
print(f'Number of unique translations: {len(unique_items)}')

In [None]:
# Row count minus distinct-translation count
duplicate_count = len(fetched_df['translated_sentences']) - len(unique_items)
print(f"Number of non-unique translations: {duplicate_count}")

---
### <center>If Completely Duplicate Rows in DataFrame, Drop Them</center>
---

In [None]:
# Drop rows that are exact duplicates across every column. The result is bound
# to `df`; `fetched_df` itself is left unchanged.
# NOTE(review): the export cells write `fetched_df`, not `df` - confirm which
# of the two is meant to be exported.
df = fetched_df.drop_duplicates()

In [None]:
# Display the de-duplicated DataFrame
df

---
### <center>Export Data</center>
---

In [None]:
# Serialize fetched_df to disk using the highest pickle protocol available
with open('D:\\exports\\LASER\\additional_langs_cleaned_translations2.pickle','wb') as export_handle:
    pickle.dump(fetched_df, export_handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Write fetched_df to an Excel workbook without the index column.
# NOTE(review): the file name says "no_summary", but every column present in
# fetched_df is written, including 'summary' if it exists - confirm the name.
fetched_df.to_excel('D:\\exports\\LASER\\new_other_langs_translations-no_summary2.xlsx',index=False)

---
### <center>Reconstruct Exported Data for Data Validation</center>
---

In [None]:
# Reload the exported pickle so the round trip can be validated.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('D:\\exports\\LASER\\additional_langs_cleaned_translations2.pickle','rb') as file:
    # Bound to `df1` (previously mistyped as `d1f`), the name displayed next
    df1 = pickle.load(file)

In [None]:
# Display the reloaded DataFrame for validation
df1