### Package installs 

In [None]:
pip install googletrans==3.1.0a0

In [None]:
pip install google

In [None]:
pip install boilerpy3

### Function

In [13]:
def _resolve_language_code(language, languages):
    # Map a language name (e.g. "spanish") or code (e.g. "es") to a key of `languages`;
    # raise ValueError when nothing matches.
    wanted = str(language).strip().lower()
    for code, name in languages.items():
        if name.lower() == wanted:
            return code
    if wanted in languages:
        return wanted
    raise ValueError(
        "Unsupported language {!r}: expected a language name or code listed in "
        "googletrans.LANGUAGES".format(language)
    )


def translate_func(list_of_keywords, website, language, max_results=10):
    """Search a website for articles matching translated keywords and translate them to English.

    Steps:
      1. Translate every keyword into the target language.
      2. Search `website` for up to `max_results` pages matching the translated keywords.
      3. Extract each article, translate its title and text to English, and collect
         the results in a DataFrame.

    This performs live web requests (search, page download, translation).

    Parameters
    ----------
    list_of_keywords : list of str
        Keywords that should appear in the articles. A single string is treated
        as a one-element list.
    website : str
        Target site, e.g. "website.com" (no "www." needed).
    language : str
        Target language to search in, given either as a name ("spanish") or a
        code ("es") from `googletrans.LANGUAGES`. Matching ignores case and
        surrounding whitespace.
    max_results : int, optional
        Passed to the search call as `stop`; defaults to 10.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The DataFrame has the columns url, original_title, translated_title,
        original_text and translated_text, one row per article that was
        extracted and translated. The list holds the URLs that could not be
        scraped or translated.

    Raises
    ------
    ValueError
        If `language` is not a name or code in `googletrans.LANGUAGES`.
    """
    from googlesearch import search
    from boilerpy3 import extractors
    import re
    from googletrans import Translator
    import googletrans
    import pandas as pd

    columns = ["url", "original_title", "translated_title", "original_text", "translated_text"]

    # Fail early with a clear message instead of hitting an unbound variable later
    dest_lang = _resolve_language_code(language, googletrans.LANGUAGES)

    # A bare string would otherwise be iterated (and translated) character by character
    if isinstance(list_of_keywords, str):
        list_of_keywords = [list_of_keywords]

    # Init translator
    translator = Translator()

    # Put keywords into google translate and store translated words in list
    translated_keywords = [
        translator.translate(word, dest=dest_lang).text for word in list_of_keywords
    ]

    # Formatting keywords and website so that the search call can read them:
    # comma-joined keywords followed by the quoted site restriction
    search_phrase = "{} 'site:{}'".format(",".join(translated_keywords), website)

    # One extractor is enough for all URLs
    extractor = extractors.ArticleExtractor()

    records = []   # one row per successfully translated article
    errors = []    # urls that raised during scraping or translation

    for url in search(search_phrase, stop=max_results):
        try:
            doc = extractor.get_doc_from_url(url)

            # Collapse every run of whitespace (newlines, tabs, ...) into one space
            content = re.sub(r'\s+', ' ', doc.content)
            title = doc.title

            translated_text = translator.translate(content, dest='en').text
            # NOTE(review): some pages may have no title; store it untranslated
            # rather than sending an empty value to the translator
            translated_title = translator.translate(title, dest='en').text if title else title
        except Exception:
            # Best effort: record the failing url and carry on with the next one.
            # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
            errors.append(url)
            continue

        records.append([url, title, translated_title, content, translated_text])

    # Build the frame once instead of appending row by row
    url_df = pd.DataFrame(records, columns=columns)

    # Return df with translated articles and list of error urls
    return url_df, errors

### Function trial

In [14]:
# Trial inputs: keywords to look for, the news site to search, and the search language
list_of_keywords = ["china", "bank"]
website = "elmundo.es"
language = "spanish"

# Run the search + translation; keep the article table and the failed urls separately
url_df, errors = translate_func(
    list_of_keywords=list_of_keywords,
    website=website,
    language=language,
)

In [16]:
# URLs returned by the search that translate_func could not process
errors

['https://www.elmundo.es/mundodinero/2011/01/24/economia/1295861954.html',
 'https://www.elmundo.es/elmundo/2011/11/02/navegante/1320223792.html',
 'https://www.elmundo.es/mundodinero/2010/09/16/economia/1284628377.html']

In [17]:
# One row per article that was extracted and translated (original and English title/text)
url_df

Unnamed: 0,url,original_title,translated_title,original_text,translated_text
0,https://www.elmundo.es/economia/2016/02/17/56c...,"Así es ICBC, la institución financiera más gra...","This is ICBC, the largest financial institutio...","En los ochenta, el ICBC comenzó a expandirse p...","In the eighties, the ICBC began to expand abro..."
1,https://www.elmundo.es/economia/2016/02/17/56c...,"ICBC, el banco de la gran inversión china en E...","ICBC, the bank of the great Chinese investment...",Registran la sede de la entidad en Madrid ICBC...,They register the headquarters of the entity i...
2,https://www.elmundo.es/economia/2015/04/08/552...,"La mafia china, a por el Banco Madrid | Econom...","The Chinese Mafia, for Banco Madrid | Economy ...","El escándalo financiero de BPA La mafia china,...","The BPA financial scandal The Chinese Mafia, a..."
3,https://www.elmundo.es/economia/macroeconomia/...,"El Banco central chino reduce al 3,15% el tipo...",The Chinese Central Bank reduces the medium-te...,Abrir panel de nuevas noticias MACROECONOMÍA C...,Open new news panel COVID-19 MACROECONOMY From...
4,https://www.elmundo.es/cataluna/2018/10/27/5bd...,Carles Puigdemont pidió 11.000 millones a Chin...,Carles Puigdemont asked China for 11 billion f...,Carles Puigdemont pidió 11.000 millones a Chin...,"Carles Puigdemont asked China for 11,000 milli..."
5,https://www.elmundo.es/economia/2020/03/16/5e6...,La economía china reinicia su actividad tras u...,The Chinese economy restarts its activity afte...,La economía china reinicia su actividad tras u...,The Chinese economy restarts its activity afte...
6,https://www.elmundo.es/economia/2016/02/25/56c...,Golpe a la telaraña delictiva china | Economía...,Strike to the Chinese criminal cobweb | Econom...,Operación Sombra Golpe a la telaraña delictiva...,Operation Shadow Hit the Chinese criminal web ...


In [18]:
# Persist the translated articles to a CSV file in the working directory
url_df.to_csv(path_or_buf='translate_function_output.csv')