This script performs preliminary cleaning of the jobs and ISCO data.

Additionally, it detects the language of every job description and translates it from the original language into English.

In [1]:
import re
import time
from pathlib import Path

import nltk
import pandas as pd
from deep_translator import GoogleTranslator
from lingua import Language, LanguageDetectorBuilder
from tqdm.notebook import tqdm

In [2]:
# File locations, grouped per dataset (original input first, cleaned output second)

# Job postings
jobs_input_path = '../data/wi_dataset.csv'
jobs_output_path = '../output/jobs_ts.csv'  # lives in ../output, a folder that may need to be created first

# ISCO codes and descriptions
isco_input_path = '../data/wi_labels.csv'
isco_output_path = '../output/wi_labels_cleaned.csv'

In [3]:
# Read the two original tables into memory
isco = pd.read_csv(isco_input_path, dtype=str)  # every ISCO column is read as text
jobs = pd.read_csv(jobs_input_path)             # job postings: column types inferred by pandas

Data processing for ISCO code and description dataset

In [4]:
# Lower-case first so that the section headings matched below are found regardless of their original casing
isco['description'] = isco['description'].str.lower()

# The "notes" and "some related occupations classified elsewhere" sections may end up saying how another
# classification would be better, and may confuse the embedding model, so both sections are removed.
# "notes" always come after "some related occupations...", so notes are removed first, just to be sure.
#
# (?s) makes "." match newlines as well. Without it ".*" stops at the first line break, so only the heading
# line (or the single line following "notes\n") would be removed and the rest of the section would stay in
# the description.
# NOTE(review): this assumes everything from the heading to the end of the description belongs to the
# section being removed - confirm against the data.
isco['description'] = (isco['description']
                       .str.replace(r'(?s)(notes\n.*)', '', regex=True)
                       .str.replace(r'(?s)(some related occupations classified elsewhere.*)', '', regex=True))

Data processing for job description dataset

In [5]:
# Cleaning job titles and descriptions

# All identified special characters are removed from job description and title before language detection
# and translation
special_characters = r'[$?^+!@#*_★😊�№→↗™⇧∂⇨∙√⏬−⏰└│■▪▬▶▸►▼◆●◢◥◾☀☛♀♦⚓⚽✅✆✉✍✓✔✩✴✶❖❤❯➠➡➢➤➧➭➲➽⠀⦁⫽⬟⭐¤¦§¨©ª«¬®°²³´µ·º»¿‡„\ufeff]'

# Missing values are replaced by '' in BOTH columns, so every cleaned value is a string. A missing title
# would otherwise stay NaN in title_clean and turn the later title + description concatenation into NaN.
jobs['description_clean'] = jobs['description'].fillna('').str.replace(special_characters, '', regex=True)
jobs['title_clean'] = jobs['title'].fillna('').str.replace(special_characters, '', regex=True)


Language detection

In [6]:
# Languages that lingua should choose between when detecting the language of a posting.
# These are the official EU languages listed below (note: Maltese is not part of this list).
languages = [
    Language.BULGARIAN,
    Language.CROATIAN,
    Language.CZECH,
    Language.DANISH,
    Language.DUTCH,
    Language.ENGLISH,
    Language.ESTONIAN,
    Language.FINNISH,
    Language.FRENCH,
    Language.GERMAN,
    Language.GREEK,
    Language.HUNGARIAN,
    Language.IRISH,
    Language.ITALIAN,
    Language.LATVIAN,
    Language.LITHUANIAN,
    Language.POLISH,
    Language.PORTUGUESE,
    Language.ROMANIAN,
    Language.SLOVAK,
    Language.SLOVENE,
    Language.SPANISH,
    Language.SWEDISH,
]

# Detector restricted to exactly the languages above
detector = LanguageDetectorBuilder.from_languages(*languages).build()


In [6]:
# Detecting language of job title and description.
# Each detection is stored as text: the "Language." prefix is stripped so that e.g. "Language.ENGLISH" becomes
# "ENGLISH", which fits the translation step better. An undetected language (None) becomes the string "None".
for text_col, lang_col in (('description_clean', 'lang_desc'), ('title_clean', 'lang_title')):
    jobs[lang_col] = [detector.detect_language_of(text) for text in jobs[text_col]]
    # regex=False: 'Language.' is a literal prefix. Spelling it out keeps the behaviour identical across pandas
    # versions (older versions treat the pattern as a regex by default, where "." matches any character).
    jobs[lang_col] = jobs[lang_col].astype(str).str.replace('Language.', '', regex=False)

In [7]:
# Decide on one final language per posting. The description takes priority because:
#   - it contains more text than the title,
#   - it is the job description, not the job title, that is being classified,
#   - titles are sometimes in English while the description is in the host country's language.
# When the description language could not be detected (stored as the string "None"), the title language is used.
# NOTE(review): only lang_desc is reset to a real missing value here; an undetected title keeps the string "None".
desc_undetected = jobs['lang_desc'] == "None"
jobs.loc[desc_undetected, 'lang_desc'] = None

desc_lang = jobs['lang_desc']
title_lang = jobs['lang_title']
jobs['lang'] = desc_lang.combine_first(title_lang)  # description language, else title language

Translation

In [8]:
# Preparation for the translation step.

# Language names have to match what the translator package expects: lower case, and "slovenian" instead of
# lingua's "SLOVENE".
jobs['lang'] = jobs['lang'].str.replace('SLOVENE', 'SLOVENIAN').str.lower()

# Title and description are translated together, split into sentences beforehand: sending less text per
# request to the Google API lowers the chance of a translation error.
jobs['title_and_desc'] = jobs['title_clean'] + ". " + jobs['description_clean']
jobs['td_sent_tok'] = jobs['title_and_desc'].map(nltk.tokenize.sent_tokenize)  # should work for most languages


In [None]:
# Translation from the original JD language to English. This phase took around 3-6 hrs.
tqdm.pandas(desc='translation in progress')  # enables progress_apply, used to track the translation

# Google Translate uses different language codes from lingua, so the language names are mapped to Google's
# codes through the translator's own table of supported languages.
lang_map = GoogleTranslator().get_supported_languages(as_dict=True)

# The translation works on a separate table that additionally carries the Google language code
jobs_ts = jobs.assign(lang_sf=jobs['lang'].map(lang_map))

def gs_translate_desc(row):
    """Translate the tokenised title and description of one job posting into English.

    Args:
        row: One row of the jobs table (anything indexable by column name). Reads
            ``lang_sf`` (Google Translate code of the detected language) and
            ``td_sent_tok`` (list of sentences to translate).

    Returns:
        list: ``td_sent_tok`` unchanged when ``lang_sf`` is "en"; otherwise the result of
        translating the sentences as one batch; or ``['Failed to translate']`` when the
        translation attempt raised an exception.
    """
    sentences = row['td_sent_tok']
    source_lang = row['lang_sf']

    # No need to translate if the detected language is english
    if source_lang == "en":
        return sentences

    # lang_sf is not a string (e.g. NaN) when no Google language code was found for the detected language.
    # Such a value can never be a valid source, so the row would fail on every retry; let Google Translate
    # detect the source language itself instead.
    if not isinstance(source_lang, str):
        source_lang = 'auto'

    # Send the tokenised sentences to Google Translate as a batch. Output should always be english.
    try:
        return GoogleTranslator(source=source_lang, target='en').translate_batch(sentences)
    # Possibly the translation failed due to a poor internet connection (either failing to send to, or failing
    # to receive from, the Google Translate API). The marker returned here lets the caller retry the row later.
    except Exception as e:
        print("Failed to translate. Error: ", e)
        return ['Failed to translate']
    
# Translate row by row; progress_apply is tqdm's variant of apply and shows the progress of the translation
translated_rows = jobs_ts.progress_apply(gs_translate_desc, axis=1)
jobs_ts['title_desc_ts'] = translated_rows

In [None]:
# Retry any translations that didn't go through. Failures occur due to an unstable internet connection over the
# 3-6 hrs that the previous code snippets run for. The loop keeps retrying until all translations have
# succeeded (interrupt the kernel to stop it earlier).
def failed_translation_mask(translations):
    """Return a boolean Series marking the entries that equal the ['Failed to translate'] marker."""
    # The entries are lists, which are unhashable, so each one is compared explicitly instead of relying on
    # Series.isin() with a list of lists.
    return translations.apply(
        lambda ts: isinstance(ts, list) and ts == ['Failed to translate']
    ).astype(bool)


failed_to_ts_bool = failed_translation_mask(jobs_ts['title_desc_ts'])
while failed_to_ts_bool.any():
    # Apply the translation function to only those rows which failed previously
    jobs_ts.loc[failed_to_ts_bool, "title_desc_ts"] = jobs_ts.loc[failed_to_ts_bool, :].progress_apply(gs_translate_desc, axis=1)
    failed_to_ts_bool = failed_translation_mask(jobs_ts['title_desc_ts'])  # Redetermine the failures
    print("Number of failed translations: ", failed_to_ts_bool.sum())
    if failed_to_ts_bool.any():
        time.sleep(5)  # Wait 5 seconds before trying again; no need to wait once everything succeeded

print("No failed translations, please proceed")

In [11]:
# Concatenate all the translated sentences into one JD, except the Nones
# (sentences which could not be translated, likely a sequence of symbols e.g. ----------------)
def remove_none_concat(title_desc_ts):
    """Join translated sentences into one space-separated string, skipping None entries.

    Args:
        title_desc_ts: Iterable of translated sentences; entries may be None.

    Returns:
        str: The non-None sentences joined by single spaces ('' when there are none).
    """
    return ' '.join(sent for sent in title_desc_ts if sent is not None)

# One concatenated, translated text per job posting
jobs_ts['title_desc_ts_clean'] = jobs_ts['title_desc_ts'].map(remove_none_concat)

In [12]:
# Further cleaning: the text is in English now, so every non-ASCII character is dropped and the result is
# lower-cased.
# NOTE(review): .replace(" ", " ") replaces a space with a space as written, i.e. it changes nothing -
# possibly a double space or a non-breaking space was intended; confirm.
non_ascii = re.compile(r'[^\x00-\x7f]')
jobs_ts['title_desc_ts_postclean'] = [
    non_ascii.sub('', text).replace(" ", " ").lower()
    for text in jobs_ts['title_desc_ts_clean']
]

In [16]:
# Output
# Create the destination folders if they do not exist yet, so that the results of the hours-long translation
# are not lost to a missing-folder error when writing.
for out_path in (jobs_output_path, isco_output_path):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Jobs: the intermediate working columns are dropped before writing
jobs_ts.drop(columns=['title_and_desc', 'td_sent_tok',
                      'lang_sf', 'title_desc_ts']).to_csv(jobs_output_path, index=False)

# ISCO: cleaned codes and descriptions
isco.to_csv(isco_output_path, index=False)