In [1]:
import csv
import datetime as dt

import numpy as np
import tagui as r

In [2]:
# constants

## report page that the automated browser opens
website = r'https://www.linkedin.com/insights/report/talent/MjE4NTM5NnwxNDExMTgyNTd8MTE5NTgwMDk4fDc4MDg2MjF8SA==/overview'

## filter values - every location x skill x function combination is scraped
LocationList = ['Singapore', 'China', 'India']
FunctionList = ['All', 'Finance', 'Information Technology', 'Human Resources']
SkillList = ['All', 'Aviation', 'Food & Beverage', 'Air Freight', 'Industrial Safety']

# # smaller subset for testing
# LocationList = ['China']
# FunctionList = ['All', 'Finance', 'Information Technology']
# SkillList = ['All', 'Aviation', 'Food & Beverage', 'Air Freight']


## containers (three independent empty lists)
main_list, temp_list, error_group = [], [], []

## today's date as YYYY-MM-DD; used in the output file name
today = dt.date.today().isoformat()


#### Helper Functions
-----------------------------


In [3]:
def expand_search_filter():
    """Click the 'Search filters expand' control on the current page."""
    expand_control = 'Search filters expand'
    r.click(expand_control)


In [4]:
def clear_filters():
    """Clear the Location, Skill and Function facets if they are set.

    Calls ``r.wait(2)`` first, then, for each facet in that order, clicks its
    "Clear ... facet list" control only when ``r.exist`` finds it.
    """
    r.wait(2)

    for facet in ('Location', 'Skill', 'Function'):
        clear_control = f'//*[contains(@aria-label, "Clear {facet} facet list")]'
        if r.exist(clear_control):
            r.click(clear_control)



In [5]:
def _add_facet_filter(label, value, wait):
    """Add one value to the facet named ``label`` via its type-ahead input.

    ``label`` is the facet name as it appears in the "Add <label> filter"
    aria-label; its upper-cased form is matched against ``data-query-type``.
    """
    r.click(f'//*[contains(@aria-label, "Add {label} filter")]')
    r.wait(wait)

    r.type(f'//*[contains(@data-query-type, "{label.upper()}")]//*[contains(@for, "artdeco-pill__input")]', value)
    r.wait(wait)

    # Take the first suggestion listed for the typed text.
    r.click('//*[@class="typeahead-results"]/div/li[1]')
    r.wait(wait)


def apply_filter(loc, skill, func, wait=1):
    """Set the Function, Skill and Location facets, then click Apply.

    Args:
        loc: location to filter on; "All" (any case) leaves the facet unset.
        skill: skill to filter on; "All" (any case) leaves the facet unset.
        func: job function to filter on; "All" (any case) leaves the facet unset.
        wait: value passed to ``r.wait`` after every UI action.

    The facets are filled in the order Function, Skill, Location. After Apply,
    the filter panel's header button is clicked if it is still present.
    """
    for label, value in (('Function', func), ('Skill', skill), ('Location', loc)):
        if value.upper() != "ALL":
            _add_facet_filter(label, value, wait)

    # Click on the Apply button
    r.click("Apply")
    r.wait(wait)

    # Close the filter panel if it is still on after clicking Apply button
    close_button = '/html/body/div[3]/div[8]/div[1]/div/div/div/div/div/search-filters-panel-content/div[1]/header/button/li-icon/svg'
    if r.exist(close_button):
        r.click(close_button)


In [6]:
def extract_data():
    """Read the applied-filter label and the headline figures from the page.

    Returns:
        A 7-tuple of strings:
        ``(professionals, changed_jobs, job_posts, engaged_talent,
        view_to_apply_rate, inmail_response_rate, check_label)``.

        * The four counts have their thousands commas removed; all four are
          ``''`` when the first count element is not on the page.
        * Each rate is ``'-'`` when its element is not on the page.
        * ``check_label`` is the header text's kept lines joined with ``'|'``.
    """
    # All elements are addressed by absolute XPath under the same section.
    page = '/html/body/div[3]/div[8]/div[1]/section'
    content = page + '/div[1]/main/div/div'

    # Keep only the lines that do not end in a space. Empty lines are skipped
    # explicitly: indexing line[-1] on '' would raise IndexError.
    label_lines = r.read(page + '/header/div[1]/section').split('\n')
    check_label = '|'.join(
        line.strip() for line in label_lines if line and line[-1] != ' '
    )

    count_paths = [
        content + '/ul/li[1]/span[1]/button/div[1]',
        content + '/ul/li[2]/span/button/div[1]',
        content + '/ul/li[3]/span/button/div[1]',
        content + '/ul/li[4]/span/button/div[1]',
    ]
    # Presence of the first count decides whether any of the four are read.
    if r.exist(count_paths[0]):
        counts = [r.read(path) for path in count_paths]
    else:
        counts = ['', '', '', '']

    rates = []
    for position in (2, 3):
        rate_path = f'{content}/section[5]/div[2]/div/div[2]/dl/dd[{position}]/span/strong'
        rates.append(r.read(rate_path) if r.exist(rate_path) else '-')
    view_to_apply_rate, inmail_response_rate = rates

    # Remove thousands commas
    professionals, changed_jobs, job_posts, engaged_talent = (
        count.replace(',', '') for count in counts
    )

    return professionals, changed_jobs, job_posts, engaged_talent, view_to_apply_rate, inmail_response_rate, check_label.strip()

In [7]:
def error_check(data_from_web, loc, skill, func, counter):
    """Validate one scraped row against the filters that were requested.

    Args:
        data_from_web: sequence of scraped values whose last item is the
            '|'-joined filter label read from the page.
        loc, skill, func: the requested filter values; "All" (any case)
            entries are left out of the expected label.
        counter: run counter, used only in the printed messages.

    Returns:
        1 if the page label differs from the expected label (case-insensitive)
        or any scraped value is an empty string, otherwise 0. A message is
        printed for each failed check.
    """
    # label check
    wanted = []
    for part in (loc, skill, func):
        if part.upper() != "ALL":
            wanted.append(part.lower())
    label_matches = '|'.join(wanted) == data_from_web[-1].lower()

    if not label_matches:
        print(f'{counter} **Mismatch** {loc}-{skill}-{func}.')

    # empty check
    has_blank = any(value == '' for value in data_from_web)

    if has_blank:
        print(f'{counter} **Missing Data** {loc}-{skill}-{func}.')

    return 0 if label_matches and not has_blank else 1

In [8]:
def get_data_by_location(loc_list, skill_list, function_list):
    """Scrape every location x skill x function combination.

    For each combination (location outermost, function innermost) the filter
    panel is expanded, cleared and re-applied, a "Dismiss" control is clicked
    if one is present, and the page is read and validated.

    Returns:
        (rows, failures): ``rows`` holds ``[loc, skill, func, *scraped values]``
        for each combination that passed ``error_check``; ``failures`` holds
        the ``(loc, skill, func)`` tuples that did not.
    """
    rows, failures = [], []
    dismiss_control = '//*[contains(@aria-label, "Dismiss")]'

    combinations = (
        (loc, skill, func)
        for loc in loc_list
        for skill in skill_list
        for func in function_list
    )

    # counter numbers the combinations from 1 and appears in the printed log.
    for counter, (loc, skill, func) in enumerate(combinations, start=1):
        expand_search_filter()
        clear_filters()
        apply_filter(loc, skill, func)

        if r.exist(dismiss_control):
            r.click(dismiss_control)

        data_from_web = extract_data()

        # error checks
        if error_check(data_from_web, loc, skill, func, counter) == 1:
            failures.append((loc, skill, func))
            continue

        rows.append([loc, skill, func, *data_from_web])
        print(counter, rows[-1])

    print("** End **")

    return rows, failures
    

In [9]:
def rectify_process(all_errors, rectified=None):
    """Retry the combinations that failed, re-queuing any that fail again.

    Args:
        all_errors: iterable of ``(loc, skill, func)`` tuples to retry. It is
            copied, so the caller's list is not modified.
        rectified: optional list to append recovered rows to. A new list is
            created when omitted, so results never leak between calls.

    Returns:
        The list of recovered rows (the same object as ``rectified`` when one
        was supplied).

    Each combination is retried through ``get_data_by_location``; one that
    fails again goes to the back of the queue. At most ``2 * n + 3`` attempts
    are made in total, where ``n`` is the initial number of errors. If
    combinations are still pending at that point, they are printed and sent
    via ``r.telegram`` as a persistent error.
    """
    if rectified is None:
        rectified = []

    pending = list(all_errors)
    max_attempts = 2 * len(pending) + 3
    attempts = 0

    while pending:
        if attempts >= max_attempts:
            print_status("** PERSISTENT ERROR ** ")
            print(pending)
            r.telegram(975703526, f'Persistent Error: {pending}')
            break

        loc, skill, func = pending.pop(0)
        data1, _ = get_data_by_location([loc], [skill], [func])
        attempts += 1

        if len(data1) > 0:
            rectified.append(data1[0])
        else:
            # Still failing: queue it again behind the other pending items.
            pending.append((loc, skill, func))

    return rectified

In [10]:
# Print status
def print_status(msg):
    """Print ``msg`` prefixed with the current local time as ``[YYYY-MM-DD HH:MM]: ``."""
    stamp = format(dt.datetime.today(), '%Y-%m-%d %H:%M')
    print(f'[{stamp}]: {msg}')

#### Operation
--------------------------


In [11]:
# Log a timestamped marker for the start of the first scraping pass.
print_status('** Begin First Run **')

[2022-02-02 11:51]: ** Begin First Run **


In [12]:

# Start the TagUI session before any other r.* call is made.
# NOTE(review): turbo_mode speeds up TagUI's actions; confirm the fixed
# r.wait() delays in the helper functions are still long enough for the page.
r.init(turbo_mode = True)

True

In [13]:
# Navigate the automated browser to the report page set in the constants cell.
r.url(website)

True

In [14]:
# First pass over every location x skill x function combination.
# main_list receives the rows that passed validation; all_errors receives the
# (loc, skill, func) tuples that did not.
main_list, all_errors = get_data_by_location(LocationList, SkillList, FunctionList)

1 ['Singapore', 'All', 'All', '2235124', '179959', '119067', '84004', '17%', '33%', 'Singapore']
2 ['Singapore', 'All', 'Finance', '89752', '10074', '5499', '4871', '19%', '48%', 'Singapore|Finance']
3 ['Singapore', 'All', 'Information Technology', '96486', '11022', '10321', '5162', '17%', '31%', 'Singapore|Information Technology']
4 ['Singapore', 'All', 'Human Resources', '51855', '8143', '4716', '3692', '19%', '33%', 'Singapore|Human Resources']
5 ['Singapore', 'Aviation', 'All', '22619', '2938', '231', '4390', '16%', '55%', 'Singapore|Aviation']
6 ['Singapore', 'Aviation', 'Finance', '776', '134', '1', '134', '20%', '100%', 'Singapore|Aviation|Finance']
7 ['Singapore', 'Aviation', 'Information Technology', '835', '151', '--', '185', '12%', '100%', 'Singapore|Aviation|Information Technology']
8 ['Singapore', 'Aviation', 'Human Resources', '551', '118', '2', '141', '18%', '—', 'Singapore|Aviation|Human Resources']
9 ['Singapore', 'Food & Beverage', 'All', '44328', '6819', '4356', '559

In [15]:
# Display the combinations that failed validation in the first pass.
all_errors

[('Singapore', 'Industrial Safety', 'All'),
 ('China', 'Air Freight', 'Human Resources'),
 ('India', 'Aviation', 'Human Resources'),
 ('India', 'Food & Beverage', 'All'),
 ('India', 'Food & Beverage', 'Finance'),
 ('India', 'Food & Beverage', 'Information Technology'),
 ('India', 'Food & Beverage', 'Human Resources'),
 ('India', 'Air Freight', 'All'),
 ('India', 'Air Freight', 'Finance'),
 ('India', 'Air Freight', 'Information Technology'),
 ('India', 'Air Freight', 'Human Resources'),
 ('India', 'Industrial Safety', 'All'),
 ('India', 'Industrial Safety', 'Finance'),
 ('India', 'Industrial Safety', 'Information Technology'),
 ('India', 'Industrial Safety', 'Human Resources')]

#### Rectify
--------------

In [19]:
# Second pass: retry only the combinations that failed the first pass.
if len(all_errors) > 0:
    print_status('** Begin Second Run **')

    # Load the report page again before retrying.
    r.url(website)

    # Hand rectify_process a fresh list so that re-running this cell can never
    # pick up rows accumulated by an earlier call.
    rectified = rectify_process(all_errors, rectified=[])
else:
    rectified = []

[2022-02-02 15:31]: ** Begin Second Run **
1 ['Singapore', 'Industrial Safety', 'All', '2096', '221', '79', '330', '26%', '50%', 'Singapore|Industrial Safety']
** End **
1 ['China', 'Air Freight', 'Human Resources', '32', '1', '--', '--', '—', '—', 'China|Air Freight|Human Resources']
** End **
1 ['India', 'Aviation', 'Human Resources', '3274', '738', '2', '70', '35%', '—', 'India|Aviation|Human Resources']
** End **
1 ['India', 'Food & Beverage', 'All', '190603', '26462', '2071', '2721', '51%', '—', 'India|Food & Beverage']
** End **
1 ['India', 'Food & Beverage', 'Finance', '1776', '325', '24', '27', '60%', '—', 'India|Food & Beverage|Finance']
** End **
1 ['India', 'Food & Beverage', 'Information Technology', '3363', '595', '3', '33', '51%', '—', 'India|Food & Beverage|Information Technology']
** End **
1 ['India', 'Food & Beverage', 'Human Resources', '3620', '1052', '2', '40', '75%', '—', 'India|Food & Beverage|Human Resources']
** End **
1 ['India', 'Air Freight', 'All', '11141',

#### Save As CSV
-------------------------


In [17]:
# Presumably the folder the exported data is meant to end up in.
# NOTE(review): no cell visible in this notebook references Save_Folder; the CSV
# is opened with a bare relative file name. Confirm whether the export should
# be written here, and consider replacing this user-specific absolute path
# with a configurable one.
Save_Folder = r'C:\Users\cheehuat_huang\OneDrive - SATS Ltd\Analytics Share Folder\LinkedIn Talent Insights\Data'

In [20]:
# Append the rows recovered in the second pass to the first-pass rows.
# NOTE: running this cell more than once appends the recovered rows again.
if len(rectified) > 0:
    main_list = main_list + rectified

In [21]:
# Write one CSV line per scraped row into XLTI_<date>.csv in the working directory.
# csv.writer quotes any field containing a comma, quote or newline; joining the
# fields with ',' by hand would let such a field shift the columns.
# lineterminator='\n' keeps the line ending the original f.write('\n') produced.
with open(f'XLTI_{today}.csv', 'w') as f:
    writer = csv.writer(f, lineterminator='\n')
    for row in main_list:
        writer.writerow([str(value) for value in row])

In [None]:
# Shut down the TagUI session started by r.init().
r.close()

In [None]:
# Log a timestamped marker once the export has finished.
print_status("** Completed **")