In [37]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz # FuzzyWuzzy has a ratio function that calculates the standard Levenshtein distance similarity ratio between two sequences
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import os

In [2]:
msba_skills = pd.read_excel("Skill Keyword List.xlsx", sheet_name = "Sheet2", usecols="A:B")

In [3]:
msba_skills.tail()

Unnamed: 0,MSBA Keyword,Sub Keyword
29,programming,programming
30,microsoft suite,microsoft suite
31,learning,learning
32,travel,travel
33,enterprise software,enterprise software


In [4]:
intersection_set = "learning machine"
remainder_set_1 = "learning machine techniques"
remainder_set_2 = "and data demonstrated experience in science theory"

In [5]:
fuzz.ratio(intersection_set, remainder_set_1)

74

In [6]:
fuzz.ratio(intersection_set, remainder_set_2)

33

In [7]:
fuzz.ratio(remainder_set_1, remainder_set_2)

39

In [8]:
fuzz.token_set_ratio("machine learning techniques",
                     "demonstrated experience in data science and machine learning theory")

74

## Stack datasets

In [9]:
de_data = pd.read_csv("LinkedIn_Data Engineer.csv")
de_data['Title Category'] = "Data Engineering"

In [10]:
ds_data = pd.read_csv("LinkedIn_Data Scientist.csv")
ds_data['Title Category'] = "Data Science"

In [11]:
da_data = pd.read_csv("LinkedIn_Analyst.csv")
da_data['Title Category'] = "Data Analytics"

In [12]:
all_data = pd.concat([de_data, ds_data, da_data], axis=0, ignore_index=True)

In [13]:
all_data = all_data.drop(columns=['index', 'Link', 'Level', 'Type', 'Industry', 'Applicants'])

In [14]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2908 entries, 0 to 2907
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ID              2908 non-null   object
 1   Date            2908 non-null   object
 2   Company         2908 non-null   object
 3   Title           2908 non-null   object
 4   Location        2908 non-null   object
 5   Description     2908 non-null   object
 6   Function        2908 non-null   object
 7   Title Category  2908 non-null   object
dtypes: object(8)
memory usage: 181.9+ KB


## Identify most mentioned skills for **all** titles, and for each title

In [15]:
fuzz.token_set_ratio(all_data['Description'][0], msba_skills['Sub Keyword'][0])   # experience

100

In [16]:
def find_keyword(skill):
    keyword = skill
    # ds_data[keyword] = ds_data['Description'].apply(lambda x: fuzz.token_set_ratio(x, keyword))
    # ds_data[keyword + '_norm'] = np.where(ds_data[keyword] > 70, 1, 0)
    match_scores = all_data['Description'].apply(lambda x: fuzz.token_set_ratio(x, keyword))
    match_scores_norm = [1 if s > 70 else 0 for s in match_scores]
    match_scores = [s for s in match_scores]
    match_scores_col = pd.DataFrame(match_scores, columns=[keyword])
    match_scores_norm_col = pd.DataFrame(match_scores_norm, columns=[keyword+'_norm'])
    match_scores_data = pd.concat([match_scores_col, match_scores_norm_col], axis = 1)
    return match_scores_data

for skill in msba_skills['Sub Keyword']:
    match_scores_data=find_keyword(skill)
    all_data = pd.concat([all_data, match_scores_data], axis=1)
    
all_data.head()

Unnamed: 0,ID,Date,Company,Title,Location,Description,Function,Title Category,experience,experience_norm,...,programming,programming_norm,microsoft suite,microsoft suite_norm,learning,learning_norm,travel,travel_norm,enterprise software,enterprise software_norm
0,urn:li:jobPosting:3513563302,2023-03-08,Bloom Insurance,Data Engineer I,United States,To support internal and external clients via p...,Strategy/Planning and Information Technology,Data Engineering,100,1,...,100,1,75,1,1,0,1,0,59,0
1,urn:li:jobPosting:3522662547,2023-03-14,Intrepid,Data Engineer(Remote),United States,Intrepid sets the standard for delivering exce...,Strategy/Planning and Information Technology,Data Engineering,100,1,...,1,0,1,0,0,0,0,0,1,0
2,urn:li:jobPosting:3548517394,2023-03-29,Pomeroy,Data Engineer,"Cincinnati, OH",General Function: The Data Engineer will play ...,Information Technology,Data Engineering,100,1,...,1,0,2,0,1,0,1,0,69,0
3,urn:li:jobPosting:3531488187,2023-03-17,Nike,Data Engineer,"Beaverton, OR","Become a Part of the NIKE, Inc. Team NIKE, In...","Quality Assurance, Research, and Analyst",Data Engineering,100,1,...,100,1,1,0,1,0,1,0,59,0
4,urn:li:jobPosting:3530243358,2023-03-21,Miso Robotics,Data Engineer,"Pasadena, CA",Our Company We are a cutting-edge robotics...,Information Technology,Data Engineering,100,1,...,100,1,1,0,1,0,1,0,2,0


## Group Sub Keywords into MSBA Keywords

In [17]:
# Group Sub Keyword counts into MSBA Keyword counts

skills_group = all_data.copy().filter(regex='_norm')
skills_group_columns = [col.replace("_norm", "") for col in skills_group.columns]
skills_group.columns = skills_group_columns
skills_group.head()

Unnamed: 0,experience,leadership,execution,management,teamwork,team work,analytics,analytic,analytical,analysis,...,modeling,technical,interpersonal,professionalism,statistics,programming,microsoft suite,learning,travel,enterprise software
0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,1,0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [18]:
skills_group['teamwork_temp'] = np.where( (skills_group.loc[:, ('teamwork')] ==1) | (skills_group.loc[:, ('team work')] ==1), 1, 0)
print(len(skills_group.loc[skills_group['team work'] ==1]))
print(len(skills_group.loc[skills_group['teamwork'] ==1]))
print(len(skills_group.loc[skills_group['teamwork_temp'] ==1]))

1666
142
1734


In [19]:
skills_group['general_analytical_temp'] = np.where( (skills_group['analytics'] ==1) | (skills_group['analytic'] ==1) | (skills_group['analytical'] ==1) | (skills_group['analysis'] ==1), 1, 0)
print(len(skills_group.loc[skills_group['analytics'] ==1]))
print(len(skills_group.loc[skills_group['analytic'] ==1]))
print(len(skills_group.loc[skills_group['analytical'] ==1]))
print(len(skills_group.loc[skills_group['analysis'] ==1]))
print(len(skills_group.loc[skills_group['general_analytical_temp'] ==1]))

1413
498
982
1692
2326


In [20]:
skills_group['degree_certification_temp'] = np.where( (skills_group['degree'] ==1) | (skills_group['certification'] ==1), 1, 0)
print(len(skills_group.loc[skills_group['degree'] ==1]))
print(len(skills_group.loc[skills_group['certification'] ==1]))
print(len(skills_group.loc[skills_group['degree_certification_temp'] ==1]))

1693
255
1747


In [21]:
skills_group['problem_solving_temp'] = np.where( (skills_group['problem-solving'] ==1) | (skills_group['problem solving'] ==1), 1, 0)
print(len(skills_group.loc[skills_group['problem-solving'] ==1]))
print(len(skills_group.loc[skills_group['problem solving'] ==1]))
print(len(skills_group.loc[skills_group['problem_solving_temp'] ==1]))

614
614
614


In [22]:
skills_group['quantitative_knowledge_temp'] = np.where( (skills_group['quantitative knowledge'] ==1) | (skills_group['quantitative'] ==1), 1, 0)
print(len(skills_group.loc[skills_group['quantitative knowledge'] ==1]))
print(len(skills_group.loc[skills_group['quantitative'] ==1]))
print(len(skills_group.loc[skills_group['quantitative_knowledge_temp'] ==1]))

519
519
519


In [23]:
skills_group['customer_focus_temp'] = np.where( (skills_group['customer'] ==1) | (skills_group['customer focus'] ==1), 1, 0)
print(len(skills_group.loc[skills_group['customer'] ==1]))
print(len(skills_group.loc[skills_group['customer focus'] ==1]))
print(len(skills_group.loc[skills_group['customer_focus_temp'] ==1]))

878
878
878


In [24]:
sub_keywords_to_drop = ['teamwork', 'team work',
                        'analytics', 'analytic', 'analytical', 'analysis',
                        'degree', 'certification',
                        'problem solving', 'problem_solving_temp',
                        'quantitative', 'quantitative_knowledge_temp',
                        'customer', 'customer_focus_temp']

skills_group.drop(columns = sub_keywords_to_drop, inplace=True)
print(skills_group.columns)

Index(['experience', 'leadership', 'execution', 'management', 'project',
       'communication', 'work ability', 'problem-solving', 'big data',
       'change', 'quantitative knowledge', 'creative', 'customer focus',
       'modeling', 'technical', 'interpersonal', 'professionalism',
       'statistics', 'programming', 'microsoft suite', 'learning', 'travel',
       'enterprise software', 'teamwork_temp', 'general_analytical_temp',
       'degree_certification_temp'],
      dtype='object')


In [25]:
skills_group.rename(columns = {'teamwork_temp': 'teamwork',
                               'general_analytical_temp': 'general analytical',
                               'degree_certification_temp': 'degree & certification'}, inplace=True)
print(skills_group.columns)

Index(['experience', 'leadership', 'execution', 'management', 'project',
       'communication', 'work ability', 'problem-solving', 'big data',
       'change', 'quantitative knowledge', 'creative', 'customer focus',
       'modeling', 'technical', 'interpersonal', 'professionalism',
       'statistics', 'programming', 'microsoft suite', 'learning', 'travel',
       'enterprise software', 'teamwork', 'general analytical',
       'degree & certification'],
      dtype='object')


## Skills mentioned for all titles

In [26]:
skills_tally = skills_group.sum(axis=0)
skills_tally_df = skills_tally.to_frame(name='frequency')
skills_tally_df['percentage'] = skills_tally_df['frequency'] / all_data.shape[0]

In [27]:
pd.options.display.max_rows = 1000

skills_tally_df.sort_values(by='frequency', ascending=False)

Unnamed: 0,frequency,percentage
experience,2732,0.939477
general analytical,2326,0.799862
work ability,1899,0.653026
degree & certification,1747,0.600757
teamwork,1734,0.596286
management,1540,0.529574
technical,1527,0.525103
communication,1392,0.47868
learning,1036,0.356259
project,899,0.309147


## Skills mentioned for DS, DE, DA titles

In [28]:
all_data_2 = all_data[['ID', 'Location', 'Function', 'Title Category']]
print(len(all_data_2))
print(len(skills_group))

2908
2908


In [29]:
all_data_2 = pd.concat([all_data_2, skills_group], axis=1)
all_data_2.head()

Unnamed: 0,ID,Location,Function,Title Category,experience,leadership,execution,management,project,communication,...,professionalism,statistics,programming,microsoft suite,learning,travel,enterprise software,teamwork,general analytical,degree & certification
0,urn:li:jobPosting:3513563302,United States,Strategy/Planning and Information Technology,Data Engineering,1,0,0,1,0,1,...,0,0,1,1,0,0,0,1,0,0
1,urn:li:jobPosting:3522662547,United States,Strategy/Planning and Information Technology,Data Engineering,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,urn:li:jobPosting:3548517394,"Cincinnati, OH",Information Technology,Data Engineering,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,urn:li:jobPosting:3531488187,"Beaverton, OR","Quality Assurance, Research, and Analyst",Data Engineering,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,1
4,urn:li:jobPosting:3530243358,"Pasadena, CA",Information Technology,Data Engineering,1,0,1,1,0,1,...,0,0,1,0,0,0,0,1,1,1


In [30]:
skills_tally_by_title_df = all_data_2.groupby(['Title Category']).sum()
skills_tally_by_title_df

Unnamed: 0_level_0,experience,leadership,execution,management,project,communication,work ability,problem-solving,big data,change,...,professionalism,statistics,programming,microsoft suite,learning,travel,enterprise software,teamwork,general analytical,degree & certification
Title Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Data Analytics,914,220,104,665,396,561,646,282,87,198,...,10,224,165,294,170,122,65,638,836,665
Data Engineering,945,93,42,472,280,459,671,169,330,103,...,6,143,273,117,272,59,196,603,679,526
Data Science,873,162,74,403,223,372,582,163,211,87,...,6,477,371,113,594,130,77,493,811,556


In [31]:
# Data Analytics

skills_tally_da_df = skills_tally_by_title_df.filter(items = ['Data Analytics'], axis=0).transpose()
skills_tally_da_df.rename(columns = {'Data Analytics': 'frequency'}, inplace=True)
skills_tally_da_df['percentage'] = skills_tally_da_df['frequency'] / da_data.shape[0]
skills_tally_da_df.sort_values(by='frequency', ascending=False)

Title Category,frequency,percentage
experience,914,0.923232
general analytical,836,0.844444
management,665,0.671717
degree & certification,665,0.671717
work ability,646,0.652525
teamwork,638,0.644444
communication,561,0.566667
technical,466,0.470707
project,396,0.4
customer focus,305,0.308081


In [32]:
# Data Science

skills_tally_ds_df = skills_tally_by_title_df.filter(items = ['Data Science'], axis=0).transpose()
skills_tally_ds_df.rename(columns = {'Data Science': 'frequency'}, inplace=True)
skills_tally_ds_df['percentage'] = skills_tally_da_df['frequency'] / ds_data.shape[0]
skills_tally_ds_df.sort_values(by='frequency', ascending=False)

Title Category,frequency,percentage
experience,873,0.984914
general analytical,811,0.900862
learning,594,0.18319
work ability,582,0.696121
degree & certification,556,0.716595
teamwork,493,0.6875
statistics,477,0.241379
technical,444,0.502155
management,403,0.716595
modeling,381,0.171336


In [33]:
# Data Engineering

skills_tally_de_df = skills_tally_by_title_df.filter(items = ['Data Engineering'], axis=0).transpose()
skills_tally_de_df.rename(columns = {'Data Engineering': 'frequency'}, inplace=True)
skills_tally_de_df['percentage'] = skills_tally_da_df['frequency'] / de_data.shape[0]
skills_tally_de_df.sort_values(by='frequency', ascending=False)

Title Category,frequency,percentage
experience,945,0.923232
general analytical,679,0.844444
work ability,671,0.652525
technical,617,0.470707
teamwork,603,0.644444
degree & certification,526,0.671717
management,472,0.671717
communication,459,0.566667
big data,330,0.087879
project,280,0.4


In [42]:
skills_tally_df.drop(columns = 'frequency', inplace=True)

In [45]:
skills_tally_df.reset_index(inplace=True)
skills_tally_df.rename(columns = {'index': 'Title Category', 'percentage': 'percentage_all'}, inplace=True)
skills_tally_df.head()

Unnamed: 0,Title Category,percentage_all
0,experience,0.939477
1,leadership,0.163343
2,execution,0.075653
3,management,0.529574
4,project,0.309147


In [46]:
skills_tally_ds_df.reset_index(inplace=True)
skills_tally_ds_df.drop(columns = 'frequency', inplace=True)
skills_tally_ds_df.rename(columns = {'index': 'Title Category', 'percentage': 'percentage_ds'}, inplace=True)

In [51]:
skills_tally_ds_df.rename_axis(None, axis=1, inplace=True)
skills_tally_ds_df.head()

Unnamed: 0,Title Category,percentage_ds
0,experience,0.984914
1,leadership,0.237069
2,execution,0.112069
3,management,0.716595
4,project,0.426724


In [52]:
skills_tally_de_df.reset_index(inplace=True)
skills_tally_de_df.rename_axis(None, axis=1, inplace=True)
skills_tally_de_df.drop(columns = 'frequency', inplace=True)

In [53]:
skills_tally_de_df.rename(columns = {'index': 'Title Category', 'percentage': 'percentage_de'}, inplace=True)
skills_tally_de_df.head()

Unnamed: 0,Title Category,percentage_de
0,experience,0.923232
1,leadership,0.222222
2,execution,0.105051
3,management,0.671717
4,project,0.4


In [54]:
skills_tally_da_df.reset_index(inplace=True)
skills_tally_da_df.rename_axis(None, axis=1, inplace=True)
skills_tally_da_df.drop(columns = 'frequency', inplace=True)
skills_tally_da_df.rename(columns = {'index': 'Title Category', 'percentage': 'percentage_da'}, inplace=True)
skills_tally_da_df.head()

Unnamed: 0,Title Category,percentage_da
0,experience,0.923232
1,leadership,0.222222
2,execution,0.105051
3,management,0.671717
4,project,0.4


In [55]:
skills_merge = skills_tally_df.merge(skills_tally_da_df,
                                     on = 'Title Category',
                                     how = 'outer').merge(skills_tally_de_df,
                                                          on = 'Title Category',
                                                          how = 'outer').merge(skills_tally_ds_df,
                                                                               on ='Title Category',
                                                                               how = 'outer')
skills_merge

Unnamed: 0,Title Category,percentage_all,percentage_da,percentage_de,percentage_ds
0,experience,0.939477,0.923232,0.923232,0.984914
1,leadership,0.163343,0.222222,0.222222,0.237069
2,execution,0.075653,0.105051,0.105051,0.112069
3,management,0.529574,0.671717,0.671717,0.716595
4,project,0.309147,0.4,0.4,0.426724
5,communication,0.47868,0.566667,0.566667,0.604526
6,work ability,0.653026,0.652525,0.652525,0.696121
7,problem-solving,0.211142,0.284848,0.284848,0.303879
8,big data,0.215956,0.087879,0.087879,0.09375
9,change,0.133425,0.2,0.2,0.213362


In [60]:
skills_merge['Title Category'] = skills_merge['Title Category'].str.title()

In [39]:
CHARTS_FOLDER = '/Users/yinirong/UCLA_MSBA/Extracurricular/Data Science Job Market Project/Charts/'

In [61]:
# Output to Excel

# skills_merge.to_excel(os.path.join(CHARTS_FOLDER, 'MSBA Skills.xlsx'), index=False)