In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
import re
from nltk.corpus import stopwords

from nltk.util import ngrams

# Visualization     
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# do not print warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("job_offers_original_2.csv")

In [3]:
#drop all columns and keeps only job_title and job_description
df = df[['job_title','company_name', 'job_description']]

In [4]:
#(?<![A-Z\W])  what precedes is a word character EXCEPT for capital letters
#(?=[A-Z])     and what follows is a capital letter
def sepa(text): 
    text = re.sub(r'(?<![A-Z\W])(?=[A-Z])', ' ', text)
    return(text)

df['job_description']=df['job_description'].apply(sepa)

In [5]:
df["job_description"] = df["job_description"].str.lower() #lowercase
df['job_description'] = df['job_description'].str.replace('\d+', '') # remove digits

In [6]:
df['job_description'] = df['job_description'].str.replace('/', ',')

In [6]:
df['job_description'][0]

" do you want to work on the most pressing problem of our generation?we're building the infrastructure for the net zero transition, and we're looking for brilliant engineers, designers, and data scientists who want to help define a low carbon future.decarbonizing the economy requires a granular, real-time view of where emissions come from and how they might be reduced. we build software to automate the carbon footprinting of supply chains. banks, traders, and manufacturers use our product to tame the complexity of international supply networks, identify the most carbon-intensive parts, and find greener alternatives. we were part of the y combinator summer  batch and have secured backing from the uk government's innovation arm, innovate uk, the nat west accelerator and the london business school incubator.to join carbon chain, you'll be a keen technologist who loves to learn from others. our company is made up of  passionate people with expertise ranging from oil refining to deep learni

In [7]:
#some common words that lead to the skills are experience, you'll have, responsible, are looking for, ability to,
#knowledge of, understanding of
import re
import pandas as pd
import spacy
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher

In [8]:
from spacy import displacy
from IPython.display import HTML, display

In [9]:
#Extract the ads into a list
desc=list(df.job_description)

In [10]:
#Initialise Spacy model
import en_core_web_sm
nlp = en_core_web_sm.load()

In [11]:
def highlight_terms(terms, texts):
    for doc in nlp.pipe(texts):
        for sentence in set([tok.sent for tok in doc if tok.lower_ in terms]):
            text = sentence.text.strip() # break docs into sentence
            markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>', text)
            display(HTML(markup))
            print('-----')

In [12]:
highlight_terms(['ability'], desc[:10])

-----


-----


-----


-----


-----


-----


-----


In [13]:
matcher = Matcher(nlp.vocab)
pattern = [{'LOWER': 'ability'},{'LOWER':'to'},{"POS": {"IN": ['ADJ',"AUX", "VERB", "NOUN",'ADP','ADV']}, "OP": "*"},
          ]
matcher.add('ability_adp', [pattern])

In [14]:
def show_extraction(examples, *extractors):
    seen = set()
    for doc in nlp.pipe(examples):
        doc.ents = filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)])
        for tok in doc:
            if tok.lower_ == 'ability':
                sentence = tok.sent
                if sentence.text in seen:
                    continue
                seen.update([sentence.text])
                if not sentence.ents:
                    doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i+1, 'MISSING')]
                displacy.render(sentence, style='ent', options = {'colors': {'MISSING': 'pink',
                                                                            'ABILITY': 'lightgreen'}})
                

In [15]:
show_extraction(desc[:10], matcher)

In [16]:
def get_extractions(examples, *extractors):
    # Could use context instead of enumerate
    for idx, doc in enumerate(nlp.pipe(examples, batch_size=100, disable=['ner'])):
        for ent in filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)]):
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end

In [17]:
list(get_extractions(desc[:10], matcher))

[('ability to make real changes with tangible business value',
  0,
  415,
  424,
  'ability_adp',
  376,
  432),
 ('ability to build close relationships with',
  0,
  547,
  553,
  'ability_adp',
  470,
  903),
 ('ability to be adaptable', 3, 145, 149, 'ability_adp', 112, 173),
 ('ability to maintain high', 4, 569, 573, 'ability_adp', 390, 662),
 ('ability to interact with databases', 4, 590, 595, 'ability_adp', 390, 662),
 ('ability to clearly translate numbers into meaningful',
  4,
  612,
  619,
  'ability_adp',
  390,
  662),
 ('ability to manage multiple tasks in', 5, 543, 549, 'ability_adp', 244, 750),
 ('ability to demonstrate', 5, 570, 573, 'ability_adp', 244, 750),
 ('ability to partner effectively with people of varying degrees of technical capability desirable experience building underlying data pipelines',
  5,
  604,
  622,
  'ability_adp',
  244,
  750),
 ('ability to quickly learn', 6, 431, 435, 'ability_adp', 235, 442),
 ('ability to write complex queries data warehous

In [18]:
#Put it in a dataframe and join with the job metadata
def extract_df(*extractors, n_max=None, **kwargs):
    if n_max is None:
        n_max = len(df)
    ent_df = pd.DataFrame(list(get_extractions(df[:n_max].job_description, *extractors)),
                          columns=['text', 'docidx', 'start', 'end', 'label', 'sent_start', 'sent_end'])
    return ent_df.merge(df, how='left', left_on='docidx', right_index=True)

In [19]:
%time 
ent_df = extract_df(matcher, n_max=1000)
ent_df.head()

CPU times: user 4 µs, sys: 4 µs, total: 8 µs
Wall time: 11 µs


Unnamed: 0,text,docidx,start,end,label,sent_start,sent_end,job_title,company_name,job_description
0,ability to make real changes with tangible bus...,0,415,424,ability_adp,376,432,Data Engineer,CarbonChain,do you want to work on the most pressing prob...
1,ability to build close relationships with,0,547,553,ability_adp,470,903,Data Engineer,CarbonChain,do you want to work on the most pressing prob...
2,ability to be adaptable,3,145,149,ability_adp,112,173,Data Engineer,The Data Shed,"description at the data shed, we've been work..."
3,ability to maintain high,4,569,573,ability_adp,390,662,Data Engineer,Oxbotica,are you a data engineer interested in or curr...
4,ability to interact with databases,4,590,595,ability_adp,390,662,Data Engineer,Oxbotica,are you a data engineer interested in or curr...


In [20]:
#Aggregate the counts of different texts.

#It's more significant if it happens accross multiple Advertisers/Sources.

def aggregate_df(df, col=['text']):
    return (df
            .groupby(col)
            .agg(n_company=('company_name', 'nunique'),
                 n=('job_title', 'count'))
            .reset_index()
            .sort_values(['n_company','n'], ascending=False)
        )

In [20]:
pd.set_option("display.max_colwidth", -1)

In [21]:
aggregate_df(ent_df).head(10)

Unnamed: 0,text,n_company,n
209,ability to understand,5,26
230,ability to work in,5,18
109,ability to identify,4,17
139,ability to manage,4,14
126,ability to lead,4,6
79,ability to drive,3,26
35,ability to collect,3,16
170,ability to process,3,13
218,ability to work,3,9
240,ability to work under pressure,3,8


In [22]:
def showent(docidx, start, end, label, sent_start, sent_end, **kwargs):
    # We don't need to parse it, so just make_doc
    doc = nlp.make_doc(desc[docidx])
    doc.ents = [Span(doc, start, end, label)]
    sent = doc[sent_start:sent_end]
    displacy.render(sent, style='ent')
    
def showent_df(df):
    for idx, row in df.iterrows():
        showent(**row)

In [23]:
ent_df['text'] = ent_df['text'].str.replace('ability to', '')

In [24]:
aggregate_df(ent_df)

Unnamed: 0,text,n_company,n
209,understand,5,26
230,work in,5,18
109,identify,4,17
139,manage,4,14
126,lead,4,6
...,...,...,...
234,work on challenging issues,1,1
235,work productively,1,1
236,work remotely abroad for up to,1,1
238,work through,1,1


In [25]:
df_ent_agg = aggregate_df(ent_df)
len(df_ent_agg)

251

In [26]:
skills=df_ent_agg['text'].tolist()

In [27]:
with open('skills_1.txt', 'w') as f:
    for skill in skills:
        print(skill, file=f)

In [28]:
n_max=10000
for a,b,c in zip(skills[:n_max:3],skills[1:n_max:3],skills[2:n_max:3]):
     print('{:<35}{:<35}{:<}'.format(a,b,c))

 understand                         work in                            identify
 manage                             lead                               drive
 collect                            process                            work
 work under pressure                interpret unstructured data        think beyond raw data
 discuss                            establish                          operate within multidisciplinary diverse teams
 understand complex problems        work autonomously in               engage
 deal with                          engage with                        think in
 thrive in                          work independently                 work with
 influence                          operate with                       think strategically about business
 build well                         deploy machine learning models on cloud platforms such as aws write quality unit tests demonstrate
 deliver accurate                   design                             get i

In [29]:
read_file = pd.read_csv (r'skills_1.txt')
read_file.to_csv (r'skills_1.csv', index=None)