In [150]:
# NLP 
# loading the language model instance
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x7fee024dd3d0>

In [151]:
# constructing a doc object 
# a doc object is a sequence of Token objects representing lexical token (wtf is a lexical token)
# each token object has information about a particular piece of text

intro_doc = nlp("This tutorial is about Natural Language Processing in spaCy.")

type(intro_doc)

[token.text for token in intro_doc]

# we called the .text attribute to get the text containsed within that token

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.']

In [152]:
# Sentance Detection 
# the process of locating where sentances start and end in a given text
# this allows you to divide a text into lingustically meaninful text 

# in spaCy the .sents property is used to extract setances from the Doc objects. 

about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_doc = nlp(about_text)
sentances = list(about_doc.sents)
len(sentances)

for sentance in sentances:
    print(f"{sentance[:5]}...") # 5 indicates 4 words + ...

# with .sents you get a list of Span objects representing individual sentances. 

Gus Proto is a Python...
He is interested in learning...


In [153]:
# Tokens in spaCy 
# building the doc container involves tokenizing the text
# tokenisation breaks a text down into basic units (tokens)

for token in about_doc:
    print(token, token.idx)

# idx attribute represents the starting position of the token in the original text

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [154]:
print(
    f"{'Text with Whitespace':22}"
    f"{'Is Alphanumeric?':15}"
    f"{'Is Punctuation?':18}"
    f"{'Is Stop Word?'}"
)

for token in about_doc:
    print(
        f"{str(token.text_with_ws):22}"
        f"{str(token.is_alpha):15}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
    )

# .text_with_ws prints the token along with any trailing space
# .is_alpha is alphabetic characters or not 
# .is_punct is punctuation symbol or not 
# .is_stop is stopword or not

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                  

In [155]:
# Stop Words
# in NLP stop words are generally removed because they arent significant 

spacy_stop_words = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stop_words)

for stop_word in list(spacy_stop_words)[:10]:
    print(stop_word)

those
move
front
our
your
seem
sixty
somehow
would
become


In [156]:
nlp = spacy.load("en_core_web_sm")
about_doc = nlp(about_text)

# print a token in about_doc if it is not a stop word
print([token for token in about_doc if not token.is_stop])

[Gus, Proto, Python, developer, currently, working, London, -, based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


In [157]:
# Lemmatisation 
# the process of reducing inflected forms of a word white ensuring that the reduce form belongs to the language
# the reduce form (root word) is called a lemma 

lemma_text = (
    "Gus is helping organise a developer"
    " conference on Applications of Natural Language"
    " Processing. He keeps organising local Python meetups"
    " and several internal talks at his workplace."
)

lemma_doc = nlp(lemma_text)

for token in lemma_doc:
    if str(token) != str(token.lemma_):
        print(token, token.lemma_)


# this can be useful for the JobMatcher because if i have used a word in a different tense than the description .lemma_ will allow me to view the similarity.


is be
helping help
He he
keeps keep
organising organise
meetups meetup
talks talk


In [158]:
# Word Frequency 

from collections import Counter

complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)

complete_doc = nlp(complete_text)

words = [
    token.text
    for token in complete_doc
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common(5))


[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


In [159]:
# (PoS) Part-of-Speech Tagging 
# grammatical role that explains how a particular words is used in a sentance
# Noun
# Pronoun 
# Adjective
# Verb
# Adverb
# Preposition
# Conjunction
# Interjection

for token in about_doc:
    print(
        f"""
TOKEN: {str(token)}
TAG: {str(token.tag_)}
POS: {token.pos_}
EXPLAINATION: {spacy.explain(token.tag_)}"""
    )


TOKEN: Gus
TAG: NNP
POS: PROPN
EXPLAINATION: noun, proper singular

TOKEN: Proto
TAG: NNP
POS: PROPN
EXPLAINATION: noun, proper singular

TOKEN: is
TAG: VBZ
POS: AUX
EXPLAINATION: verb, 3rd person singular present

TOKEN: a
TAG: DT
POS: DET
EXPLAINATION: determiner

TOKEN: Python
TAG: NNP
POS: PROPN
EXPLAINATION: noun, proper singular

TOKEN: developer
TAG: NN
POS: NOUN
EXPLAINATION: noun, singular or mass

TOKEN: currently
TAG: RB
POS: ADV
EXPLAINATION: adverb

TOKEN: working
TAG: VBG
POS: VERB
EXPLAINATION: verb, gerund or present participle

TOKEN: for
TAG: IN
POS: ADP
EXPLAINATION: conjunction, subordinating or preposition

TOKEN: a
TAG: DT
POS: DET
EXPLAINATION: determiner

TOKEN: London
TAG: NNP
POS: PROPN
EXPLAINATION: noun, proper singular

TOKEN: -
TAG: HYPH
POS: PUNCT
EXPLAINATION: punctuation mark, hyphen

TOKEN: based
TAG: VBN
POS: VERB
EXPLAINATION: verb, past participle

TOKEN: Fintech
TAG: NNP
POS: PROPN
EXPLAINATION: noun, proper singular

TOKEN: company
TAG: NN
POS: N

In [160]:
# by using POS tags, you can extract a particular category of words

nouns = []
adjectives = []

for token in about_doc:
    if token.pos_ == "NOUN":
        nouns.append(token)
    if token.pos_ == "ADJ":
        adjectives.append(token)

print(f"nouns: {nouns}")
print(f"adjectives: {adjectives}")

nouns: [developer, company]
adjectives: [interested]


In [161]:
# Visualisation using displaCy
# you can use this to visualise dependency parse or named entities

from spacy import displacy

nlp = spacy.load("en_core_web_sm")

about_interested_text = (
    "Tom has a degree in statistical mathematics."
)

about_interested_doc = nlp(about_interested_text)

displacy.render(about_interested_doc, style="dep")

In [162]:
# Preprocessing Functions 
# bringing text into a format ideal for analysis
# 1. lowercase text
# 2. lemmatise each token
# 3. remove punctuation
# 4. remove stop words

nlp = spacy.load("en_core_web_sm")
complete_doc = nlp(complete_text)

def is_token_allowed(token):
    return bool(
        token 
        and str(token).strip()
        and not token.is_stop
        and not token.is_punct
    )

def preprocess_token(token):
    return token.lemma_.strip().lower()

complete_filtered_tokens = [
    preprocess_token(token)
    for token in complete_doc
    if is_token_allowed(token)
]

complete_filtered_tokens


['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'application',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

In [163]:
# Rule based matching 
# one of the steps in extracting information from unstructed text
# to identidy and extract tokens and phrases according to patterns and grammatical features
# you can use regex but rule based matching in spaCy is more powerful

# Extracting first and last names

import spacy 
nlp = spacy.load("en_core_web_sm")
from spacy.matcher import Matcher 


# names are always proper nouns so the pattern will be two consecutive proper nouns
def extract_full_name(nlp_doc):
    matcher = Matcher(nlp.vocab)
    pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
    matcher.add("FULL NAME", [pattern])
    matches = matcher(nlp_doc)
    for _, start, end in matches:
        span = nlp_doc[start:end]
        return span.text

# test
text = "Hello my name is Thomas Sharples"
doc = nlp(text)
print(extract_full_name(doc)) # and it bloody works mate 


Thomas Sharples


In [164]:
# Phone number extraction 
phone_text = "Hello my name is Thomas Sharples, please contact me on +447591216386"
phone_doc = nlp(phone_text)

def extract_phone_number(nlp_doc):
    matcher = Matcher(nlp.vocab)
    pattern = [{"TEXT": {"REGEX": r"(\+447\d{9}|07\d{9})"}}]
    matcher.add("PHONE_NUMBER", [pattern])
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text

print(extract_phone_number(phone_doc))


+447591216386


In [165]:
# email extraction
def extract_email(nlp_doc):
    for token in nlp_doc:
        if token.like_email is True:
            return token.text

email_text = "Hello my email is tsharples101@gmail.com"
email_doc = nlp(email_text)
print(extract_email(email_doc))


tsharples101@gmail.com


In [166]:
# extracting url
def extract_url(nlp_doc):
    for token in nlp_doc:
        if token.like_url is True:
            return token.text

url_text = "Hello my github is github.com/tomsharples1"
url_doc = nlp(url_text)
print(extract_url(url_doc))


github.com/tomsharples1


In [199]:
def extract_all(nlp_doc):
    info = []
    info.append(extract_full_name(nlp_doc))
    info.append(extract_phone_number(nlp_doc))      
    info.append(extract_email(nlp_doc))
    info.append(extract_url(nlp_doc))
    return info

long_text = "hello my name is Aleeya Lone, my email is aleeya.lone@gmail.com and my number is 07904536514"
long_doc = nlp(long_text)

print(extract_all(long_doc))



['Aleeya Lone', '07904536514', 'aleeya.lone@gmail.com', None]


In [239]:
# dependancy parsing 
# extracting a dependency graph of the sentance to represent its grammatical structure
# it defines dependancy relationships between headwords and their dependents

# head of a sentance has no dependency and is called the root of the sentance
# the verb is usually the root of the sentance

# Dependencies can be mapped in a graph where,
# words are the nodes 
# grammatical relations are the edges
# ayyy graph theory 

import spacy 
nlp = spacy.load("en_core_web_sm")
piano_text = "Tom is learning the piano"
piano_doc = nlp(piano_text)

for token in piano_doc:
    print(
        f"""
TOKEN: {token.text}
{token.tag_ =}
{token.head.text =}
{token.dep_ =}
explaination = {spacy.explain(token.tag_)}"""

    )

# nsubj is the subkect of the word
# aux id sn auxiliary wod and its headword is a verb
# dobj is the direct object of the verb, headword is also a verb


TOKEN: Tom
token.tag_ ='NNP'
token.head.text ='learning'
token.dep_ ='nsubj'
explaination = noun, proper singular

TOKEN: is
token.tag_ ='VBZ'
token.head.text ='learning'
token.dep_ ='aux'
explaination = verb, 3rd person singular present

TOKEN: learning
token.tag_ ='VBG'
token.head.text ='learning'
token.dep_ ='ROOT'
explaination = verb, gerund or present participle

TOKEN: the
token.tag_ ='DT'
token.head.text ='piano'
token.dep_ ='det'
explaination = determiner

TOKEN: piano
token.tag_ ='NN'
token.head.text ='learning'
token.dep_ ='dobj'
explaination = noun, singular or mass


In [240]:
displacy.render(piano_doc, style =)

SyntaxError: invalid syntax (1270079343.py, line 1)

In [None]:
# Tree and Subtree Navigation
# the dependency graph has all the properties of a tree 
# this tree contains information about sentance structure and grammar.

# spaCy provides attribues like .children, .lefts, .rights and .subtree to make navigating the parse tree easier.

import spacy 
nlp = spacy.load("en_core_web_sm")

one_line_about_text = (
    "Tom Sharples is a Python developer"
    " currently working for a london-based fintech company"
)

one_line_about_doc = nlp(one_line_about_text)

# extrat children of 'developer'
print([token.text for token in one_line_about_doc[5].children])

['a', 'Python', 'working']


In [None]:
# extract previous neighboring node of 'developer'
print(one_line_about_doc[5].nbor(-1))

Python


In [None]:
# extract next neigboring node of 'developer'
print(one_line_about_doc[5].nbor())

# remebering that the words are the nodes

currently


In [None]:
# extracting all the tokens on the right of 'developer'
print([token.text for token in one_line_about_doc[5].lefts])
print([token.text for token in one_line_about_doc[5].rights])

['a', 'Python']
['working']


In [None]:
# Print subtree of 'developer'

print(list(one_line_about_doc[5].subtree))

[a, Python, developer, currently, working, for, a, london, -, based, fintech, company]


In [None]:
# Shallow Parsing / chunking
# the process of extracting phrases from unstructured text
# chunking groups of adjacent tokens in phrases on the basis of their PoS tags.

# Noun Phrase detection 

import spacy 
nlp = spacy.load("en_core_web_sm")

conference_text = (
    "There is a developer conference happening on 21 July 2019 in London"
)

conference_doc = nlp(conference_text)

# extract noun phrases

for chunk in conference_doc.noun_chunks:
    print(chunk)

# this is a nice way to summarise key information from the text


a developer conference
21 July
London


In [None]:
# Named Entity Recognition 
# is the process of locating names entities in unstructured text
# and then classifying them into predefined categories

piano_class_text = (
    "Great Paino Academy is situated"
    " in Mayfair or the City of London and has"
    " world-class piano instructors"
)

piano_class_doc = nlp(piano_class_text)

for ent in piano_class_doc.ents:
    print(
        f"""
        {ent.text = }
        {ent.start_char = }
        {ent.end_char = }
        {ent.label_ = }
        label explaination = {spacy.explain(ent.label_)}"""
    )



        ent.text = 'Great Paino Academy'
        ent.start_char = 0
        ent.end_char = 19
        ent.label_ = 'ORG'
        label explaination = Companies, agencies, institutions, etc.

        ent.text = 'Mayfair'
        ent.start_char = 35
        ent.end_char = 42
        ent.label_ = 'GPE'
        label explaination = Countries, cities, states

        ent.text = 'the City of London'
        ent.start_char = 46
        ent.end_char = 64
        ent.label_ = 'GPE'
        label explaination = Countries, cities, states


In [None]:
displacy.render(piano_class_doc, style = "ent")

In [None]:
# name redaction using NER 

survey_text = (
    "Out of 5 people surveyed, James Robert,"
    " Julie Fuller and Benjamin Brooks like"
    " apples. Kelly Cox and Matthew Evans"
    " like oranges."
)

survey_doc = nlp(survey_text)

def replace_name(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "REDACTED "
    return token.text_with_ws

def redact_names(nlp_doc):
    with nlp_doc.retokenize() as retokeniser:
        for ent in nlp_doc.ents:
            retokeniser.merge(ent)
    tokens = map(replace_name, nlp_doc)
    return "".join(tokens)

print(redact_names(survey_doc))

def is_name(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return True


Out of 5 people surveyed, REDACTED , REDACTED and REDACTED like apples. REDACTED and REDACTED like oranges.


In [125]:
# resume NLP parsing 

resume = "Thomas Sharples    Tsharples101@gmail.com       https://github.com/tomsharples1      07591216386  Education   BSc (Hons) in Mathematics with Statistics - University of Nottingham, UK                                                         Grade: First Class (1st)  Final Year Group Project: 74%     Key Modules:   Applied Statistical Modelling: General Linear Models, Logistic Regression and Modelling with Survival Data, both theoretically and in R.  Multivariate Analysis: Handling high dimensional data and performing PCA, Linear Discriminant Analysis and Clustering, both theoretically and in R.    Statistical Inference: Maximum Likelihood Estimation, Bayesian Inference and Markov Chain Monte Carlo.     Monk’s Walk Sixth Form - Hertfordshire, UK                                                                         A-Levels  Further Mathematics (A*), Mathematics (A*), Chemistry (C)      Technical Skills   • Python: Proficient in data manipulation, supervised & unsupervised machine learning, using APIs and numerical methods. Experienced with Jupyter Notebook and libraries such as NumPy, Scikit-learn, Pandas, and SerpApi.  • R: Proficient in regression models, hypothesis testing and machine learning techniques such as PCA and clustering. Skilled in data manipulation, visualisation and analysis using the tidyverse suite.   • Data Visualisation: Expertise in visualising data using libraries such as Matplotlib in Python and ggplot2 and factoextra in R.  • Languages: Highly skilled in French speaking, reading and writing developed in Interfaculty French Level 3 and Level 4. Conversational proficiency in German handling day to day interactions well and a developing knowledge of Arabic.    Projects  Data Science Project – Grade: 100%  Analysed a United Nations data set to explore the economic and health indicators of various countries using machine learning and multivariate analysis techniques in R.  • PCA: Plotting and interpreting principal components to understand the similarities between countries by GDP and life expectancy.   • Clustering: Assessing whether k-means, model based or hierarchical clustering methods yielded the best results in classifying countries based on GDP and life expectancy.   • Linear Discriminant Analysis: Predicting continent based on GDP, life expectancy and population data and assessing the predictive accuracy using a confusion matrix.   • Regression: Building a linear model using OLS, principal component and ridge regression to predict life expectancy in 2007 and determine which method gave the most accurate model.  Job Matcher Machine Learning Algorithm  Developed a program in Python that matches job descriptions with CVs using machine learning, outputting a table of jobs with the highest percentage matches. Using Jupyter Notebook to provide real-time updates to the table as new data was streamed from the API.  • APIs: Developed my knowledge of APIs by using SerpApi to web scrape data from Google Jobs to input into my machine learning algorithm.   • Machine Learning: Improved machine learning skills by learning text similarity algorithms and developed my understanding of Scikit-learn library.  • Future plans include developing this project into an NLP based application to improve the accuracy of the percentage match.  Final Year Applied Mathematical Modelling Group Projects - Grade 74%  Project 1: Modelling Javelin Trajectories, Project 2: Modelling Bumper Car Dynamics, Project 3: Modelling Spontaneous Ignition.   • Python Skills: Applied object-oriented programming to solve differential equations using numerical methods such as the Runge-Kutta method and the Finite Difference Method.  • Machine Learning: Employing machine learning polynomial regression to predict the flight path in project one, using Sci-kit learn, NumPy and Matplotlib libraries.  • Mathematical Report Writing: Following a model, analysis, results (MAR) structure to show our findings clearly and effectively.  • Presentation: Presenting our project results to a large audience, ensuring complex ideas were understandable to various mathematical levels.    Experience   U-Cycle Business Development Officer and Operations Advisor - Enactus Nottingham, 2024  • 1000% increase in reach on Instagram by managing and developing relationships between societies and companies, contributing to a £1400 increase in revenue since the previous year.  • Refined leadership skills while organising a competition in collaboration with an engineering society, engaging 50 participants and hosting multiple events including a final judging panel event.  • Performed analysis on CO2 emissions while running a social media campaign with sustainability society to promote Ucycle’s environmental benefits.  • Advised operational decisions including supplier selection, pricing negotiation and cost analysis.    Great British Cycling Team Cyclist – UK, 2019 - 2021  • Transferred Athletic Discipline built by working towards challenging goals and understanding the importance of the long game to my academic studies.  • Balanced training, travel and education developing my time management skills.   • Analysed Real Data from training by identifying patterns, allowing me to make informed decisions around training progress.   • Performed at the Highest Level and Succeeded under Pressure, highlighted by a European Bronze medal in the 1km time trial in 2021"

cv_text = """
John Doe
Email: johndoe@example.com
Phone: 123-456-7890

Experience:
- Software Engineer at TechCorp (2018 - Present)
  - Developed a customer relationship management system using Python and Django.
  - Led a team of 5 developers.
  - Improved system performance by 30%.

Skills:
- Programming: Python, Java, C++
- Web Development: HTML, CSS, JavaScript, Django, Flask
- Databases: MySQL, PostgreSQL
- Tools: Git, Docker, Jenkins
"""

In [126]:
import spacy 

def preprocessing(text):

    doc = nlp(text)

    tokens = [token.lemma_.strip().lower() for token in doc if not token.is_stop and not token.is_punct]

    preprocessed_text = " ".join(tokens)

    return preprocessed_text

In [127]:
resume = preprocessing(resume)

In [128]:
import spacy 
nlp = spacy.load("en_core_web_sm")
cv_text = preprocessing(cv_text)
cv_doc = nlp(cv_text)

In [131]:
from spacy.matcher import Matcher 
from collections import OrderedDict

matcher = Matcher(nlp.vocab)

def extract_skills(nlp_doc):
    pattern = [{"LOWER": {"IN": ['python', "r", "data visualisation", 
                                 "sql", "scala", "julia", "pandas", "scikit learn", "numpy", "spacy", "nlp",
                                 "llm", 'tidyverse', 'dplyr', 'ggplot2', 'ggplot', 'clustering', 'pca', 'regression',
                                 'logistic', 'statistical', 'statistics', 'bayesian', 'hypothesis testing', 'tensorflow',
                                 'pyspark', 'databricks', 'pytorch', 'k','nearest','neighbour', 'knn', 'random','forest', 'means',
                                 'dimensional','reduction', 'matplotlib', 'seaborn', 'powerbi', 'tableau', 'etl', 'elt', 'nosql',
                                 'warehousing', 'warehouse', 'lake', 'mining', 'cleaning', 'wrangling',
                                 'neural','network', 'chatbot', 'gpt', 'bert', 'ntlk','aws', 'gcp','azure', 'git', 'github', 
                                 'b testing', 'query', 'queries', 'api', 'apis', 'web', 'scraping', 'webscraping', 'mysql', 
                                 'machine', 'learn', 'learning' 'cloud', 'platform']}}]
    matcher.add("SKILLS", [pattern])
    matches = matcher(nlp_doc)

    skills = []
    for _, start, end in matches:
        span = nlp_doc[start:end]
        skills.append(span.text)
    
    # removing duplicates
    skills = list(OrderedDict.fromkeys(skills))

    return skills


In [133]:
resume_doc = nlp(resume)

print(extract_skills(resume_doc))

['statistics', 'statistical', 'logistic', 'regression', 'dimensional', 'pca', 'clustering', 'bayesian', 'python', 'machine', 'api', 'numpy', 'learn', 'pandas', 'r', 'tidyverse', 'matplotlib', 'ggplot2', 'k', 'web', 'nlp']
