In [2]:
import pandas as pd
import re
import nltk
import spacy
from num2words import num2words
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag
from spacy.pipeline import EntityRecognizer

### Reading the .json File and Converting it to a Dataframe

In [3]:
df = pd.read_json('Resume.json', lines = True)
df.to_csv('dataframe.csv', index = None)
df.head()

Unnamed: 0,content,annotation
0,Govardhana K\nSenior Software Engineer\n\nBeng...,"[{'label': ['Companies worked at'], 'points': ..."
1,"Harini Komaravelli\nTest Analyst at Oracle, Hy...","[{'label': ['Companies worked at'], 'points': ..."
2,Hartej Kathuria\nData Analyst Intern - Oracle ...,"[{'label': ['Skills'], 'points': [{'start': 22..."
3,Ijas Nizamuddin\nAssociate Consultant - State ...,"[{'label': ['Skills'], 'points': [{'start': 46..."
4,"Imgeeyaul Ansari\njava developer\n\nPune, Maha...","[{'label': ['Skills'], 'points': [{'start': 18..."


### Understanding the Contents of Each Column

In [4]:
df['annotation'][0]

[{'label': ['Companies worked at'],
  'points': [{'start': 1749, 'end': 1754, 'text': 'Oracle'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1696, 'end': 1701, 'text': 'Oracle'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1417, 'end': 1422, 'text': 'Oracle'}]},
 {'label': ['Skills'],
  'points': [{'start': 1356,
    'end': 1792,
    'text': 'Languages: Core Java, Go Lang, Data Structures & Algorithms, Oracle\nPL-SQL programming, Sales Force with APEX.\nTools: RADTool, Jdeveloper, NetBeans, Eclipse, SQL developer,\nPL/SQL Developer, WinSCP, Putty\nWeb Technologies: JavaScript, XML, HTML, Webservice\n\nOperating Systems: Linux, Windows\nVersion control system SVN & Git-Hub\nDatabases: Oracle\nMiddleware: Web logic, OC4J\nProduct FLEXCUBE: Oracle FLEXCUBE Versions 10.x, 11.x and 12.x'}]},
 {'label': ['Companies worked at'],
  'points': [{'start': 1209, 'end': 1214, 'text': 'Oracle'}]},
 {'label': ['Skills'],
  'points': [{'start': 1136,
    'end': 1247,


In [5]:
df['content'][0]

'Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\nhttps://www.

### Creating the Functions for Text Cleaning and Word Tagging

In [6]:
def clean_text(resume):
    
    resume = resume.lower()
        
    resume = re.sub("\n", ' ', resume)
    resume = re.sub(r'[,•()➢❑]', ' ', resume)
    resume = re.sub(r'\s\s+|\s-\s|\.\s', ' ', resume)
    
    tokenized_words = resume.split(" ")
    
    length = len(tokenized_words)
    
    for i in range(length):
        if tokenized_words[i].isdigit():
            tokenized_words[i] = num2words(tokenized_words[i])
        
    sw = set(stopwords.words('english'))
    
    tokens_without_sw = []
    
    for w in tokenized_words:
        if w not in sw:
            tokens_without_sw.append(w)
    
    final_resume = " ".join(tokens_without_sw)
    
    return final_resume



def tag_words(text):
    
    nlp = spacy.load("en_core_web_sm")
    
    words = nlp(text)
    
    tagged = []
    
    for word in words:
        print(word, word.pos_)
        
    return tagged

### Cleaning the 'content' Column

In [15]:
content_resumes = df['content']

for i in range(0,200):
    content_resumes[i] = clean_text(content_resumes[i])
    
    
print(content_resumes[1])

harini komaravelli test analyst oracle hyderabad hyderabad telangana email indeed: indeed.com/r/harini- komaravelli/2659eee82e435d1b six yrs experience manual automation testing work experience qa analyst oracle test analyst oracle hyderabad infosys ltd hyderabad telangana november two thousand eleven february two thousand sixteen hyderabad nov two thousand eleven feb17 two thousand sixteen worked tata consultancy services hyderabad feb twenty-four apr eleven two thousand seventeen currently working test analyst oracle hyderabad qa analyst six years experience oracle education mca osmania university b.sc computer science osmania university skills functional testing blue prism qtp additional information area expertise: familiar agile methodologies knowledge energy petroleum & health care domains involved preparation test scenarios preparing test data test cases https://www.indeed.com/r/harini-komaravelli/2659eee82e435d1b?isid=rex-download&ikw=download-top&co=in https://www.indeed.com/r/

In [29]:
### We use the Tf-Idf vectorizer to gain insights about the probabilities of all the possible words and collocations
### we can find in these reusmes.

vect = TfidfVectorizer(ngram_range = (1, 3))
tf_idf = vect.fit_transform(content_resumes)
terms = vect.get_feature_names()
#print(terms)
print(tf_idf)

  (0, 344)	0.03951386126173482
  (0, 272)	0.03951386126173482
  (0, 137)	0.03951386126173482
  (0, 118961)	0.03951386126173482
  (0, 42744)	0.03951386126173482
  (0, 73562)	0.03951386126173482
  (0, 42739)	0.03951386126173482
  (0, 81442)	0.03951386126173482
  (0, 71644)	0.03951386126173482
  (0, 61490)	0.03951386126173482
  (0, 120306)	0.03951386126173482
  (0, 66842)	0.03951386126173482
  (0, 73643)	0.03951386126173482
  (0, 27654)	0.03951386126173482
  (0, 49133)	0.03951386126173482
  (0, 45465)	0.03951386126173482
  (0, 104714)	0.03951386126173482
  (0, 105311)	0.03951386126173482
  (0, 23571)	0.03951386126173482
  (0, 118930)	0.036658060621811935
  (0, 121191)	0.036658060621811935
  (0, 60985)	0.036658060621811935
  (0, 105497)	0.03306017731676869
  (0, 72954)	0.028920236542092175
  (0, 120482)	0.03951386126173482
  :	:
  (199, 107575)	0.014129232160370724
  (199, 52326)	0.008535110269151214
  (199, 3294)	0.008942960985799369
  (199, 123416)	0.012043619073145882
  (199, 72322)	0.0

In [41]:
### For example the term at column 39401 has a probability of 2.7% of appearing in our set of resumes.

print (terms[39401])

experience


In [30]:
### Showing the part of speech that these terms belong to

nltk.pos_tag(terms)

[('000', 'CD'),
 ('000 servers', 'NNS'),
 ('000 servers trouble', 'CD'),
 ('000members', 'NNS'),
 ('000members evaluatedpatientcareneeds', 'CD'),
 ('000members evaluatedpatientcareneeds prioritizedtreatment', 'CD'),
 ('0023411a049a1441', 'CD'),
 ('0023411a049a1441 challenging', 'VBG'),
 ('0023411a049a1441 challenging career', 'CD'),
 ('0023411a049a1441 isid', 'CD'),
 ('0023411a049a1441 isid rex', 'CD'),
 ('005e1ab800b4cb42', 'CD'),
 ('005e1ab800b4cb42 isid', 'CD'),
 ('005e1ab800b4cb42 isid rex', 'CD'),
 ('005e1ab800b4cb42 work', 'CD'),
 ('005e1ab800b4cb42 work experience', 'CD'),
 ('00f125c7b9b95a35', 'CD'),
 ('00f125c7b9b95a35 isid', 'CD'),
 ('00f125c7b9b95a35 isid rex', 'CD'),
 ('00f125c7b9b95a35 two', 'CD'),
 ('00f125c7b9b95a35 two year', 'CD'),
 ('01', 'CD'),
 ('01 crm', 'CD'),
 ('01 crm 02', 'CD'),
 ('01 dopra', 'CD'),
 ('01 dopra description', 'CD'),
 ('01 ethernet', 'CD'),
 ('01 ethernet routing', 'VBG'),
 ('01 tnc', 'CD'),
 ('01 tnc controller', 'CD'),
 ('01st', 'CD'),
 ('01st 

In [31]:
### Displaying the features table, where columns are the possible mono-, bi- and tri-grams in all of the resumes.
### We try to identify the most common words and collocations to use in our NER model, later.

pd.DataFrame.sparse.from_spmatrix(tf_idf, index = content_resumes, columns=terms)[0:2]

Unnamed: 0_level_0,000,000 servers,000 servers trouble,000members,000members evaluatedpatientcareneeds,000members evaluatedpatientcareneeds prioritizedtreatment,0023411a049a1441,0023411a049a1441 challenging,0023411a049a1441 challenging career,0023411a049a1441 isid,...,zoom,zoom knowledge,zoom knowledge webrtc,zoom supporting,zoom supporting bada2,zoom text,zoom text size,zxf05u01,zxf05u01 validating,zxf05u01 validating vendor
content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
govardhana k senior software engineer bengaluru karnataka karnataka email indeed: indeed.com/r/govardhana-k/ b2de315d95905b68 total experience five years six months cloud lending solutions inc four month salesforce developer oracle five years two month core java developer languages core java go lang oracle pl-sql programming sales force developer apex designations & promotions willing relocate: anywhere work experience senior software engineer cloud lending solutions bangalore karnataka january two thousand eighteen present present senior consultant oracle bangalore karnataka november two thousand sixteen december two thousand seventeen staff consultant oracle bangalore karnataka january two thousand fourteen october two thousand sixteen associate consultant oracle bangalore karnataka november two thousand twelve december two thousand thirteen education b.e computer science engineering adithya institute technology tamil nadu september two thousand eight june two thousand twelve https://www.indeed.com/r/govardhana-k/b2de315d95905b68?isid=rex-download&ikw=download-top&co=in https://www.indeed.com/r/govardhana-k/b2de315d95905b68?isid=rex-download&ikw=download-top&co=in skills apex less one year data structures three years flexcube five years oracle five years algorithms three years links https://www.linkedin.com/in/govardhana-k-61024944/ additional information technical proficiency: languages: core java go lang data structures & algorithms oracle pl-sql programming sales force apex tools: radtool jdeveloper netbeans eclipse sql developer pl/sql developer winscp putty web technologies: javascript xml html webservice operating systems: linux windows version control system svn & git-hub databases: oracle middleware: web logic oc4j product flexcube: oracle flexcube versions 10.x 11.x 12.x https://www.linkedin.com/in/govardhana-k-61024944/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
harini komaravelli test analyst oracle hyderabad hyderabad telangana email indeed: indeed.com/r/harini- komaravelli/2659eee82e435d1b six yrs experience manual automation testing work experience qa analyst oracle test analyst oracle hyderabad infosys ltd hyderabad telangana november two thousand eleven february two thousand sixteen hyderabad nov two thousand eleven feb17 two thousand sixteen worked tata consultancy services hyderabad feb twenty-four apr eleven two thousand seventeen currently working test analyst oracle hyderabad qa analyst six years experience oracle education mca osmania university b.sc computer science osmania university skills functional testing blue prism qtp additional information area expertise: familiar agile methodologies knowledge energy petroleum & health care domains involved preparation test scenarios preparing test data test cases https://www.indeed.com/r/harini-komaravelli/2659eee82e435d1b?isid=rex-download&ikw=download-top&co=in https://www.indeed.com/r/harini-komaravelli/2659eee82e435d1b?isid=rex-download&ikw=download-top&co=in experienced development execution test cases effectively experienced functional testing gui testing smoke testing regression testing integration testing experienced accessibility testing application ability understand user requirements functional design specifications good knowledge sdlc stlc processes deciding severity priority bugs experience using microsoft test manager & oracle test manager test management tools good experience testing windows based & web based applications involved client interactions reviews issues clarifications web services testing writing test scripts qtp testcomplete creating object repositories function libraries qtp enhanced qtp scripts using vb script strong experience working blue prism tool worked different environments like windows application & web application technical skills: test automation tools: blue prism qtp 10.0 testcomplete test management tool: microsoft test manager oracle test manager & jira databases: oracle 10g sql server operating systems: windows seven project 1: title: cadence client: baker hughes technologies: microsoft visual studio microsoft team foundation server client background: oilfield services company delivering focused efforts shale gas oilfield services provides services tools software drilling formation evaluation well completion production management seismic data collection interpretation project description: aut application test next generation revolutionary robust easy use scalable well site data acquisition processing interpretation system client's drilling services deliver services meets cross divisional business requirements consistently project 2: description: paragon supports entire care team one tool clinicians need help deliver best patient care designed physicians nurses pharmacists mid level providers first-hand understanding clinical workflow needs paragon clinical applications allow caregivers focus matters most; spending time caring patients since paragon fully-integrated across applications built around single patient database information entered anywhere system immediately available entire care team immediate access helps clinicians make better treatment decisions also helps promote patient safety paragon offers broad suite multidisciplinary clinical software solutions together anytime anywhere access complete patient record responsibilities: performed smoke testing regression testing involved generating executing test script using quick test pro & blue prism usability user interface testing involved defect tracking reporting bugs using tfs participated frequent walk-through meetings internal quality assurance groups development groups participated client calls clarifying doubts at&t sessions involved functional regression smoke testing validate application data changes done windows application certifying build status running scripts part smoke testing project 3: description: food & beverages r&a: easily manage business across multiple locations reducing cost complexity cloud-based point-of-sale pos solutions enable centralized enterprise management lower upfront costs smaller footprint responsibilities: performed functional testing regression testing involved generating executing test scripts using blue prism tool open script involved preparing bots using blue prism tool accessibility testing web application involved defect tracking reporting bugs using jira webservices testing calling api's export data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
### This specific resume is causing a great deal of problems because of the issue of no spaces (delimiters) between the words.

content_resumes[154]

"jacob philip kottayam kerala email indeed: indeed.com/r/jacob-philip/db00d831146c9228 strategicsales experienceinsales skills.currently success work experience sales marketing specialist assistantbusinessdevelopmentmanager dubai ae february two thousand seventeen october two thousand seventeen uae -builtstrong clientrelationshipsandprovidedhighvalue-addingservices resultingina15% company marketshareincrease developstools practicesacrosstheorganization negotiatingcontractsandpackages negotiatingthetermsofanagreementwithaviewto closingsale expense andnew businessdata workedcloselywithpartners throughconductingqualityassurancetests.actasthepointofcontactandcommunicate projectstatustoallparticipantsinourteam ordinator marketingco january two thousand fifteen june two thousand sixteen bhimajewelers kerala india systemreportforms plannedandexecutedeventsandmarketingprograms producingfivetimestargetnumberof qualifiedleads implements marketing advertising campaigns assembling analyzing sales 

In [43]:
### Final Clean Resume Sample

print(content_resumes[1])

harini komaravelli test analyst oracle hyderabad hyderabad telangana email indeed: indeed.com/r/harini- komaravelli/2659eee82e435d1b six yrs experience manual automation testing work experience qa analyst oracle test analyst oracle hyderabad infosys ltd hyderabad telangana november two thousand eleven february two thousand sixteen hyderabad nov two thousand eleven feb17 two thousand sixteen worked tata consultancy services hyderabad feb twenty-four apr eleven two thousand seventeen currently working test analyst oracle hyderabad qa analyst six years experience oracle education mca osmania university b.sc computer science osmania university skills functional testing blue prism qtp additional information area expertise: familiar agile methodologies knowledge energy petroleum & health care domains involved preparation test scenarios preparing test data test cases https://www.indeed.com/r/harini-komaravelli/2659eee82e435d1b?isid=rex-download&ikw=download-top&co=in https://www.indeed.com/r/

In [13]:
### NLTK with word tokenization results in seperating urls

tokenized_words = word_tokenize(content_resumes[1])

nltk.pos_tag(tokenized_words)

[('harini', 'NN'),
 ('komaravelli', 'JJ'),
 ('test', 'NN'),
 ('analyst', 'NN'),
 ('oracle', 'NN'),
 ('hyderabad', 'NN'),
 ('hyderabad', 'NN'),
 ('telangana', 'NN'),
 ('email', 'VBP'),
 ('indeed', 'RB'),
 (':', ':'),
 ('indeed.com/r/harini-', 'JJ'),
 ('komaravelli/2659eee82e435d1b', 'NN'),
 ('six', 'CD'),
 ('yrs', 'NN'),
 ('experience', 'NN'),
 ('manual', 'JJ'),
 ('automation', 'NN'),
 ('testing', 'VBG'),
 ('work', 'NN'),
 ('experience', 'NN'),
 ('qa', 'JJ'),
 ('analyst', 'NN'),
 ('oracle', 'NN'),
 ('test', 'NN'),
 ('analyst', 'NN'),
 ('oracle', 'NN'),
 ('hyderabad', 'NN'),
 ('infosys', 'NN'),
 ('ltd', 'NN'),
 ('hyderabad', 'NN'),
 ('telangana', 'JJ'),
 ('november', 'RB'),
 ('two', 'CD'),
 ('thousand', 'NN'),
 ('and', 'CC'),
 ('eleven', 'RB'),
 ('february', 'JJ'),
 ('two', 'CD'),
 ('thousand', 'NN'),
 ('and', 'CC'),
 ('sixteen', 'JJ'),
 ('hyderabad', 'NN'),
 ('nov', 'RB'),
 ('two', 'CD'),
 ('thousand', 'NN'),
 ('and', 'CC'),
 ('eleven', 'RB'),
 ('feb17', 'JJ'),
 ('two', 'CD'),
 ('thousa

In [14]:
### Using spacy tagging is prone to errors

tag_words(content_resumes[1])

harini ADJ
komaravelli NOUN
test NOUN
analyst NOUN
oracle NOUN
hyderabad ADJ
hyderabad ADJ
telangana NOUN
email NOUN
indeed ADV
: PUNCT
indeed.com/r/harini- NOUN
komaravelli/2659eee82e435d1b NOUN
six NUM
yrs NOUN
experience NOUN
manual ADJ
automation NOUN
testing NOUN
  SPACE
work NOUN
experience NOUN
qa ADP
analyst NOUN
oracle NOUN
test NOUN
analyst NOUN
oracle NOUN
hyderabad NOUN
infosys NOUN
ltd NOUN
  SPACE
hyderabad ADJ
telangana NOUN
  SPACE
november NOUN
two NUM
thousand NUM
and CCONJ
eleven NUM
february NOUN
two NUM
thousand NUM
and CCONJ
sixteen NUM
hyderabad NOUN
nov NOUN
two NUM
thousand NUM
and CCONJ
eleven NUM
feb17 VERB
two NUM
thousand NUM
and CCONJ
sixteen NUM
worked VERB
tata NOUN
consultancy NOUN
services NOUN
hyderabad NOUN
feb NOUN
twenty NUM
- PUNCT
four NUM
apr NOUN
eleven NUM
two NUM
thousand NUM
and CCONJ
seventeen NUM
currently ADV
working VERB
test NOUN
analyst NOUN
oracle NOUN
hyderabad NOUN
qa PUNCT
analyst NOUN
six NUM
years NOUN
experience NOUN
oracle NOUN

[]