In [1]:
#import required python libraries
import cohere
import pandas as pd
import requests
import datetime
from tqdm import tqdm

In [2]:
# Get the data: both files are parsed as JSON into DataFrames.
TRAIN_PATH = '../data/relations_training.txt'
TEST_PATH = '../data/relations_test.txt'

train_df = pd.read_json(TRAIN_PATH)
test_df = pd.read_json(TEST_PATH)

In [3]:

def pre_process_tokens(tokens):
    """Flatten a document's entity tokens into a single label string.

    Every token is read through its 'entityLabel' and 'text' keys. Texts that
    share an entity label are comma-joined in the order they are encountered,
    and each label contributes one newline-terminated "LABEL:text1,text2"
    line, in order of the label's first appearance.

    Returns an empty string when `tokens` is empty.
    """
    grouped = {}
    for token in tokens:
        grouped.setdefault(token['entityLabel'], []).append(token['text'])

    return "".join(
        f"{name}:{','.join(texts)}\n" for name, texts in grouped.items()
    )

In [4]:
# Derive the flattened 'label' column from the annotated tokens of each split.
for frame in (train_df, test_df):
    frame['label'] = frame['tokens'].apply(pre_process_tokens)

In [5]:
# Display the test frame, including the 'label' column derived from its tokens.
test_df

Unnamed: 0,document,tokens,relations,label
0,"\nCurrently holding a faculty, industry, or go...","[{'text': 'Ph.D.', 'start': 75, 'end': 80, 'to...","[{'child': 18, 'head': 14, 'relationLabel': 'D...","DIPLOMA:Ph.D.\nDIPLOMA_MAJOR:machine learning,..."
1,\n2+ years experience in the online advertisin...,"[{'text': '2+ years', 'start': 1, 'end': 9, 't...","[{'child': 7, 'head': 1, 'relationLabel': 'EXP...",EXPERIENCE:2+ years\nSKILLS:online advertising...
2,\nBA/BS\n5+ years of program or project manage...,"[{'text': '5+ years', 'start': 7, 'end': 15, '...","[{'child': 11, 'head': 5, 'relationLabel': 'EX...","EXPERIENCE:5+ years,2+ years\nSKILLS:project m..."
3,\nCurrently enrolled in a full-time degree pro...,"[{'text': 'Ph.D.', 'start': 801, 'end': 806, '...","[{'child': 140, 'head': 137, 'relationLabel': ...",DIPLOMA:Ph.D.\nDIPLOMA_MAJOR:Computer Science\n
4,\nCurrently enrolled in a full-time degree pro...,"[{'text': 'Ph.D.', 'start': 801, 'end': 806, '...","[{'child': 140, 'head': 137, 'relationLabel': ...",DIPLOMA:Ph.D.\nDIPLOMA_MAJOR:Computer Science\n
5,\nMS in a quantitative field such as Operation...,"[{'text': 'MS', 'start': 1, 'end': 3, 'token_s...","[{'child': 8, 'head': 1, 'relationLabel': 'DEG...","DIPLOMA:MS,PhD\nDIPLOMA_MAJOR:Operations Resea..."
6,"\nPh.D. with 5+ years of experience, MS with 7...",[],[],
7,\nGraduating with a Ph.D. in Computer Science ...,"[{'text': 'Ph.D.', 'start': 19, 'end': 24, 'to...","[{'child': 6, 'head': 4, 'relationLabel': 'DEG...",DIPLOMA:Ph.D.\nDIPLOMA_MAJOR:Computer Science\n
8,"\n12+ years of research experience, in-house o...","[{'text': '12+ years', 'start': 1, 'end': 10, ...","[{'child': 5, 'head': 1, 'relationLabel': 'EXP...","EXPERIENCE:12+ years\nSKILLS:research,Ph.D\nDI..."
9,"\nPhD with 3+ years, or MS with 5+ years of ex...","[{'text': '5+ years', 'start': 31, 'end': 39, ...","[{'child': 18, 'head': 10, 'relationLabel': 'E...","EXPERIENCE:5+ years,3+ years,3+ years,5+ years..."


Preprocess the data

In [7]:
# Build one few-shot example per training row that has a non-empty label:
# the document with newlines flattened to spaces, an "Extracted Text:" header,
# the label block, and a "----" terminator line.
train_doc = []
for ent, raw_document in zip(train_df.label, train_df.document):
    docu = raw_document.replace("\n", " ")
    if len(ent) == 0:
        # Rows without any extracted entity are not used as examples.
        continue
    train_doc.append(docu + "\n\nExtracted Text:\n" + ent + "----\n")

# Persist the examples: each one is stripped and written followed by a newline.
with open('../data/data_entity.txt', 'w') as f:
    f.writelines(item.strip() + "\n" for item in train_doc)
    print('Done')

Done


In [8]:
# Turn every test document into an open-ended prompt: newlines flattened to
# spaces, followed by the "Extracted Text:" header the model should complete.
test_doc = [
    document.replace("\n", " ") + '\n\nExtracted Text:'
    for document in test_df.document
]

In [9]:
import os, sys
sys.path.append(os.path.abspath(os.path.join('..')))
import config

In [10]:
# Read the Cohere API key from the local `config` module and create the client
# (`co`) that cohereExtractor.extract calls for text generation.
api_key = config.cohere_api['api_key']
co = cohere.Client(api_key)

In [11]:
#@title Create the prompt (Run this cell to execute required code) {display-mode: "form"}

class cohereExtractor():
    """Few-shot entity extractor built on a fixed list of prompt examples.

    `examples` is the list of already-formatted example strings that is
    prepended to every query. Generation goes through the module-level
    `co` client.
    """

    def __init__(self, examples):
        self.examples = examples

    def make_prompt(self, example):
        """Return the stored examples followed by `example` as one string.

        Every piece is converted with `str` and concatenated with no separator.
        """
        pieces = self.examples + [example]
        return "".join(map(str, pieces))

    def extract(self, example):
        """Generate an extraction for `example` and return its text.

        The prompt from `make_prompt` is sent to `co.generate`; the text of
        the first generation is returned without its final character.
        """
        response = co.generate(
            model='large',
            prompt=self.make_prompt(example),
            max_tokens=100,
            temperature=0.5,
            stop_sequences=["----"])

        generated = response.generations[0].text
        # NOTE(review): the last character is always dropped — presumably to
        # trim part of the trailing stop sequence; confirm against the API output.
        return generated[:-1]


##### Prepare Prompt

In [12]:
# Use the first five training examples as the few-shot context, then preview
# the complete prompt that would be sent for the last test document.
cohereJobExtractor = cohereExtractor(train_doc[:5])
print(cohereJobExtractor.make_prompt(test_doc[-1]))

MS/Ph.D. in CS/EE or related areas is required. Strong ability and effectiveness working in a significant technical problem domain, in the term of plan, design, execution, continuous release and service operation. Strong software engineering fundamentals, including coding, problem solving and data analysis skills. Background in machine learning/deep learning (strongly preferred). Passionate and self-motivated. Ability to effectively work in collaborative multiple project team environment and ship production features in a fast-paced environment. Good communication skills, both verbal and written. Customer/end result/Metrics driven in design and development. Strong ability in self-learning, entering new domain, managing through uncertainty in an innovative team environment

Extracted Text:
DIPLOMA:MS/Ph.D.
DIPLOMA_MAJOR:CS/EE
----
1+ years development experience on Java stack AppConnect / API's experience is added advantage. Compute, Network and Storage Monitoring Tools (Ex: Netcool) App

In [13]:
# Run the extractor over every test prompt.
# `results` is kept index-aligned with `test_doc`: a failed call records None
# instead of being skipped, so both lists always have the same length and can
# be combined into a single DataFrame afterwards.
results = []
for text in tqdm(test_doc):
    try:
        extracted_text = cohereJobExtractor.extract(text)
    except Exception as e:
        # Best-effort run: report the failure and continue with a placeholder.
        print('ERROR: ', e)
        extracted_text = None
    else:
        print(extracted_text)
    results.append(extracted_text)

  9%|▉         | 1/11 [00:04<00:40,  4.10s/it]


EXPERIENCE:1+ year(s)
SKILLS:research,programming,researcher,AI,machine learning,computer science,statistics,applied mathematics,data science,
DIPLOMA:Ph.D
DIPLOMA_MAJOR:CS,Statistics
---


 18%|█▊        | 2/11 [00:08<00:36,  4.04s/it]


EXPERIENCE:2+ years
SKILLS:online advertising,research
---


 27%|██▋       | 3/11 [00:11<00:31,  3.91s/it]


EXPERIENCE:5+ years
SKILLS:program,project
---


 36%|███▋      | 4/11 [00:15<00:27,  3.99s/it]


EXPERIENCE:1+ years
SKILLS:research,software engineering
DIPLOMA:Ph.D.
DIPLOMA_MAJOR:computer science
---


 45%|████▌     | 5/11 [00:19<00:23,  3.99s/it]


EXPERIENCE:Currently enrolled
SKILLS:Machine learning,AI,computer vision,natural language processing,computational neuroscience,optimization,computer science,statistics,applied mathematics,data science
---


 55%|█████▍    | 6/11 [00:23<00:19,  3.92s/it]


EXPERIENCE:7+ years
SKILLS:algorithms,optimization,R,Python
---


 64%|██████▎   | 7/11 [00:30<00:19,  4.84s/it]


EXPERIENCE:5+ years
SKILLS:Physics,Electrical Engineering,Computer Science,or a related technical field
---


 73%|███████▎  | 8/11 [00:34<00:13,  4.56s/it]


EXPERIENCE:Graduating with a Ph.D.
SKILLS:C/C++,Python,C#,Java
DIPLOMA:Ph.D.
DIPLOMA_MAJOR:CS
---


 82%|████████▏ | 9/11 [00:38<00:08,  4.30s/it]


EXPERIENCE:12+ years
SKILLS:research,research experience,human factors,cognitive psychology
DIPLOMA:Ph.D
DIPLOMA_MAJOR:human-computer interaction,human factors,cognitive psychology
---


 91%|█████████ | 10/11 [00:42<00:04,  4.21s/it]


EXPERIENCE:3+ years
SKILLS:research,analytics,management,finance,compensation,diversity
DIPLOMA:PhD
DIPLOMA_MAJOR:Industrial/Organizational Psychology,Organizational Behavior,Labor Economics,Management,Policy Analysis,
---


100%|██████████| 11/11 [00:48<00:00,  4.39s/it]


EXPERIENCE:1+ year(s)
SKILLS:research
---





In [14]:
# Pair each test prompt with the text extracted for it; both lists must have
# the same length for this construction to succeed.
pd.DataFrame(data={'text': test_doc, 'extracted_text': results})

Unnamed: 0,text,extracted_text
0,"Currently holding a faculty, industry, or gov...","\nEXPERIENCE:1+ year(s)\nSKILLS:research,progr..."
1,2+ years experience in the online advertising...,\nEXPERIENCE:2+ years\nSKILLS:online advertisi...
2,BA/BS 5+ years of program or project manageme...,"\nEXPERIENCE:5+ years\nSKILLS:program,project\..."
3,Currently enrolled in a full-time degree prog...,"\nEXPERIENCE:1+ years\nSKILLS:research,softwar..."
4,Currently enrolled in a full-time degree prog...,\nEXPERIENCE:Currently enrolled\nSKILLS:Machin...
5,MS in a quantitative field such as Operations...,"\nEXPERIENCE:7+ years\nSKILLS:algorithms,optim..."
6,"Ph.D. with 5+ years of experience, MS with 7+...","\nEXPERIENCE:5+ years\nSKILLS:Physics,Electric..."
7,Graduating with a Ph.D. in Computer Science o...,\nEXPERIENCE:Graduating with a Ph.D.\nSKILLS:C...
8,"12+ years of research experience, in-house or...","\nEXPERIENCE:12+ years\nSKILLS:research,resear..."
9,"PhD with 3+ years, or MS with 5+ years of exp...","\nEXPERIENCE:3+ years\nSKILLS:research,analyti..."


In [15]:
# Clean the data
def clean_test(x):
    """Normalize a ground-truth label string for comparison.

    Joins `x` into one string, lowercases it, strips leading whitespace and
    removes every newline and every colon.
    """
    text = "".join(x).lower().lstrip()
    return text.replace("\n", "").replace(":", "")

def clean_extracted(x):
    """Normalize model output so it can be compared with a cleaned label.

    Joins `x` into one string, lowercases it, removes a trailing delimiter
    made of three or more dashes when one is present, strips surrounding
    whitespace at the start, and removes every newline and every colon.

    The delimiter is removed only if it is actually there: output that was
    cut off before the delimiter (or is empty) keeps all of its characters
    instead of losing its last three.
    """
    text = "".join(x).lower().rstrip()

    # Only treat a run of >= 3 trailing dashes as the delimiter, so a value
    # that merely ends in a hyphen is left intact.
    without_dashes = text.rstrip('-')
    if len(text) - len(without_dashes) >= 3:
        text = without_dashes

    text = text.lstrip()
    return text.replace("\n", "").replace(":", "")

In [16]:
from concurrent.futures import ThreadPoolExecutor

# One prompt per test document: newlines flattened to spaces, prefixed with
# "Job: " and followed by the "Extracted Text:" header.
# NOTE(review): the few-shot examples in train_doc and the prompts in test_doc
# carry no "Job: " prefix — confirm the prefix here is intended.
job_prompts = [
    "Job: " + document.replace("\n", " ") + '\n\nExtracted Text:'
    for document in test_df['document']
]

extracted = []
# Run the model to extract the entities
with ThreadPoolExecutor(max_workers=8) as executor:
    # executor.map yields results in the order of the submitted prompts, so
    # `extracted` stays aligned with the rows of test_df.
    for generation in executor.map(cohereJobExtractor.extract, job_prompts):
        extracted.append(str(generation).strip())

In [17]:
# Save results: store the stripped model outputs on the test frame, one per row.
test_df['extracted_text'] = extracted

In [18]:
# Normalize both sides of the comparison, each with its own cleaner.
for column, cleaner in (('extracted_text', clean_extracted),
                        ('label', clean_test)):
    test_df[column] = test_df[column].apply(cleaner)

In [19]:
from difflib import SequenceMatcher

# Row-by-row similarity ratio between the cleaned label and the cleaned
# model output.
score = [
    SequenceMatcher(None, expected, predicted).ratio()
    for expected, predicted in zip(test_df['label'], test_df['extracted_text'])
]

In [20]:
# Compare the label to the extracted text.
# Assign the list directly so scores are matched to rows by position. Wrapping
# it in a DataFrame would align on index labels instead and silently leave NaN
# for any row of test_df whose index label is not in 0..len(score)-1.
test_df['similarity_score'] = score

# Print the accuracy
# (the mean per-row SequenceMatcher ratio, expressed as a percentage)
print(f'Classification accuracy {test_df["similarity_score"].mean() *100}%')

Classification accuracy 48.889085593277315%


### Finetuning a model