In [2]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import streamlit as st
import os
import openai

In [3]:
# Load the raw resume and job-posting exports. The paths are built with
# os.path.join so the notebook also runs on macOS/Linux (the previous
# backslash literals only resolved on Windows).
resume = (
    pd.read_csv(os.path.join('raw_data', 'resume_data_231228.csv'))
    .dropna()            # discard rows with any missing field
    .drop_duplicates()   # discard exact duplicate rows
)
jobs = pd.read_csv(os.path.join('raw_data', 'jobs_info.csv'), index_col=0)
# Show one random resume row as a sanity check.
resume.sample()

Unnamed: 0,Title,Raw_html,Resume
6244,"pd / casualty / casualty relief adjuster, casu...","<div class=""document fontsize fontface vmargin...","Jessica Claire Montgomery Street , San Francis..."


In [4]:
# Shortlist a single profile for the demo; the same process can be applied to all the jobs.
# A resume is kept only when it mentions Canada and Python and its title contains "data scientist".
is_target_profile = (
    resume['Resume'].str.contains('Canada')
    & resume['Resume'].str.contains('Python')
    & resume['Title'].str.contains('data scientist')
)
resume = resume[is_target_profile].reset_index(drop=True)
# Raw HTML of the first shortlisted resume
resume['Raw_html'][0]

'<div class="document fontsize fontface vmargins hmargins pagesize skn-mlc1 MLC1"><div class="topsection" id="CONTAINER_PARENT_0"><div class="left-box" id="CONTAINER_0"><div class="section notdraggable section SECTION_NAME firstsection" data-section-cd="NAME" id="SECTION_NAME99faf10c-1f91-43f4-b8a0-2d5a467a4d36"><div class="paragraph PARAGRAPH_NAME firstparagraph" id="PARAGRAPH_NAME_20ed3e20-9261-43dc-b03a-d41abc18ab8c"><div> <div class="name"> <div dependency="FNAM" id="FIELD_FNAM">Jessica</div> <div dependency="LNAM" id="FIELD_LNAM">Claire</div> <div class="profTitle hide" dependency="DCTL"><span class="placeholder-text" id="FIELD_DCTL"></span></div> </div> </div></div></div><div class="section SECTION_CNTC notdraggable section SECTION_CNTC" data-section-cd="CNTC" id="SECTION_CNTC2fc0e61d-0dbc-4ae7-bb8f-b060d19db30e"><div class="paragraph PARAGRAPH_CNTC firstparagraph" id="PARAGRAPH_CNTC_006a9b37-3700-4d10-8b4f-28184a693cf2"><div class="clearfix doc-item"> <div class="address"> <div 

In [5]:
# Peek at one random job posting
jobs.sample(n=1)

Unnamed: 0,organization_name,title,locations,organization_topics,organization_industry_tags,url,has_description,description
2625,GFIT Wellness,Physiotherapist,"['Winnipeg, MB, Canada']","['10-49', 'B2B', 'B2C', 'Healthcare & wellness']","['Education', 'Health', 'Sports', 'Wellness an...",https://www1.communitech.ca/companies/gfit-wel...,True,"<div data-testid=""careerPage""><img height=""1"" ..."


In [6]:
# Keep only postings that come with a description, restricted to the columns used downstream.
job_columns = [
    'organization_name', 'title', 'locations', 'organization_topics',
    'organization_industry_tags', 'description', 'url',
]
jobs = jobs[jobs['has_description'].eq(True)]
jobs = jobs[job_columns].reset_index(drop=True)
jobs.sample()

Unnamed: 0,organization_name,title,locations,organization_topics,organization_industry_tags,description,url
3930,MarketBox,Customer Success Specialist,['Remote'],"['Agnostic (all)', 'B2B', 'Data & analytics', ...","['Finance', 'Information Technology', 'Softwar...","<div data-testid=""careerPage""><p><strong>About...",https://www1.communitech.ca/companies/marketbo...


In [7]:
# Cast every column to str so the string operations below work on all of them
jobs = jobs.astype(str)

In [8]:
# Strip the HTML markup from every job description, keeping only the visible text.
# Series.map works on the values directly, so the result does not depend on the
# index being a clean 0..n-1 range (the previous enumerate + .loc version used
# positions as labels and would append new rows on any other index).
jobs['description'] = jobs['description'].map(
    lambda html: BeautifulSoup(html, 'html.parser').get_text()
)
jobs.sample()

Unnamed: 0,organization_name,title,locations,organization_topics,organization_industry_tags,description,url
3258,Uken Games,Express Your Interest!,[],"['B2C', 'Gaming & esports', 'Software']","['Gaming', 'Media', 'Software', 'Travel']",We're always open to hearing from individuals...,https://www1.communitech.ca/companies/uken-gam...


In [9]:
# Location filter for Toronto.
# An explicit copy is taken because a new column (description_2) is assigned to
# this frame later; assigning to a filtered slice raises SettingWithCopyWarning
# on pandas versions without copy-on-write.
jobs = jobs[jobs['locations'].str.contains('Toronto')].copy()
jobs.sample()

Unnamed: 0,organization_name,title,locations,organization_topics,organization_industry_tags,description,url
2820,Dataperformers,"Analyst/Senior Associate, Financial Instrument...","['Toronto, ON, Canada']","['AI/ML', 'B2B', 'Media & entertainment', 'Saa...","['DeepTech', 'Finance', 'Information Technolog...",Job Type: Permanent \nReference code: 123549 ...,https://www1.communitech.ca/companies/dataperf...


In [10]:
# Text Processing Functions
def text_preprocessing(text):
    """Normalise raw text into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, flatten newlines/tabs, strip HTML markup,
    tokenize, drop English stop words, Porter-stem, WordNet-lemmatize and
    finally drop tokens that consist only of punctuation characters.

    Args:
        text: Value to clean; it is passed through ``str()`` first.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Convert text to lowercase
    text = str(text).lower()
    # Replace newline and tab characters with spaces
    text = text.replace('\n', ' ').replace('\t', ' ')
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Remove punctuation tokens. The test is per character: a plain
    # `word in string.punctuation` is a substring check, which lets
    # multi-character tokens such as "``", "''" or "..." slip through.
    punctuation = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation for char in word)
    ]
    # Join tokens back into text
    return ' '.join(tokens)

In [11]:
# This step can be skipped for full deployment.
# Clean the resume and job texts; the cleaned columns feed the job shortlisting.
resume = resume.assign(Resume_2=resume['Resume'].apply(text_preprocessing))
jobs = jobs.assign(
    description_2=jobs['description'].fillna('').apply(text_preprocessing)
)

In [12]:
# Fit one TF-IDF vocabulary on resumes and job descriptions together
all_texts = pd.concat([resume['Resume_2'], jobs['description_2']])
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(raw_documents=all_texts)

In [13]:
# Project resumes and job postings into the shared TF-IDF space
resumes_tfidf, job_postings_tfidf = (
    vectorizer.transform(texts)
    for texts in (resume['Resume_2'], jobs['description_2'])
)

In [14]:
# Cosine similarity between every resume (rows) and every job posting (columns)
similarity_matrix = cosine_similarity(resumes_tfidf, job_postings_tfidf)
# Scale so that the best match scores 1.0. When no text overlaps at all the
# maximum is 0; dividing by it would fill the matrix with NaN, so the zeros
# are kept unchanged in that case.
max_similarity = np.max(similarity_matrix)
normalized_similarity_matrix = (
    similarity_matrix / max_similarity if max_similarity > 0 else similarity_matrix
)

In [15]:
# Top 10 jobs for the first resume, best match first
# (the earlier comment said 15, but the slice has always taken 10).
TOP_N_JOBS = 10
best_match_positions = np.argsort(normalized_similarity_matrix[0])[::-1][:TOP_N_JOBS]
# iloc is positional, matching the column order of the similarity matrix.
applied = jobs.iloc[best_match_positions].reset_index(drop=True)
applied.sample(2)

Unnamed: 0,organization_name,title,locations,organization_topics,organization_industry_tags,description,url,description_2
4,Dataperformers,"Data Quality Analyst, Deloitte Global Technolo...","['Toronto, ON, Canada']","['AI/ML', 'B2B', 'Media & entertainment', 'Saa...","['DeepTech', 'Finance', 'Information Technolog...",Job Type: Permanent \nReference code: 124035 ...,https://www1.communitech.ca/companies/dataperf...,job type perman refer code 124035 primari loca...
9,Interac,Analytics Engineering Senior Specialist,"['Toronto, ON, Canada']",[],['Finance'],"Analytics Engineering, Senior SpecialistAt Int...",https://www1.communitech.ca/companies/interac/...,analyt engin senior specialistat interac desig...


In [16]:
# Inspect the resume columns before selecting the ones to keep
resume.columns

Index(['Title', 'Raw_html', 'Resume', 'Resume_2'], dtype='object')

In [17]:
# Keep the display columns and give them presentation-friendly names
applied = applied[
    ['organization_name', 'title', 'locations', 'organization_topics',
     'organization_industry_tags', 'description', 'url']
].rename(columns={
    'organization_name': 'Company',
    'title': 'Job_title',
    'locations': 'location',
    'organization_topics': 'tags',
    'organization_industry_tags': 'industry',
})
resume = resume[['Title', 'Raw_html', 'Resume']].rename(columns={'Raw_html': 'html'})

In [6]:
# Save dataframes to CSV files
# resume.to_csv('DataResume/ResumeDisplay.csv', index=False)
# applied.to_csv('DataJobs/JobsDisplay.csv', index=False)

In [18]:
# Drop the HTML/url columns; only the text fields are summarised and stored
resume_storage_columns = ['Title', 'Resume']
applied_storage_columns = ['Company', 'Job_title', 'location', 'tags', 'industry', 'description']
resume = resume[resume_storage_columns]
applied = applied[applied_storage_columns]

In [19]:
# applied['description'] = applied['Job_title'] + '. Job details: ' + applied['description']

# applied['description'] = applied['Job_title'] + 'at the company - ' + applied['Company'] + ' with description: ' + applied['description']

# # Select relevant columns for jobs DataFrame
# applied = applied[['Job_title', 'description']]

In [None]:
<pause run all>

We can use the following code in the pipeline to summarize the job postings and the resume, but doing so could lose key information. <br>
The process is not controlled, and we need an SFT to identify the key information.

In [20]:
import openai
import os

# Read the OpenAI key from the environment (None when the variable is unset)
openai.api_key = os.getenv("OPENAI_API_KEY")

def summarize_text(prompt, key):
    """Summarise a resume or a job posting with the OpenAI chat completions API.

    Args:
        prompt: Text to summarise; it is appended directly to the instruction.
        key: ``'r'`` selects the resume instruction; any other value selects
            the job-posting instruction.

    Returns:
        The message content of the first completion choice.
    """
    client = openai.OpenAI()

    if key == 'r':
        instruction = "i need to process my resume as embedding for chatgpt so tell me the summarized core key valuable information of at least 600 words of clean text (no bullet points, no headings) for smaller computation requirements and faster response:"
    else:
        instruction = "i need to process job posting as embedding for chatgpt so summarize into the core key valuable information of at least 600 words of clean text (no bullet points, no headings) for smaller computation requirements and faster response:"

    conversation = [
        {"role": "system", "content": "You are a text summarizer."},
        {"role": "user", "content": instruction + prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=0.5,
        max_tokens=1024,
        n=1,
        stop=None,
    )

    # Hand back only the generated text, not the response object
    first_choice = completion.choices[0]
    return first_choice.message.content

# Replace the full resume and job texts with their summaries.
# `resume` and `applied` must already be defined by the cells above.
resume['Resume'] = resume['Resume'].apply(summarize_text, key='r')
applied['description'] = applied['description'].apply(summarize_text, key='j')


In [39]:
# resume.to_csv('./DataResume/ResumeStorage.csv', index=False)
# Create the target folder first: to_csv raises OSError when the parent
# directory does not exist (e.g. on a fresh checkout).
os.makedirs('./DataJobs', exist_ok=True)
applied.to_csv('./DataJobs/JobsStorage.csv', index=False)

In [3]:
from llama_index.core import SimpleDirectoryReader
# from llama_index.readers import PandasCSVReader
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SimilarityPostprocessor

PERSIST_DIR_J = './storageDataJobs'
DATA_DIR_J = 'DataJobs'

PERSIST_DIR_R = './storageDataResume'
DATA_DIR_R = 'DataResume'


def _build_and_persist_index(data_dir, persist_dir):
    """Load every file under ``data_dir``, index it and persist the index.

    Args:
        data_dir: Folder handed to ``SimpleDirectoryReader`` as ``input_dir``;
            it is read recursively.
        persist_dir: Folder the index's storage context is persisted to.

    Returns:
        tuple: ``(documents, index)`` - the loaded documents and the
        ``VectorStoreIndex`` built from them.
    """
    documents = SimpleDirectoryReader(
        input_dir=data_dir,
        filename_as_id=True,
        input_files=None,
        # exclude = 'resume.csv',
        exclude_hidden=True,
        errors="ignore",
        recursive=True,  # for sub folders
        encoding="utf-8",
        # required_exts = ['.csv'],
        # file_extractor = ['.csv', PandasCSVReader],
    ).load_data()
    vector_index = VectorStoreIndex.from_documents(documents, show_progress=True)
    vector_index.storage_context.persist(persist_dir=persist_dir)
    return documents, vector_index


# The same reader/index settings are used for both corpora; only the folders
# differ, and they now come from the constants above instead of repeated literals.
documents_j, index_j = _build_and_persist_index(DATA_DIR_J, PERSIST_DIR_J)
documents_r, index_r = _build_and_persist_index(DATA_DIR_R, PERSIST_DIR_R)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 34.69it/s]
Generating embeddings: 100%|██████████| 21/21 [00:00<00:00, 24.69it/s]
Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 69.03it/s]
Generating embeddings: 100%|██████████| 15/15 [00:00<00:00, 34.31it/s]


In [19]:
from llama_index.core import SimpleDirectoryReader
# from llama_index.readers import PandasCSVReader
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SimilarityPostprocessor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
import os

Fine-tuning the job description using a text splitter to handle more jobs. <br>
This step is not necessary but improves the reaction time. <br>
It depends on what is expected of the chatbot. This upgrade is not necessary, but since latency is a consideration we parse the documents into finer text chunks. <br>

In [20]:



PERSIST_DIR_J = './storageDataJobs'
DATA_DIR_J = 'DataJobs'

# Load every job file (sub-folders included; hidden files skipped)
documents_j = SimpleDirectoryReader(
    input_dir='./DataJobs',
    input_files=None,
    filename_as_id=True,
    recursive=True,
    exclude_hidden=True,
    encoding="utf-8",
    errors="ignore",
).load_data()

# Split the documents into 512-token chunks that overlap by 128 tokens
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128, separator=" ")
pipeline = IngestionPipeline(transformations=[text_splitter])
nodes_j = pipeline.run(documents=documents_j, show_progress=True, in_place=True)

# Build the index from the chunked nodes and persist it
index_j = VectorStoreIndex(nodes_j)
index_j.storage_context.persist(persist_dir=PERSIST_DIR_J)


  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 2/2 [00:00<00:00,  2.80it/s]


In [None]:
<pause run all>

Pinecone index

In [4]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display
import os

In [15]:
from pinecone import Pinecone
# Connect to the existing "dotslive" Pinecone index; the API key is read from the environment
host = "https://dotslive-epx6c5s.svc.gcp-starter.pinecone.io"
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(host=host, name="dotslive")

In [27]:
import pandas as pd
from openai import OpenAI
import os
import openai

# Read the OpenAI key from the environment and create the client used for embeddings
openai.api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI()


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings API.

    Args:
        text: String to embed; newlines are replaced by spaces first.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(model=model, input=[single_line])
    first_item = result.data[0]
    return first_item.embedding


# Read the stored job summaries back in
df = pd.read_csv('./DataJobs/JobsStorage.csv')

# Embed each description with the ada model
df['ada_embedding'] = df['description'].apply(get_embedding, model='text-embedding-ada-002')

# Write the frame, embeddings included, to CSV.
# NOTE(review): this file lands in ./DataJobs, which the SimpleDirectoryReader
# cells read recursively - confirm it should be picked up when the index is rebuilt.
df.to_csv('./DataJobs/embedded_Jobs.csv', index=False)

In [2]:
import pandas as pd
# Preview the stored job summaries without rebinding `df`: the upsert cell
# reads df['ada_embedding'], a column that a fresh read of JobsStorage.csv
# does not contain.
pd.read_csv('./DataJobs/JobsStorage.csv').head(3)

Unnamed: 0,Job_title,description
0,Data Architect,The job posting is for a Data Architect positi...
1,Senior Data Engineer,"Plooto, a company focused on helping Small and..."
2,"Senior Consultant, Risk and Regulatory Consult...",Deloitte is seeking a Senior Consultant for Ri...


In [33]:
import json

# Build the Pinecone payload: one record per job, keyed by the DataFrame row label.
# Embeddings that were round-tripped through CSV arrive as JSON-style strings, so
# those are decoded; in-memory embeddings are already sequences of floats.
# (The old json.loads(json.dumps(...)) round trip left both cases unchanged.)
# The loop variable is not called `index`, to keep it distinct from the Pinecone handle.
vectors = [
    {
        "id": str(row_label),
        "values": json.loads(embedding) if isinstance(embedding, str) else list(embedding),
    }
    for row_label, embedding in df['ada_embedding'].items()
]

upsert_response = index.upsert(
    vectors=vectors,
    namespace="jobs_vector",
)

upsert_response


{'upserted_count': 10}

We have uploaded the vectors into Pinecone.