In [2]:
import nltk
import numpy as np
import pandas as pd
import time



In [2]:
df = pd.read_json("job_postings_prepped.json")
df.head()

Unnamed: 0,Data,Vector
0,Licensed Insurance Agent While many industries...,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Sales Manager Are dynamic creative marketing p...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Model Risk Auditor Join Us Model Risk Auditor ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Business Manager Business ManagerFirst Baptist...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,NY Studio Assistant YOU COULD BE ONE OF THE MA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [45]:
memory_usage = df.memory_usage(deep=True).sum()
print(f"Memory usage of the DataFrame: {memory_usage/1024/1024/1024} GB")

Memory usage of the DataFrame: 13.41672064177692 GB


### Import the files from pre-processing

In [1]:
import json
with open('inverted_index_processed.json', 'r') as file:
    inverted_index_processed = json.load(file)

with open('tokenized_nostopw_corpus.json', 'r') as file:
    tokenized_nostopw_corpus = json.load(file)

In [51]:
import sys

memory_usage = sys.getsizeof(inverted_index_processed)
print(f"Memory usage of the dictionary: {memory_usage/1024/1024} MB")

Memory usage of the dictionary: 10.000091552734375 MB


In [5]:
def printKeys(obj, number):
    if isinstance(obj, list):
        for row in obj[:3]:
            print(row)
    else:
        first_3_keys = list(obj.keys())[:number]
        print(first_3_keys)

In [6]:
printKeys(inverted_index_processed, 3)

['licensed', 'insurance', 'agent']


### Similarity Search

In [3]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# define a function to calculate semantic similarity between two words using WordNet
def calculate_similarity(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if not synsets1 or not synsets2:
        return 0.0
    max_sim = -1
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = wn.path_similarity(synset1, synset2)
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim
    
# define a function to perform semantic matching of a query against a document
def semantic_matching(query):
    documents = inverted_index_processed

    scores = []

    query_tokens = word_tokenize(query)

    for document in documents:
        doc_tokens = word_tokenize(document)
        similarity_score = 0.0
        for query_token in query_tokens:
            max_sim = -1
            for doc_token in doc_tokens:
                # sim = calculate_similarity(query_token, doc_token)
                sim = calculate_similarity_word2vec(query_token, doc_token)
                if sim > max_sim:
                    max_sim = sim
            similarity_score += max_sim
        scores.append((document, similarity_score / len(query_tokens)))
    return scores

### Calculate the accuracy for the first query

In [9]:
with open('rst.json', 'r') as file:
    rst = json.load(file)

In [10]:
printKeys(rst, 3)

['licensed', 0.1898148148148148]
['insurance', 0.14814814814814814]
['agent', 0.15925925925925927]


In [13]:
tokenized_nostopw_corpus_lower = [[item.lower() for item in inner_list] for inner_list in tokenized_nostopw_corpus]

In [14]:
# df['value'] = 0
# test_df = df.copy()
rst_dict = {keyword.lower(): score for keyword, score in rst}
df_keywords_with_scores = pd.DataFrame(list(rst_dict.items()), columns=['Keyword', 'Score'])
df_keywords_with_scores

Unnamed: 0,Keyword,Score
0,licensed,0.189815
1,insurance,0.148148
2,agent,0.159259
3,while,0.098148
4,many,0.189815
...,...,...
184714,process.actively,0.000000
184715,meetingsdevelopment,0.000000
184716,calendarswork,0.000000
184717,techniqueability,0.000000


In [15]:
# Create a dictionary from keyword_scores for efficient lookup
keyword_scores_dict = dict(zip(df_keywords_with_scores['Keyword'], df_keywords_with_scores['Score']))

# Calculate the total score for each bracket
total_scores = []
for bracket in tokenized_nostopw_corpus_lower:
    total_score = sum(keyword_scores_dict.get(word.lower(), 0) for word in bracket)
    total_scores.append(total_score)

# Create a new dataframe with brackets and total scores
df_with_total_scores = pd.DataFrame({'Data': tokenized_nostopw_corpus_lower, 'Total Score': total_scores})
# Sort the DataFrame by the "Total Score" column in ascending order
# df_with_total_scores_sorted = df_with_total_scores.sort_values(by='Total Score', ascending=False)

# Reset the index of the sorted DataFrame
# df_with_total_scores_sorted.reset_index(drop=True, inplace=True)
# df_with_total_scores_sorted
df_with_total_scores

Unnamed: 0,Data,Total Score
0,"[licensed, insurance, agent, while, many, indu...",24.596086
1,"[sales, manager, are, dynamic, creative, marke...",31.483183
2,"[model, risk, auditor, join, us, model, risk, ...",23.539180
3,"[business, manager, business, managerfirst, ba...",25.918365
4,"[ny, studio, assistant, you, could, be, one, o...",26.237192
...,...,...
15880,"[sanitation, technician, location, :, west, co...",31.474300
15881,"[unit, secretary, job, title, :, unit, secreta...",15.796584
15882,"[radiology, aide, ,, perdiem, job, title, :, r...",25.806056
15883,"[mri, manager, grade, 105, job, type, :, offic...",48.425984


In [16]:
score = pd.DataFrame(df_with_total_scores['Total Score'])
result = pd.concat([df['Data'], score], axis=1)
result = result.dropna()
# result
result = result.sort_values(by='Total Score', ascending=False)
resutl = result.reset_index(drop=True, inplace=True)
result

Unnamed: 0,Data,Total Score
0,"Communications Manager Please note, applicants...",227.778648
1,Research Assistant - PACT Study The Roskamp In...,211.381800
2,Registered Nurse 2 Discover Vanderbilt Univers...,203.682475
3,Eligibility Data Analyst # 3219 GRAIL healthca...,179.590307
4,IT Operations ServiceNow Admin Elastic free op...,171.259264
...,...,...
15879,"Customer Success Manager, Woopra About AppierA...",0.237570
15880,"Implementation Analyst (Hybrid, Libertyville I...",0.207126
15881,Salesforce Developer 𝐒𝐮𝐦𝐦𝐚𝐫𝐲 𝐨𝐟 𝐑𝐨𝐥𝐞 👤𝐑𝐨𝐥𝐞: 𝐒𝐚...,0.134007
15882,"Staff Accountant II, Ops. Accounting Staff Acc...",0.000000


In [17]:
# file_path = 'result.csv'
# output_df = result[:49].reset_index()
# output_df.to_csv(file_path, index=True)

In [23]:
base_result_df = pd.read_csv("resilient_investment_banker.csv")
base_result_df = base_result_df.sort_values(by='Vector', ascending=False)
base_result_df = base_result_df.reset_index(drop=True, inplace=False)
base_result_df

Unnamed: 0.1,Unnamed: 0,Data,Vector
0,1960,Investment Officer The County San Diego's Offi...,0.392322
1,13110,Investment Associate The Client: Was establish...,0.311891
2,13878,Investment Product Specialist Company Allsprin...,0.291462
3,13928,Investment Product Specialist Company Allsprin...,0.291462
4,14051,Investment Product Specialist Company Allsprin...,0.291462
...,...,...,...
15880,5409,Principal Software Developer When join Verizon...,0.000000
15881,5410,Retail Sales Associate When join Verizon Veriz...,0.000000
15882,5411,Territory Associate Job Responsibilities As Hi...,0.000000
15883,5412,Local Strategy & Analytics Manager - Coastal P...,0.000000


In [38]:
# Specify the column you want to compare
column_to_compare = 'Data'
cnt = 0

# Iterate through the rows and compare the specified column
for index, (row1, row2) in enumerate(zip(base_result_df[column_to_compare], result[column_to_compare])):
    if row1 == row2:
        cnt+=1
print("Accuracy:",str(round(((cnt/result.shape[0])*100),2))+"%")


Accuracy: 0.01%


### Calculate the accuracy for the second query

In [61]:
start_time = time.time()
rst2 = semantic_matching('full time senior head nurse position')
end_time = time.time()
# full_time_senior_head_nurse_position
rst_dict = {keyword.lower(): score for keyword, score in rst2}
df_keywords_with_scores = pd.DataFrame(list(rst_dict.items()), columns=['Keyword', 'Score'])
df_keywords_with_scores
print("Time for searching:", end_time-start_time)

Time for searching: 525.3357918262482


In [32]:
# Create a dictionary from keyword_scores for efficient lookup
keyword_scores_dict = dict(zip(df_keywords_with_scores['Keyword'], df_keywords_with_scores['Score']))

# Calculate the total score for each bracket
total_scores = []
for bracket in tokenized_nostopw_corpus_lower:
    total_score = sum(keyword_scores_dict.get(word.lower(), 0) for word in bracket)
    total_scores.append(total_score)

# Create a new dataframe with brackets and total scores
df_with_total_scores = pd.DataFrame({'Data': tokenized_nostopw_corpus_lower, 'Total Score': total_scores})
# Sort the DataFrame by the "Total Score" column in ascending order
# df_with_total_scores_sorted = df_with_total_scores.sort_values(by='Total Score', ascending=False)

# Reset the index of the sorted DataFrame
# df_with_total_scores_sorted.reset_index(drop=True, inplace=True)
# df_with_total_scores_sorted
df_with_total_scores

Unnamed: 0,Data,Total Score
0,"[licensed, insurance, agent, while, many, indu...",34.927444
1,"[sales, manager, are, dynamic, creative, marke...",43.723827
2,"[model, risk, auditor, join, us, model, risk, ...",32.663630
3,"[business, manager, business, managerfirst, ba...",35.830000
4,"[ny, studio, assistant, you, could, be, one, o...",36.906629
...,...,...
15880,"[sanitation, technician, location, :, west, co...",44.596450
15881,"[unit, secretary, job, title, :, unit, secreta...",23.461754
15882,"[radiology, aide, ,, perdiem, job, title, :, r...",36.547085
15883,"[mri, manager, grade, 105, job, type, :, offic...",69.676239


In [33]:
score = pd.DataFrame(df_with_total_scores['Total Score'])
result = pd.concat([df['Data'], score], axis=1)
result = result.dropna()
# result
result = result.sort_values(by='Total Score', ascending=False)
result = result.reset_index(drop=True, inplace=True)
result

Unnamed: 0,Data,Total Score
0,"Communications Manager Please note, applicants...",323.137097
1,Research Assistant - PACT Study The Roskamp In...,299.846823
2,Registered Nurse 2 Discover Vanderbilt Univers...,284.304506
3,Eligibility Data Analyst # 3219 GRAIL healthca...,255.015300
4,IT Operations ServiceNow Admin Elastic free op...,244.088231
...,...,...
15879,"Customer Success Manager, Woopra About AppierA...",0.295653
15880,"Implementation Analyst (Hybrid, Libertyville I...",0.227814
15881,Salesforce Developer 𝐒𝐮𝐦𝐦𝐚𝐫𝐲 𝐨𝐟 𝐑𝐨𝐥𝐞 👤𝐑𝐨𝐥𝐞: 𝐒𝐚...,0.150000
15882,"Staff Accountant II, Ops. Accounting Staff Acc...",0.000000


In [34]:
base_result_df = pd.read_csv("full_time_senior_head_nurse_position.csv")
base_result_df = base_result_df.sort_values(by='Vector', ascending=False)
base_result_df = base_result_df.reset_index(drop=True, inplace=False)
base_result_df

Unnamed: 0.1,Unnamed: 0,Data,Vector
0,8145,Full Time - Steward/Dishwasher Organization- T...,0.362844
1,11657,Registered Nurse (RN) - Full-Time 1st Shift Re...,0.356172
2,8002,Banquet Server (On-Call) Organization- Thompso...,0.348298
3,7910,PM Line Cook | Full-Time Organization- Thompso...,0.347987
4,11636,Licensed Practical Nurse - Full-Time 1st Shift...,0.322254
...,...,...,...
15880,1561,Fortinet SME Job:Fortinet SMEDuration: 1 yearL...,0.000000
15881,1559,Accounting Consultant Description Firm: Beyer ...,0.000000
15882,3779,"AEM Architect In role, play crucial role trans...",0.000000
15883,1555,"Project Manager Newmark Group, Inc. (Nasdaq: N...",0.000000


In [37]:
# Specify the column you want to compare
column_to_compare = 'Data'
cnt = 0

# Iterate through the rows and compare the specified column
for index, (row1, row2) in enumerate(zip(base_result_df[column_to_compare], result[column_to_compare])):
    if row1 == row2:
        cnt+=1
print("Accuracy:", str(round(((cnt/result.shape[0])*100),2))+"%")

Accuracy: 0.01%


## Spacial Analysis

In [58]:
import sys

memory_usage = df.memory_usage(deep=True).sum()
print(f"Memory usage of the Dataset in csv version: {memory_usage/1024/1024/1024} GB")
memory_usage = sys.getsizeof(inverted_index_processed)
print(f"Memory usage of the pre-processed model: {memory_usage/1024/1024} MB")
memory_usage = sys.getsizeof(rst)
print(f"Memory usage of the search result: {memory_usage/1024/1024} MB")

Memory usage of the Dataset in csv version: 13.41672064177692 GB
Memory usage of the pre-processed model: 10.000091552734375 MB
Memory usage of the search result: 1.5488204956054688 MB
