# Calculate Cosine Similarity Score (KSA, JD_usajobs)

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

# Load the two inputs: the job postings CSV into JD and the KSA workbook into KSA.
# NOTE(review): both paths are absolute and machine-specific, so this cell only runs
# on a machine that has this exact D:\ layout.
JD = pd.read_csv('D:\\study\\Gmu\\Sem-4\\690\\Data\\JD_usajobs.csv')


KSA = pd.read_excel('D:\\study\\Gmu\\Sem-4\\690\\Data\\KSA.xlsx')

In [3]:
# Display the job-postings frame read from JD_usajobs.csv.
JD

Unnamed: 0,Title,Location,Organization,Department,QualificationSummary
0,Director (Cyber Policy and Programs),"Washington, District of Columbia",Office of the National Cyber Director,Executive Office of the President,In order to qualify for the Director (Cyber Po...
1,CYBER OPERATIONS PLANS ANALYST,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,"In order to qualify, you must meet the special..."
2,SUPERVISORY CYBER POLICY-PLANS AND PROGRAMS,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,This is a GG-14 position in the Cyber Excepted...
3,Information Technology (Cybersecurity),Anywhere in the U.S. (remote job),Department of Energy - Agency Wide,Department of Energy,You must meet the Basic Requirement and the Sp...
4,PROGRAM ANALYST,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,"In order to qualify, you must meet the special..."
...,...,...,...,...,...
478,Supervisory Biomedical Engineer (Clinical),Multiple Locations,Veterans Health Administration,Department of Veterans Affairs,Applicants pending the completion of education...
479,Diagnostic Radiologic Technologist - Intervent...,"Bay Pines, Florida",Veterans Health Administration,Department of Veterans Affairs,Applicants pending the completion of education...
480,Supervisory IT Specialist (Deputy Director),"Washington, District of Columbia",Commodity Futures Trading Commission,Other Agencies and Independent Organizations,"To qualify for the CT-16, you must possess at ..."
481,Biomedical Equipment Specialist (BESS),"Cincinnati, Ohio",Veterans Health Administration,Department of Veterans Affairs,Applicants pending the completion of education...


In [4]:
# Display the KSA frame read from KSA.xlsx.
KSA

Unnamed: 0,OPM Code,NICE Specialty Area,NICE Specialty Area Description,Work Role,Work Role Description,KSA ID,KSA Description
0,511,Cybersecurity Defense Analysis (CDA),Uses defensive measures and information collec...,Cyber Defense Analyst,Uses data collected from a variety of cyber de...,A0010,Ability to analyze malware.
1,461,Systems Analysis (ANA),Studies an organization's current computer sys...,Systems Security Analyst,Responsible for the analysis and development o...,A0015,Ability to conduct vulnerability scans and rec...
2,511,Cybersecurity Defense Analysis (CDA),Uses defensive measures and information collec...,Cyber Defense Analyst,Uses data collected from a variety of cyber de...,A0066,Ability to accurately and completely source al...
3,531,Incident Response (CIR),Responds to crises or urgent situations within...,Cyber Defense Incident Responder,"Investigates, analyzes, and responds to cyber ...",A0121,Ability to design incident response for cloud ...
4,461,Systems Analysis (ANA),Studies an organization's current computer sys...,Systems Security Analyst,Responsible for the analysis and development o...,A0123,Ability to apply cybersecurity and privacy pri...
...,...,...,...,...,...,...,...
124,511,Cybersecurity Defense Analysis (CDA),Uses defensive measures and information collec...,Cyber Defense Analyst,Uses data collected from a variety of cyber de...,S0169,Skill in conducting trend analysis.
125,531,Incident Response (CIR),Responds to crises or urgent situations within...,Cyber Defense Incident Responder,"Investigates, analyzes, and responds to cyber ...",S0173,Skill in using security event correlation tools.
126,531,Incident Response (CIR),Responds to crises or urgent situations within...,Cyber Defense Incident Responder,"Investigates, analyzes, and responds to cyber ...",S0365,Skill to design incident response for cloud se...
127,461,Systems Analysis (ANA),Studies an organization's current computer sys...,Systems Security Analyst,Responsible for the analysis and development o...,S0367,Skill to apply cybersecurity and privacy princ...


In [5]:

# NLTK helpers shared by every preprocess_text call; filled on first use so the
# stop-word list is not reloaded once per row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop_words, stemmer, lemmatizer) triple, building it once."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = PorterStemmer()
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _TEXT_RESOURCES['stop_words'],
        _TEXT_RESOURCES['stemmer'],
        _TEXT_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalise one free-text value into a space-separated string of tokens.

    Steps, in order: lowercase; delete every character that is neither a word
    character nor whitespace; drop NLTK English stop words; Porter-stem each
    remaining word; WordNet-lemmatise each stem.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.
        ``None`` and float NaN (a missing cell) are treated as empty text
        instead of being turned into the literal words "none" / "nan".

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' for missing input).
    """
    # A missing cell must not contribute a fake "nan"/"none" token to the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, stemmer, lemmatizer = _get_text_resources()

    # Remove punctuation and convert to lowercase.
    # NOTE(review): punctuation is deleted, not replaced by a space, so
    # "certification/licensure" becomes the single token "certificationlicensure".
    text = re.sub(r'[^\w\s]', '', str(text).lower())

    # Remove stop words.
    words = [word for word in text.split() if word not in stop_words]

    # Stem, then lemmatise the stem.
    # NOTE(review): lemmatising after stemming operates on stems rather than
    # real words (e.g. "possess" ends up as "posse"); kept as-is so scores
    # stay comparable with earlier runs.
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in words)



# Clean both free-text columns element-wise; each column is overwritten with its
# processed form, so the raw text is no longer available after this cell.
KSA['KSA Description'] = KSA['KSA Description'].map(preprocess_text)
JD['QualificationSummary'] = JD['QualificationSummary'].map(preprocess_text)

In [6]:
# Inspect the job-posting text column after preprocess_text has been applied.
JD['QualificationSummary']

0      order qualifi director cyber polici program po...
1      order qualifi must meet special experi requir ...
2      gg14 posit cyber except servic ce gg14 duti pr...
3      must meet basic requir special experi qualifi ...
4      order qualifi must meet special experi requir ...
                             ...                        
478    applic pend complet educ certificationlicensur...
479    applic pend complet educ certificationlicensur...
480    qualifi ct16 must posse least one year special...
481    applic pend complet educ certificationlicensur...
482    applic pend complet educ certificationlicensur...
Name: QualificationSummary, Length: 483, dtype: object

In [8]:
# Inspect the KSA description column after preprocess_text has been applied.
KSA['KSA Description']

0                                     abil analyz malwar
1      abil conduct vulner scan recogn vulner secur s...
2      abil accur complet sourc data use intellig ass...
3           abil design incid respons cloud servic model
4      abil appli cybersecur privaci principl organiz...
                             ...                        
124                          skill conduct trend analysi
125                    skill use secur event correl tool
126        skill design incid respons cloud servic model
127    skill appli cybersecur privaci principl organi...
128    skill use cyber defens servic provid report st...
Name: KSA Description, Length: 129, dtype: object

In [9]:

# Merge every KSA description into one long space-separated document, so each job
# posting is scored against the KSA set as a whole.
KSA_string = ' '.join(KSA['KSA Description'].tolist())

# Wrap that single document in a one-row frame (row label 0, column 'KSA').
KSA_df = pd.DataFrame(data={'KSA': KSA_string}, index=[0])

KSA_df

Unnamed: 0,KSA
0,abil analyz malwar abil conduct vulner scan re...


In [10]:

# Stack the job-posting texts and the single combined KSA document into one Series
# with a fresh 0..n index. The KSA document is appended last, which is what lets
# the later cells address it as row -1 of the vectorised matrix.
concatenated_text = pd.concat([JD['QualificationSummary'], KSA_df['KSA']], ignore_index=True)


concatenated_text

0      order qualifi director cyber polici program po...
1      order qualifi must meet special experi requir ...
2      gg14 posit cyber except servic ce gg14 duti pr...
3      must meet basic requir special experi qualifi ...
4      order qualifi must meet special experi requir ...
                             ...                        
479    applic pend complet educ certificationlicensur...
480    qualifi ct16 must posse least one year special...
481    applic pend complet educ certificationlicensur...
482    applic pend complet educ certificationlicensur...
483    abil analyz malwar abil conduct vulner scan re...
Length: 484, dtype: object

## Approach 1: Use CountVectorizer

In [11]:

# Term-count vectoriser with default settings.
ctvectorizer = CountVectorizer()


# Learn the vocabulary from all documents (job postings plus the combined KSA
# document) and build the document-term count matrix in one step.
ct_matrix = ctvectorizer.fit_transform(concatenated_text)


# Show the sparse matrix summary (rows = documents, columns = vocabulary terms).
ct_matrix

<484x4324 sparse matrix of type '<class 'numpy.int64'>'
	with 89976 stored elements in Compressed Sparse Row format>

In [12]:

# Compare every job-posting row (all rows but the last) against the combined KSA
# document (the last row); the result has one column, one value per posting.
cosine_ct = cosine_similarity(ct_matrix[:-1], ct_matrix[-1])

# Flatten the single-column result and store it next to the postings.
JD['cos_KSA_countvect'] = cosine_ct.ravel()

In [22]:

# Express the count-vector similarity as a percentage (0-100).
# Computed from cosine_ct rather than from the column itself, so running this cell
# more than once cannot compound the x100 scaling.
JD['cos_KSA_countvect'] = cosine_ct[:, 0] * 100

In [23]:

# Summary statistics of the count-vector similarity scores.
print(JD['cos_KSA_countvect'].describe())

count    483.000000
mean      19.458825
std        9.382529
min        0.000000
25%       13.562579
50%       17.651284
75%       24.710630
max       51.419322
Name: cos_KSA_countvect, dtype: float64


## Approach 2: Use TF-IDF Vectorizer

In [24]:

# TF-IDF vectoriser with default settings, fitted on the same documents as the
# count-based approach (job postings followed by the combined KSA document).
tfvectorizer = TfidfVectorizer()
tfidf_matrix = tfvectorizer.fit_transform(concatenated_text)

# Score each posting (every row but the last) against the KSA document (last row).
cosine_tfidf = cosine_similarity(tfidf_matrix[:-1], tfidf_matrix[-1])

# Flatten the single-column result and store it next to the postings.
JD['cos_KSA_tfidf'] = cosine_tfidf.ravel()

In [25]:

# Express the TF-IDF similarity as a percentage (0-100).
# Computed from cosine_tfidf rather than from the column itself, so running this
# cell more than once cannot compound the x100 scaling.
JD['cos_KSA_tfidf'] = cosine_tfidf[:, 0] * 100

In [26]:

# Summary statistics of the TF-IDF similarity scores.
print(JD['cos_KSA_tfidf'].describe())

count    483.000000
mean      12.463967
std        6.228819
min        0.000000
25%        8.250293
50%       11.269752
75%       16.426129
max       36.063722
Name: cos_KSA_tfidf, dtype: float64


In [27]:
# Display the postings together with both similarity columns.
JD

Unnamed: 0,Title,Location,Organization,Department,QualificationSummary,cos_KSA_countvect,cos_KSA_tfidf
0,Director (Cyber Policy and Programs),"Washington, District of Columbia",Office of the National Cyber Director,Executive Office of the President,order qualifi director cyber polici program po...,16.865809,15.881155
1,CYBER OPERATIONS PLANS ANALYST,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,order qualifi must meet special experi requir ...,29.378216,16.372615
2,SUPERVISORY CYBER POLICY-PLANS AND PROGRAMS,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,gg14 posit cyber except servic ce gg14 duti pr...,32.714291,18.452627
3,Information Technology (Cybersecurity),Anywhere in the U.S. (remote job),Department of Energy - Agency Wide,Department of Energy,must meet basic requir special experi qualifi ...,21.614236,18.228973
4,PROGRAM ANALYST,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,order qualifi must meet special experi requir ...,43.152110,22.450201
...,...,...,...,...,...,...,...
478,Supervisory Biomedical Engineer (Clinical),Multiple Locations,Veterans Health Administration,Department of Veterans Affairs,applic pend complet educ certificationlicensur...,13.836965,8.101932
479,Diagnostic Radiologic Technologist - Intervent...,"Bay Pines, Florida",Veterans Health Administration,Department of Veterans Affairs,applic pend complet educ certificationlicensur...,24.031446,10.096924
480,Supervisory IT Specialist (Deputy Director),"Washington, District of Columbia",Commodity Futures Trading Commission,Other Agencies and Independent Organizations,qualifi ct16 must posse least one year special...,27.591750,17.679071
481,Biomedical Equipment Specialist (BESS),"Cincinnati, Ohio",Veterans Health Administration,Department of Veterans Affairs,applic pend complet educ certificationlicensur...,17.427543,10.313489


In [31]:
# Postings whose count-vector similarity exceeds 10 (on the 0-100 scale set by the
# earlier x100 cell).
JD.loc[JD['cos_KSA_countvect'] > 10]

Unnamed: 0,Title,Location,Organization,Department,QualificationSummary,cos_KSA_countvect,cos_KSA_tfidf
0,Director (Cyber Policy and Programs),"Washington, District of Columbia",Office of the National Cyber Director,Executive Office of the President,order qualifi director cyber polici program po...,16.865809,15.881155
1,CYBER OPERATIONS PLANS ANALYST,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,order qualifi must meet special experi requir ...,29.378216,16.372615
2,SUPERVISORY CYBER POLICY-PLANS AND PROGRAMS,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,gg14 posit cyber except servic ce gg14 duti pr...,32.714291,18.452627
3,Information Technology (Cybersecurity),Anywhere in the U.S. (remote job),Department of Energy - Agency Wide,Department of Energy,must meet basic requir special experi qualifi ...,21.614236,18.228973
4,PROGRAM ANALYST,"Fort Meade, Maryland",AFELEM US CYBER COMMAND,Department of the Air Force,order qualifi must meet special experi requir ...,43.152110,22.450201
...,...,...,...,...,...,...,...
478,Supervisory Biomedical Engineer (Clinical),Multiple Locations,Veterans Health Administration,Department of Veterans Affairs,applic pend complet educ certificationlicensur...,13.836965,8.101932
479,Diagnostic Radiologic Technologist - Intervent...,"Bay Pines, Florida",Veterans Health Administration,Department of Veterans Affairs,applic pend complet educ certificationlicensur...,24.031446,10.096924
480,Supervisory IT Specialist (Deputy Director),"Washington, District of Columbia",Commodity Futures Trading Commission,Other Agencies and Independent Organizations,qualifi ct16 must posse least one year special...,27.591750,17.679071
481,Biomedical Equipment Specialist (BESS),"Cincinnati, Ohio",Veterans Health Administration,Department of Veterans Affairs,applic pend complet educ certificationlicensur...,17.427543,10.313489


In [41]:
# Split "City, State[, ...]" into one column per comma-separated part, then trim
# each part: splitting on "," alone leaves a leading space (" Maryland"), which
# would make otherwise equal state names compare and group as different values.
J1 = JD['Location'].str.split(",", expand = True).apply(lambda part: part.str.strip())

In [42]:
# Name the split parts; the third part is only a placeholder ("Nan") that a later
# cell drops.
# NOTE(review): this requires the split to have produced exactly three columns;
# any other count makes pandas raise a length-mismatch ValueError.
J1.columns=["place","State","Nan"]

In [43]:
# Attach the split location columns side by side with the postings (axis=1).
# NOTE(review): JD is rebound in place, so running this cell again appends a second
# copy of the same columns.
JD=pd.concat([JD,J1],axis=1)

In [46]:
# Remove the original combined Location column and the placeholder third part.
# NOTE(review): not re-runnable; a second run raises KeyError because both columns
# are already gone.
JD=JD.drop(columns=['Location','Nan'])

In [48]:
# Persist the scored postings. to_csv is called with its defaults, so the row index
# is written as the first (unnamed) column of the file.
# NOTE(review): absolute, machine-specific output path.
JD.to_csv("D:\\study\\Gmu\\Sem-4\\690\\Data\\JD.csv")