In [10]:
import os
import pandas as pd
from bs4 import BeautifulSoup as bs

In [22]:
data_dir = os.path.join(os.curdir, "data", "html_job_postings")
rows = []
for f in os.listdir(data_dir):
    file_path = os.path.join(os.curdir, "data", "html_job_postings", f)
    soup = bs(open(file_path))
    title = soup.find("title").text
    h1 = soup.find("h1")
    body = soup.find("body").text
    if(hasattr(h1,"text")):
        h1 = h1.text
    rows.append([f, title, h1, body])
df = pd.DataFrame(rows,columns=['file', 'title', 'h1', 'body'])

In [20]:
df

0       Quantitative Analyst - Boston, MA 02116\nQuant...
1       Data Scientist - Mountain View, CA\nGroundTrut...
2       Data Scientist - Seattle, WA\nA Bachelor or Ma...
3       Senior Natural Language Processing (NLP) Engin...
4       FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...
                              ...                        
1332    Data Scientist - Birmingham, AL 35233\nSUMMARY...
1333    Senior Data Scientist - Tempe, AZ\nCircle K is...
1334    New College Grad - Cybersecurity (Masters Degr...
1335    Decision Science Manager, Media Mix Modeling -...
1336    Data Scientist - Bellevue, WA\nExperienced in ...
Name: body, Length: 1337, dtype: object

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df[:].body.values)

In [24]:
print(tfidf_matrix)

  (0, 5781)	0.06617824094443775
  (0, 10106)	0.04475339330500462
  (0, 6502)	0.048846767493515
  (0, 771)	0.07943673605591134
  (0, 8677)	0.047470362851556626
  (0, 9573)	0.01728974818695696
  (0, 3350)	0.034702451951107056
  (0, 4031)	0.02709961326094388
  (0, 16192)	0.07943673605591134
  (0, 12004)	0.03477998638434775
  (0, 13124)	0.07943673605591134
  (0, 4270)	0.06047382386811823
  (0, 7668)	0.05313796805492837
  (0, 10211)	0.04039583890365342
  (0, 15274)	0.028637309996085002
  (0, 8916)	0.062403406343223014
  (0, 17795)	0.0540588445059981
  (0, 16175)	0.038802265691262626
  (0, 15305)	0.02709961326094388
  (0, 13868)	0.03319310940932198
  (0, 10009)	0.021193127722299473
  (0, 1361)	0.0464851494072064
  (0, 9040)	0.03721971111891175
  (0, 14271)	0.06476502442953161
  (0, 17877)	0.058842385131247885
  :	:
  (1336, 3871)	0.07662904023619449
  (1336, 18579)	0.03345299367535682
  (1336, 6415)	0.034759051201020276
  (1336, 8799)	0.040258271123026455
  (1336, 9807)	0.04132120612810756
 

In [27]:
tfidf_np_matrix = tfidf_matrix.toarray()
tfidf_np_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
num_posts, vocabulary_size = tfidf_np_matrix.shape
print(f"Our collection of {num_posts} job posts contain a total of "
      f"{vocabulary_size} unique words")

Our collection of 1337 job posts contain a total of 19003 unique words


In [34]:
import numpy as np
tfid_vector = tfidf_np_matrix[0]
tfid_vector

array([0., 0., 0., ..., 0., 0., 0.])

In [36]:
non_zero_indices = np.flatnonzero(tfid_vector)
non_zero_indices

array([   31,   771,  1165,  1258,  1361,  1457,  1566,  1807,  1826,
        1834,  1836,  1841,  1989,  2009,  2348,  2358,  2522,  2544,
        2567,  2651,  2881,  2945,  3067,  3168,  3170,  3292,  3314,
        3350,  3367,  3463,  3549,  3593,  3608,  3871,  3883,  3954,
        4031,  4084,  4101,  4119,  4180,  4193,  4243,  4270,  4282,
        4283,  4403,  4483,  4587,  4598,  4737,  4904,  4907,  4950,
        5003,  5012,  5168,  5206,  5279,  5330,  5332,  5338,  5345,
        5380,  5415,  5426,  5431,  5433,  5459,  5511,  5579,  5781,
        5799,  6054,  6055,  6415,  6502,  6570,  6691,  6728,  6828,
        6871,  6913,  6914,  7167,  7225,  7396,  7620,  7654,  7668,
        7886,  8068,  8099,  8677,  8779,  8799,  8804,  8857,  8860,
        8865,  8916,  9040,  9146,  9161,  9227,  9266,  9369,  9573,
        9735,  9760,  9807,  9913, 10009, 10014, 10106, 10107, 10162,
       10182, 10187, 10209, 10211, 10264, 10410, 10413, 10425, 10450,
       10491, 10579,

In [38]:
num_unique_words = non_zero_indices.size
num_unique_words

245

In [41]:
words = tfidf_vectorizer.get_feature_names()
words

['00',
 '000',
 '0000',
 '00007588',
 '0001',
 '0003',
 '00054471',
 '002',
 '0024',
 '0044137',
 '0082677',
 '0093',
 '00am',
 '00p',
 '00pm',
 '01',
 '010',
 '0105',
 '0111',
 '0132',
 '0150',
 '01730',
 '01810',
 '01843',
 '01844',
 '01880',
 '01m',
 '02',
 '0201',
 '02061',
 '02109',
 '02116',
 '02118',
 '02134',
 '02135',
 '02139',
 '02142',
 '02210',
 '02451',
 '02601',
 '02winter2020',
 '03',
 '0301',
 '04',
 '040',
 '045',
 '05',
 '05401',
 '0599u',
 '06',
 '0600',
 '06010',
 '0650',
 '06516',
 '06611',
 '06905',
 '07',
 '07302',
 '07311',
 '074',
 '07645',
 '077',
 '07950',
 '08',
 '0800',
 '0801',
 '08540',
 '09',
 '0911',
 '0923',
 '10',
 '100',
 '1000',
 '10001140',
 '10002817',
 '10004',
 '10007',
 '10010',
 '10011',
 '10012',
 '10013',
 '10017',
 '10018',
 '10020',
 '10022',
 '10025',
 '10036',
 '10038',
 '10045',
 '100k',
 '100m',
 '100mm',
 '100x',
 '1012',
 '10167',
 '10176',
 '10261',
 '10271',
 '10282',
 '1029232',
 '1031',
 '104',
 '1049033',
 '105',
 '1053460',
 '1

In [49]:
words[6001]

'eagle'

In [50]:
unique_words = [words[i] for i in non_zero_indices]


In [51]:
print(unique_words)

['02116', '620670', 'ability', 'accountable', 'adapt', 'advanced', 'agencies', 'analysis', 'analyst', 'analytical', 'analytics', 'analyze', 'appetite', 'apply', 'attrition', 'audiences', 'balance', 'bank', 'basel', 'behaviors', 'board', 'boston', 'broad', 'business', 'businesses', 'candidates', 'capital', 'careers', 'carlo', 'ccar', 'cfa', 'changes', 'characteristics', 'closely', 'cloudera', 'coding', 'com', 'committee', 'communication', 'company', 'complex', 'component', 'concepts', 'concurrently', 'conditions', 'conduct', 'construction', 'contributor', 'corporate', 'correction', 'create', 'custodial', 'customer', 'cycles', 'data', 'databases', 'deep', 'degree', 'demonstrated', 'deposit', 'deposits', 'depth', 'derivatives', 'desired', 'determine', 'developed', 'developing', 'development', 'dfast', 'different', 'directors', 'dni', 'documentation', 'econometrics', 'economic', 'ensure', 'eoe', 'error', 'eviews', 'excel', 'experience', 'experts', 'exposure', 'exposures', 'field', 'financi

In [53]:
data = {'Word': unique_words,
        'Count': tfidf_vector[non_zero_indices]}

In [55]:
df = pd.DataFrame(data).sort_values('Count', ascending=False)
print(df[:10].to_string(index=False))

         Word     Count
         bank  0.247258
      deposit  0.238507
       street  0.235370
 quantitative  0.209788
    liquidity  0.194295
          tqa  0.158873
         ccar  0.150291
     treasury  0.129530
   regulatory  0.125884
        sheet  0.124807


In [65]:
cosine_similarities = tfidf_np_matrix @ tfidf_np_matrix[0]
most_similar_index = np.argsort(cosine_similarities)
most_similar_index = most_similar_index[-2]
similarity = cosine_similarities[most_similar_index]
most_similar_post = rows[most_similar_index][3]
print(f"The following post has a cosine similarity of {similarity:.2f} "
       "with newsgroups.data[0]:\n")
print(most_similar_post)

The following post has a cosine similarity of 0.33 with newsgroups.data[0]:

Capital and Liquidity Management - Associate - New York, NY
Job Title: Capital and Liquidity Management
Corporate Title: Associate
Location: New York, NY
Overview
We are actively seeking an Associate professional for the New York office of Treasury, to join the Treasury Modeling and Analytics team. The primary function of this Treasury Modeling and Analytics team is to build models in support of key regulatory-facing activities: Capital Planning, Interest Rate Risk in the Banking Book (IRRBB), Liquidity management, and Liquidity Stress Testing. This particular role is for a contributing modeler who will participate in developing models and analytics for these important regulatory-facing responsibilities.
You will have the following responsibilities in this role:
Manage relationships with Business/Treasury/Finance model stakeholders. This involves leading meetings to facilitate important model-related decisions