In [12]:
# Imports
import pandas as pd
import string
import requests
from bs4 import BeautifulSoup
import numpy as np
import pymc as pm

In [13]:
# Functions from Hand_out 1
def get_and_clean_data(path='../Week 1/resource/software_developer_united_states_1971_20191023_1.csv'):
    """Load the job postings CSV and return the normalised, de-duplicated descriptions.

    Each ``job_description`` is stripped of ASCII punctuation and non-breaking
    spaces, lower-cased, and has every whitespace character replaced by a
    plain space. Rows with a missing description are skipped, and exact
    duplicates (after cleaning) are dropped, keeping the first occurrence.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file; it must contain a ``job_description`` column.
        Defaults to the course data set.

    Returns
    -------
    pandas.Series
        Cleaned descriptions, indexed by their original row labels.
    """
    # Build both translation tables once instead of once per row.
    strip_table = str.maketrans('', '', string.punctuation + u'\xa0')
    space_table = str.maketrans(string.whitespace, ' ' * len(string.whitespace))

    data = pd.read_csv(path)
    # A missing description is read as NaN, which has no string methods.
    description = data['job_description'].dropna().astype(str)
    cleaned_description = (
        description.str.translate(strip_table)
        .str.lower()
        .str.translate(space_table)
        .drop_duplicates()
    )
    return cleaned_description

def simple_tokenize(data):
    """Turn each description in ``data`` into a list of whitespace-separated tokens."""
    def _to_tokens(text):
        # split() with no separator already discards surrounding whitespace,
        # so the resulting tokens need no further trimming.
        return text.split()

    return data.apply(_to_tokens)

def parse_job_description():
    """Load the cleaned job descriptions and return them as token lists."""
    # Cleaning happens first; tokenising then splits each cleaned string.
    return simple_tokenize(get_and_clean_data())

- Page 41 (Hand_out 1) : Indexer
---

In [16]:
str1 = 'the chosen software developer will be part of a larger engineering team developing software for medical devices.'
str2 = 'we are seeking a seasoned software developer with strong analytical and technical skills to join our public sector technology consulting team.'

import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Tokenize
tokened_str1 = word_tokenize(str1)
tokened_str2 = word_tokenize(str2)

# Remove very short tokens (two characters or fewer), which also drops punctuation
tokened_str1 = [w for w in tokened_str1 if len(w) > 2]
tokened_str2 = [w for w in tokened_str2 if len(w) > 2]

# Remove stop words.
# stopwords.words() re-reads the stop-word lists on every call, so load them
# once into a set rather than once per token.
stop_word_set = set(stopwords.words())
no_sw_str1 = [word for word in tokened_str1 if word not in stop_word_set]
no_sw_str2 = [word for word in tokened_str2 if word not in stop_word_set]

# Stemming (unique stems per sentence)
ps = PorterStemmer()
stemmed_str1 = np.unique([ps.stem(w) for w in no_sw_str1])
stemmed_str2 = np.unique([ps.stem(w) for w in no_sw_str2])

# Sorted list of (term) entries from both sentences; a term shared by both
# sentences appears twice, once per sentence.
full_list = np.sort(np.concatenate([stemmed_str1, stemmed_str2]))
full_list

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


array(['analyt', 'chosen', 'consult', 'develop', 'develop', 'devic',
       'engin', 'join', 'larger', 'medic', 'part', 'public', 'season',
       'sector', 'seek', 'skill', 'softwar', 'softwar', 'strong', 'team',
       'team', 'technic', 'technolog'], dtype='<U9')

- Page 2, 3 (Hand_out 2) : **Fun with computational stats**
    - Roll six dice in each of n trials and estimate the probability that one specific outcome never appears <br>`(e.g., not getting a single 1)`.
---

In [14]:
# page 2
# Monte Carlo estimate of the probability that none of the k * n dice shows a 1.
k = 6
n = 8
n_simulations = 100000

# Seeded generator so that Restart & Run All reproduces the same estimate.
rng = np.random.default_rng(42)

# One row per simulation, one column per die; faces are 1..6 (upper bound exclusive).
dice_rolls = rng.integers(1, 7, size=(n_simulations, k * n))
has_1 = (dice_rolls==1).any(axis=1)
non_single_1 = ~has_1
prob_non_single_one = np.mean(non_single_1)

print(f"Simulated probability of non-single '1's: {prob_non_single_one:.5f}")

Simulated probability of non-single '1's: 0.00016


In [21]:
# page 3 : PYMC
# Same question as page 2, but the number of 1s among the k * n dice is drawn
# directly from a Binomial(k * n, 1/6) instead of simulating every die.

k = 6
n = 8

n_simulations = 10000
with pm.Model() as model:
    total_ones = pm.Binomial('total_ones', n=k * n, p=1/6, shape=n_simulations)
    # Fixed seed so the estimate is reproducible across kernel restarts.
    # NOTE(review): no draw count is passed, so the library default applies on
    # top of shape=n_simulations -- confirm the intended total sample size.
    prior = pm.sample_prior_predictive(random_seed=42)

total_ones_results = prior.prior['total_ones']
# A simulation counts as a success when it contains no 1 at all.
no_ones_results = (total_ones_results == 0)
prob_no_ones = no_ones_results.mean()

print(f"Simulated probability of non-single '1's: {prob_no_ones:.5f}")


Sampling: [total_ones]


Simulated probability of non-single '1's: 0.00015


- Page 18 (Hand_out 2) : Create search_or()
---

In [17]:
def inverse_indexing(parsed_description, stop_words=None, stemmer=None):
    """Build an inverted index mapping each stemmed term to the documents containing it.

    Stop words are removed first, then the remaining tokens are stemmed.
    The index is built in a single pass over the documents rather than by
    rescanning the whole corpus once per unique term.

    Parameters
    ----------
    parsed_description : pandas.Series
        One list of tokens per document; the Series index supplies the
        document identifiers stored in the result.
    stop_words : iterable of str, optional
        Tokens to ignore. Defaults to every NLTK stop-word list combined,
        minus ``'c'`` so the language name stays searchable.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to a fresh ``PorterStemmer``.

    Returns
    -------
    dict
        ``{stem: set_of_document_ids}``; empty when there are no documents.
    """
    if stop_words is None:
        stop_words = set(stopwords.words()) - {'c'}
    else:
        stop_words = set(stop_words)
    if stemmer is None:
        stemmer = PorterStemmer()

    invert_idx = {}
    stem_of = {}  # memoised stems: the same words recur across many documents
    for doc_id, tokens in parsed_description.items():
        for word in tokens:
            if word in stop_words:
                continue
            stem = stem_of.get(word)
            if stem is None:
                stem = stem_of[word] = stemmer.stem(word)
            invert_idx.setdefault(stem, set()).add(doc_id)

    return invert_idx

def search_and(invert_idx, query, stemmer=None):
    """Return the ids of documents that contain every term of ``query``.

    Parameters
    ----------
    invert_idx : dict
        Mapping of stemmed term to a set of document ids.
    query : str
        Whitespace-separated search terms; they are lower-cased and stemmed
        before lookup.
    stemmer : object with a ``stem(word)`` method, optional
        Should match the stemmer used to build ``invert_idx``. Defaults to a
        fresh ``PorterStemmer``.

    Returns
    -------
    list
        Matching document ids in no particular order. Empty when the query
        has no terms or when any term is absent from the index.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    # Preprocess query
    stemmed = [stemmer.stem(s.lower()) for s in query.split()]
    if not stemmed:
        return []

    postings = []
    for term in stemmed:
        if term not in invert_idx:
            # A term that no document contains makes the conjunction empty.
            return []
        postings.append(invert_idx[term])
    # Intersect means and
    return list(set.intersection(*postings))

def search_or(invert_idx, query, stemmer=None):
    """Return the ids of documents that contain at least one term of ``query``.

    Parameters
    ----------
    invert_idx : dict
        Mapping of stemmed term to a set of document ids.
    query : str
        Whitespace-separated search terms; they are lower-cased and stemmed
        before lookup.
    stemmer : object with a ``stem(word)`` method, optional
        Should match the stemmer used to build ``invert_idx``. Defaults to a
        fresh ``PorterStemmer``.

    Returns
    -------
    list
        Matching document ids in no particular order. Terms missing from the
        index are ignored; the list is empty when nothing matches.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    # Preprocess query
    stemmed = [stemmer.stem(s.lower()) for s in query.split()]
    # Unknown terms contribute nothing to a disjunction, so skip them.
    postings = [invert_idx[term] for term in stemmed if term in invert_idx]
    if not postings:
        return []
    # Union means or
    return list(set.union(*postings))

if __name__ == '__main__':
    query = 'java oracle'
    # Tokenise the corpus and build the index once; each lookup afterwards is
    # a handful of set operations, which is what makes searching fast.
    parsed_description = parse_job_description()
    invert_idx = inverse_indexing(parsed_description)
    # Run the same query in AND mode, then in OR mode.
    matched_and, matched_or = (
        search(invert_idx, query) for search in (search_and, search_or)
    )


In [18]:
# Preview the first five AND matches as a markdown table, with each token
# list joined back into a single space-separated string.
print(
    parsed_description.loc[matched_and]
    .head()
    .apply(lambda tokens: ' '.join(tokens))
    .to_markdown()
)

|      | job_description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [19]:
# Preview the first five OR matches as a markdown table, with each token
# list joined back into a single space-separated string.
print(
    parsed_description.loc[matched_or]
    .head()
    .apply(lambda tokens: ' '.join(tokens))
    .to_markdown()
)

|      | job_description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [20]:
# Report how many documents each search mode matched, one count per line.
print(
    f'matched_and = {len(matched_and)}',  # expect 914
    f'matched_or = {len(matched_or)}',  # expect 3747
    sep='\n',
)

matched_and = 914
matched_or = 3747
