In [None]:
def get_and_clean_data():
    data = pd.read_csv('../Week 1/resource/software_developer_united_states_1971_20191023_1.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

def simple_tokenize(data):
    cleaned_description = data.apply(lambda s: [x.strip() for x in s.split()])
    return cleaned_description

def parse_job_description():
    cleaned_description = get_and_clean_data()
    cleaned_description = simple_tokenize(cleaned_description)
    return cleaned_description

---
**Page 58, 59** : Distributed indexing

- Fault tolerant matter

    - If in a non-fault-tolerant system with 1000 nodes, each node has 99.9% uptime,
    what is the uptime of the entire system?
        - 0.9991000 ≈ 0.368 which is around 36.8%

- This result means the system has an uptime of approximately 36.8%, indicating
that 63.2% of the time, one or more nodes will be down.

In [1]:
import pymc as pm
import numpy as np

num_nodes = 500
node_uptime = 0.99
num_simulations = 5000

with pm.Model() as model:
    node_states = pm.Bernoulli('node_states', p=node_uptime, size=(num_simulations, num_nodes))
    system_uptime = pm.math.prod(node_states, axis=1)
    mean_system_uptime = pm.Deterministic('mean_system_uptime', pm.math.mean(system_uptime))
    prior_checks = pm.sample_prior_predictive()

node_states_samples = prior_checks.prior['node_states'].values
node_states_samples = np.squeeze(node_states_samples)
nodes_up_count = np.sum(node_states_samples, axis=1)

system_uptime_samples = prior_checks.prior['mean_system_uptime'].values
overall_mean_system_uptime = np.mean(system_uptime_samples)

Sampling: [node_states]


In [2]:
print("Overall Mean System Uptime:", overall_mean_system_uptime)

Overall Mean System Uptime: 0.006555600000000001


---
**Page 67, 69** : Compression for sparse matrices

In [3]:
# dense to sparse
from numpy import array
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, dok_matrix, lil_matrix
# create dense matrix
A = array([[1, 0, 0, 1, 0, 0], [0, 0, 2, 0, 0, 1], [0, 0, 0, 2, 0, 0]])
print(A)

# convert to sparse matrix (COO method)
S = coo_matrix(A)
print(S)

print(S.tocsr()[:,3])

# reconstruct dense matrix
B = S.todense()
print(B)


[[1 0 0 1 0 0]
 [0 0 2 0 0 1]
 [0 0 0 2 0 0]]
  (0, 0)	1
  (0, 3)	1
  (1, 2)	2
  (1, 5)	1
  (2, 3)	2
  (0, 0)	1
  (2, 0)	2
[[1 0 0 1 0 0]
 [0 0 2 0 0 1]
 [0 0 0 2 0 0]]


In [4]:
import timeit
times = 100000
timeit.timeit(lambda : dok_matrix(B), number=times)/times
timeit.timeit(lambda : lil_matrix(B), number=times)/times
timeit.timeit(lambda : csr_matrix(B), number=times)/times
timeit.timeit(lambda : csc_matrix(B), number=times)/times

0.00013275628899922593

---

In [25]:
from ordered_set import OrderedSet
import pandas as pd
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


#limit to just 1000 rows
cleaned_description = get_and_clean_data()[:1000]

#replace non alphabets with spaces, and collapse spaces
cleaned_description = cleaned_description.apply(lambda s: re.sub(r'[^A-Za-z]', ' ', s))
cleaned_description = cleaned_description.apply(lambda s: re.sub(r'\s+', ' ', s))

#tokenize
tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s))

#remove stop words
stop_dict = set(stopwords.words())
sw_removed_description = tokenized_description.apply(lambda s: list(OrderedSet(s) - stop_dict))
sw_removed_description = sw_removed_description.apply(lambda s: [word for word in s if len(word)>2])

#create stem caches
concated = np.unique(np.concatenate([s for s in tokenized_description.values]))
stem_cache = {}
ps = PorterStemmer()
for s in concated:
    stem_cache[s] = ps.stem(s)

#stem
stemmed_description = sw_removed_description.apply(lambda s: [stem_cache[w] for w in s])
stemmed_description

0       [chosen, softwar, develop, part, larger, engin...
1       [posit, lead, softwar, develop, locat, middlet...
2       [senior, softwar, develop, hoboken, start, mon...
3       [client, multin, publish, educ, compani, seek,...
4       [posit, lead, softwar, develop, locat, philade...
                              ...                        
1230    [job, summari, softwar, develop, rubi, rail, c...
1231    [globalstar, seek, softwar, develop, join, tea...
1232    [softwar, engin, servic, rockstar, level, expe...
1233    [job, titl, lead, softwar, developertechn, net...
1234    [ref, classif, softwar, engin, compens, year, ...
Name: job_description, Length: 1000, dtype: object