In [2]:
# Imports
import matplotlib.pyplot as plt
import multiprocessing
import pandas as pd
import numpy as np
import string
import timeit
import time
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocessing import Pool
from nltk.tokenize import word_tokenize
from ordered_set import OrderedSet
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy import sparse

In [3]:
# Functions from hand on 1
def get_and_clean_data():
    data = pd.read_csv('../Week 1/resource/software_developer_united_states_1971_20191023_1.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

def simple_tokenize(data):
    cleaned_description = data.apply(lambda s: [x.strip() for x in s.split()])
    return cleaned_description

def parse_job_description():
    cleaned_description = get_and_clean_data()
    cleaned_description = simple_tokenize(cleaned_description)
    return cleaned_description

In [4]:
# Page 11 : 
def create_stem_cache(cleaned_description):
    tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s))
    concated = np.unique(np.concatenate([s for s in tokenized_description.values]))
    stem_cache = {}
    ps = PorterStemmer()
    for s in concated:
        stem_cache[s] = ps.stem(s)
    return stem_cache

def create_custom_preprocessor(stop_dict, stem_cache):
    def custom_preprocessor(s):
        ps = PorterStemmer()
        s = re.sub(r'[^A-Za-z]', ' ', s)
        s = re.sub(r'\s+', ' ', s)
        s = word_tokenize(s)
        s = [ss for ss in s if ss not in stop_dict]
        s = [word for word in s if len(word)>2]
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s]
        s = ' '.join(s)
        return s
    return custom_preprocessor

In [5]:
# Page 12 :
def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    print(query)
    print(vectorizer.inverse_transform(query))
    
    return vectorizer

In [6]:
cleaned_description = get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words('English'))
vectorizer = sk_vectorize(['python is simpler than java'], cleaned_description, stop_dict, stem_cache)

  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
[array(['java', 'python', 'simpler'], dtype='<U124')]


---
- Page 14, 16 (Hand_out 3): A Scikit-learn process

In [7]:
# Page 14
vectorizer = sk_vectorize(['python is simpler than java', 'java is simpler than python'], cleaned_description, stop_dict, stem_cache)

  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
  (1, 13947)	1
  (1, 21383)	1
  (1, 24234)	1
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'python', 'simpler'], dtype='<U124')]


In [8]:
# Page 16
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1, 3))
bigram_vectorizer.fit(cleaned_description)
print(len(bigram_vectorizer.get_feature_names_out()))

1215823


---
Page 24

In [9]:
X = vectorizer.transform(cleaned_description)
N = len(cleaned_description)

df = np.array((X.todense()>0).sum(0))[0]
idf = np.log10(1+(N / df))
tf = np.log10(X.todense()+1)

tf_idf = np.multiply(tf, idf)

X = sparse.csr_matrix(tf_idf)
print(X.toarray())

X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

max_term = X_df.sum().sort_values()[-20:].sort_index().index
print(X_df[max_term].to_markdown())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
|      |   applic |     busi |   design |   develop |    experi |    manag |   requir |   servic |    skill |   softwar |    solut |   support |   system |     team |   technic |   technolog |     test |      use |      work |     year |
|-----:|---------:|---------:|---------:|----------:|----------:|---------:|---------:|---------:|---------:|----------:|---------:|----------:|---------:|---------:|----------:|------------:|---------:|---------:|----------:|---------:|
|    0 | 0.238512 | 0        | 0.276343 | 0.338845  | 0.374309  | 0.202949 | 0.105262 | 0.125198 | 0.228192 | 0.327406  | 0.122174 |  0        | 0        | 0.105974 |  0.210291 |    0.113112 | 0.35548  | 0.189653 | 0.193414  | 0.277832 |
|    1 | 0.205443 | 0        | 0.106904 | 0.304185  | 0.222458  | 0.128046 | 0.210524 | 0.125198 | 0.114096 | 0.150003  | 0.122174

(Extra)

In [10]:
X_df['java']

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
7578    0.156726
7579    0.313451
7580    0.000000
7581    0.156726
7582    0.248404
Name: java, Length: 7583, dtype: float64

In [11]:
def preprocess_tfidf_vectorizer(input_df, stop_dict, stem_cache):
    my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = TfidfVectorizer(preprocessor=my_custom_preprocessor)
    vectorizer.fit(input_df)
    return vectorizer

In [14]:
tfidf_vectorizer = preprocess_tfidf_vectorizer(cleaned_description, stop_dict, stem_cache)
tfidf_df = pd.DataFrame(tfidf_vectorizer.transform(cleaned_description).toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [15]:
tfidf_df['java']

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
7578    0.019612
7579    0.102381
7580    0.000000
7581    0.029700
7582    0.035206
Name: java, Length: 7583, dtype: float64

# HAND ON (3 of 3) START HERE <<<<<

---
Page 39 - 42

In [16]:
# Page 39 : Let’s simplify by ignoring idf
arr = np.array([[100, 200, 200, 50], [90, 200, 300, 0], [5, 200, 10, 200]])

data = pd.DataFrame(arr, index=['DevOpsHandbook', 'ContinuousDelivery', 'DistributedComputing'], columns=['business', 'computer', 'git', 'parallel'])
data = np.log10(data + 1)
print(data.loc['DevOpsHandbook'].dot(data.loc['ContinuousDelivery']))
print(data.loc['DevOpsHandbook'].dot(data.loc['DistributedComputing']))
print(data.loc['ContinuousDelivery'].dot(data.loc['DistributedComputing']))

14.939885194377618
13.195777686137449
9.410303606094942


In [17]:
# Page 40 : Length normalization
data.loc['DevOpsHandbook'] /= np.sqrt((data.loc['DevOpsHandbook'] ** 2).sum())
data.loc['ContinuousDelivery'] /= np.sqrt((data.loc['ContinuousDelivery'] ** 2).sum())
data.loc['DistributedComputing'] /= np.sqrt((data.loc['DistributedComputing'] ** 2).sum())
print(data.to_markdown())

|                      |   business |   computer |      git |   parallel |
|:---------------------|-----------:|-----------:|---------:|-----------:|
| DevOpsHandbook       |   0.478543 |   0.549901 | 0.549901 |   0.407692 |
| ContinuousDelivery   |   0.501071 |   0.589096 | 0.633951 |   0        |
| DistributedComputing |   0.221882 |   0.656732 | 0.296942 |   0.656732 |


In [18]:
# Page 41 : Dot product
print(data.loc['DevOpsHandbook'].dot(data.loc['ContinuousDelivery']))
print(data.loc['DevOpsHandbook'].dot(data.loc['DistributedComputing']))
print(data.loc['ContinuousDelivery'].dot(data.loc['DistributedComputing']))

0.9123394651809296
0.8983513789958276
0.6863034317623423


In [64]:
# Page 42 : Scikit-learn’s builtin tf-idf
tf_idf_vectorizer = TfidfVectorizer(preprocessor=my_custom_preprocessor, use_idf=True)
tf_idf_vectorizer.fit(cleaned_description)
transformed_data = tf_idf_vectorizer.transform(cleaned_description)
X_tfidf_df = pd.DataFrame(transformed_data.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())
max_term = X_tfidf_df.sum().sort_values()[-10:].sort_index().index
X_tfidf_df[max_term].head(5)


Unnamed: 0,applic,design,develop,experi,requir,softwar,system,team,test,work
0,0.064973,0.085218,0.167965,0.208201,0.016732,0.146336,0.0,0.016868,0.134132,0.045161
1,0.080861,0.028282,0.209038,0.098709,0.083293,0.048565,0.0,0.02799,0.0,0.02498
2,0.064598,0.0,0.03711,0.118286,0.02218,0.038798,0.024428,0.02236,0.050803,0.079823
3,0.0,0.0,0.153101,0.081332,0.0,0.080032,0.033593,0.061499,0.034932,0.027443
4,0.082715,0.02893,0.213832,0.100973,0.085203,0.049679,0.0,0.028631,0.0,0.025552


---
- Page 43 of Handout #3
    - Compare the differences between `CountVectorizer` and `TfidfVectorizer`

In [84]:
# Similarity Computation for Document Relevance (TF-IDF)
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity for TF-IDF
tf_idf_data = transformed_data
similarities_tfidf = cosine_similarity(tf_idf_data[0], tf_idf_data)

# Get top 5 most relevant documents to row #0 (excluding itself)
doc_similarities = similarities_tfidf[0]  # Row #0 similarity with all documents
top_5_docs = doc_similarities.argsort()[-6:-1][::-1]  # Exclude itself and sort
print(f"Top 5 documents most similar to row #0 using TF-IDF: {top_5_docs}")

# Query Matching (aws devops)
# Transform the query into the TF-IDF vector space
query_vector = vectorizer.transform(["aws devops"])
query_similarities = cosine_similarity(query_vector, tf_idf_data).flatten()

# Get top 5 most relevant documents for the query
top_5_query_docs = query_similarities.argsort()[-5:][::-1]
print(f"Top 5 documents most relevant to query 'aws devops' using TF-IDF: {top_5_query_docs}")


Top 5 documents most similar to row #0 using TF-IDF: [4214 2343 2387 4362 4188]
Top 5 documents most relevant to query 'aws devops' using TF-IDF: [4558 4561 7426 3116 6156]


In [87]:
# Vectorize the cleaned data using CountVectorizer
count_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
X_count = count_vectorizer.fit_transform(cleaned_description)

# Document similarity for CountVectorizer
similarities_count = cosine_similarity(X_count[0], X_count)
doc_similarities_count = similarities_count[0]
top_5_docs_count = doc_similarities_count.argsort()[-6:-1][::-1]
print(f"Top 5 documents most similar to row #0 using CountVectorizer: {top_5_docs_count}")

# Query similarity for CountVectorizer
query_vector_count = count_vectorizer.transform(["aws devops"])
query_similarities_count = cosine_similarity(query_vector_count, X_count).flatten()
top_5_query_docs_count = query_similarities_count.argsort()[-5:][::-1]
print(f"Top 5 documents most relevant to query 'aws devops' using CountVectorizer: {top_5_query_docs_count}")


Top 5 documents most similar to row #0 using CountVectorizer: [4362 4188 4368 2387 3713]
Top 5 documents most relevant to query 'aws devops' using CountVectorizer: [7426 3116 6156 6163 4561]


In [86]:
# Comparing the results (prints)
print("Comparison of top 5 documents for row #0:")
print(f"TF-IDF: {top_5_docs}")
print(f"CountVectorizer: {top_5_docs_count}")
print(f"Done in 0.1s average\n")

print("Comparison of top 5 documents for the query 'aws devops':")
print(f"TF-IDF: {top_5_query_docs}")
print(f"CountVectorizer: {top_5_query_docs_count}")
print(f"Done in 30s average\n")


Comparison of top 5 documents for row #0:
TF-IDF: [4214 2343 2387 4362 4188]
CountVectorizer: [4362 4188 4368 2387 3713]
Done in 0.1s average

Comparison of top 5 documents for the query 'aws devops':
TF-IDF: [4558 4561 7426 3116 6156]
CountVectorizer: [7426 3116 6156 6163 4561]
Done in 30s average



---
- Page 86 and 87 of Handout #3

In [68]:
# page 87
class BM25(object):
    def __init__(self, vectorizer, b=0.75, k1=1.6):
        self.vectorizer = vectorizer
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        self.y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = self.y.sum(1).mean()

    def transform(self, q):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        len_y = self.y.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        y = self.y.tocsc()[:, q.indices]
        denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = y.multiply(np.broadcast_to(idf, y.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1

In [69]:
# page 88
cleaned_description = get_and_clean_data()
bm25 = BM25(tf_idf_vectorizer)
bm25.fit(cleaned_description)
score = bm25.transform('aws devops')
rank = np.argsort(score)[::-1]
print(cleaned_description.iloc[rank[:5]].to_markdown())
score = bm25.transform('aws github')
rank = np.argsort(score)[::-1]
print(cleaned_description.iloc[rank[:5]].to_markdown())


|      | job_description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                