In [31]:
import pandas as pd
import numpy as np
import re
import string
from ordered_set import OrderedSet

import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
def get_and_clean_data() -> pd.DataFrame:
    data = pd.read_csv("../../data/software_development_usa.csv")
    description = data["job_description"]
    cleaned_description = description.apply(
        lambda s: s.translate(str.maketrans("", "", string.punctuation + "\xa0"))
    )
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(
        lambda s: s.translate(
            str.maketrans(string.whitespace, " " * len(string.whitespace), "")
        )
    )
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

## Bag of Words

In [33]:
def create_stem_cache(cleaned_description:pd.DataFrame):
    tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s))
    concated = np.unique(np.concatenate([s for s in tokenized_description.values]))
    stem_cache = {}
    ps = PorterStemmer()
    for s in concated:
        stem_cache[s] = ps.stem(s)
    return stem_cache

In [34]:
def create_custom_processor(stop_dict: dict, stem_cache: dict):
    def custom_processor(s: str):
        ps = PorterStemmer()
        s = re.sub(r"[^A-Za-z]", " ", s)
        s = re.sub(r"\s+", " ", s)
        s = word_tokenize(s)
        s = list(OrderedSet(s) - stop_dict)
        s = [word for word in s if len(word) > 2]
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s]
        s = " ".join(s)
        return s
    return custom_processor

In [35]:
from sklearn.feature_extraction.text import CountVectorizer


def sk_vectorize(
    texts: list[str],
    cleaned_description: pd.DataFrame,
    stop_dict: dict,
    stem_cache: dict,
):
    my_custom_vectorizer = create_custom_processor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_vectorizer)
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    print(query)
    print(vectorizer.inverse_transform(query))

In [36]:
cleaned_description = get_and_clean_data()
cleaned_description = cleaned_description.iloc[:1000]
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words("english"))
sk_vectorize(
    ["python is simpler than java", "java is simpler than python"],
    cleaned_description,
    stop_dict,
    stem_cache,
)

  (0, 4505)	1
  (0, 6824)	1
  (1, 4505)	1
  (1, 6824)	1
[array(['java', 'python'], dtype='<U82'), array(['java', 'python'], dtype='<U82')]


In [37]:
my_custom_preprocessor = create_custom_processor(stop_dict, stem_cache)
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1, 2))
bigram_vectorizer.fit(cleaned_description)
print(len(bigram_vectorizer.get_feature_names_out()))

98642


In [38]:
trigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1, 3))
trigram_vectorizer.fit(cleaned_description)
print(len(trigram_vectorizer.get_feature_names_out()))

225279


## TF-IDF

In [41]:
from scipy import sparse
from time import time

start = time()
vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
vectorizer.fit(cleaned_description)
X = vectorizer.transform(cleaned_description)
N = len(cleaned_description)

df = np.array((X.todense() > 0).sum(0))[0]
idf = np.log10(1 + (N / df))
tf = np.log10(X.todense() + 1)
tf_idf = np.multiply(tf, idf)

X = sparse.csr_matrix(tf_idf)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
end = time()
elapsed = end - start
print("execution time: {} seconds".format(elapsed))
X_df[X_df.sum().sort_values()[-15:].index].iloc[:20]

execution time: 2.2326853275299072 seconds


Unnamed: 0,includ,manag,respons,use,team,provid,system,technolog,employ,test,design,requir,work,applic,develop
0,0.130591,0.214447,0.12609,0.199639,0.109688,0.137164,0.0,0.117337,0.143107,0.240925,0.171714,0.103624,0.097459,0.16493,0.183489
1,0.0,0.135301,0.12609,0.0,0.109688,0.0,0.0,0.185974,0.0,0.0,0.10834,0.103624,0.097459,0.104059,0.145412
2,0.0,0.0,0.0,0.0,0.109688,0.137164,0.124535,0.0,0.143107,0.190929,0.0,0.103624,0.154469,0.16493,0.145412
3,0.0,0.0,0.0,0.0,0.173851,0.137164,0.124535,0.0,0.0,0.120463,0.0,0.0,0.097459,0.0,0.145412
4,0.0,0.135301,0.12609,0.0,0.109688,0.0,0.0,0.185974,0.0,0.0,0.10834,0.103624,0.097459,0.104059,0.145412
5,0.0,0.0,0.12609,0.0,0.0,0.0,0.0,0.117337,0.226819,0.0,0.10834,0.103624,0.0,0.16493,0.145412
6,0.0,0.0,0.0,0.125958,0.173851,0.0,0.124535,0.117337,0.0,0.0,0.10834,0.103624,0.097459,0.104059,0.145412
7,0.0,0.135301,0.12609,0.199639,0.219375,0.0,0.124535,0.117337,0.0,0.190929,0.10834,0.103624,0.154469,0.104059,0.213025
8,0.130591,0.135301,0.12609,0.125958,0.109688,0.2174,0.197383,0.185974,0.226819,0.190929,0.10834,0.164241,0.154469,0.208118,0.183489
9,0.206981,0.0,0.12609,0.125958,0.173851,0.137164,0.124535,0.117337,0.0,0.120463,0.10834,0.103624,0.0,0.0,0.145412


## Activity

1. Show the X_df dataframe, showing with only the top 20 bigram terms with the highest sum of TF-IDF scores
    - Compare this with the computation time of unigram analysis.
        - unigram: `2.23 seconds`
        - bigram: `4.9 seconds`

In [42]:
start = time()
cleaned_description_small = cleaned_description.iloc[:]
vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1, 2))
vectorizer.fit(cleaned_description_small)
X = vectorizer.transform(cleaned_description_small)
N = len(cleaned_description_small)

df = np.array((X.todense() > 0).sum(0))[0]
idf = np.log10(1 + (N / df))
tf = np.log10(X.todense() + 1)
tf_idf = np.multiply(tf, idf)

X = sparse.csr_matrix(tf_idf)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

end = time()
elapsed = end - start
print("execution time: {} seconds".format(elapsed))
X_df[X_df.sum().sort_values()[-15:].index].iloc[:20]

execution time: 4.907889127731323 seconds


Unnamed: 0,includ,manag,respons,use,team,provid,system,technolog,employ,test,design,requir,work,applic,develop
0,0.130591,0.214447,0.12609,0.199639,0.109688,0.137164,0.0,0.117337,0.143107,0.240925,0.171714,0.103624,0.097459,0.16493,0.183489
1,0.0,0.135301,0.12609,0.0,0.109688,0.0,0.0,0.185974,0.0,0.0,0.10834,0.103624,0.097459,0.104059,0.145412
2,0.0,0.0,0.0,0.0,0.109688,0.137164,0.124535,0.0,0.143107,0.190929,0.0,0.103624,0.154469,0.16493,0.145412
3,0.0,0.0,0.0,0.0,0.173851,0.137164,0.124535,0.0,0.0,0.120463,0.0,0.0,0.097459,0.0,0.145412
4,0.0,0.135301,0.12609,0.0,0.109688,0.0,0.0,0.185974,0.0,0.0,0.10834,0.103624,0.097459,0.104059,0.145412
5,0.0,0.0,0.12609,0.0,0.0,0.0,0.0,0.117337,0.226819,0.0,0.10834,0.103624,0.0,0.16493,0.145412
6,0.0,0.0,0.0,0.125958,0.173851,0.0,0.124535,0.117337,0.0,0.0,0.10834,0.103624,0.097459,0.104059,0.145412
7,0.0,0.135301,0.12609,0.199639,0.219375,0.0,0.124535,0.117337,0.0,0.190929,0.10834,0.103624,0.154469,0.104059,0.213025
8,0.130591,0.135301,0.12609,0.125958,0.109688,0.2174,0.197383,0.185974,0.226819,0.190929,0.10834,0.164241,0.154469,0.208118,0.183489
9,0.206981,0.0,0.12609,0.125958,0.173851,0.137164,0.124535,0.117337,0.0,0.120463,0.10834,0.103624,0.0,0.0,0.145412
