# Assignment of Module 2

### Import essential dependencies

In [1]:
import pandas as pd
import string

import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tkthanatorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load and clean the dataset

In [2]:
def get_and_clean_data(
    path: str = "../../data/software_development_usa.csv",
) -> pd.Series:
    """Load the job postings CSV and return normalised, de-duplicated descriptions.

    Each value of the ``job_description`` column is cleaned by:

    * deleting every ASCII punctuation character and the non-breaking
      space ``\\xa0`` (they are removed, not replaced by a space);
    * turning every ASCII whitespace character (tab, newline, ...) into a
      plain space;
    * lower-casing the text.

    Exact duplicates that remain after cleaning are dropped, keeping the
    first occurrence; the original row labels are preserved.

    Args:
        path: Location of the CSV file. Defaults to the path the notebook
            has always used, so existing calls without arguments behave
            as before.

    Returns:
        A ``pd.Series`` of cleaned description strings.

    Raises:
        KeyError: If the CSV has no ``job_description`` column.
    """
    data = pd.read_csv(path)

    # Empty cells are read as NaN (a float), which has no string methods;
    # drop them so only real text is cleaned.
    description = data["job_description"].dropna().astype(str)

    # A single table does both jobs: the third argument lists characters to
    # delete, the first two map each whitespace character onto a space.
    table = str.maketrans(
        string.whitespace,
        " " * len(string.whitespace),
        string.punctuation + "\xa0",
    )
    cleaned_description = description.str.translate(table).str.lower()

    return cleaned_description.drop_duplicates()

In [3]:
# Load and normalise the job descriptions once; the trailing bare expression
# makes the notebook display a preview of the resulting Series.
cleaned_description = get_and_clean_data()
cleaned_description

0       the chosen sr software developer will be part ...
1       position c lead software developer location mi...
2       senior software developer hoboken nj starts as...
3       our client a multinational publishing and educ...
4       position c lead software developer location ph...
                              ...                        
9991    position description  position description  cg...
9994    job description  researches designs develops a...
9997    job description  the candidate must be experie...
9998    please only apply if you do not need sponsorsh...
9999    company information  solid reputation passiona...
Name: job_description, Length: 7583, dtype: object

### Tokenize and remove stop words from the dataset

In [4]:
# Split every cleaned description into a list of word tokens.
tokenized_description = cleaned_description.apply(word_tokenize)

# NOTE(review): stopwords.words() is called without a language argument, which
# presumably loads the stop lists of every language in the NLTK corpus rather
# than English only - confirm this is intended for English job postings.
stop_set = set(stopwords.words())

# Keep a token only when it is longer than two characters and not a stop word.
sw_removed_description = tokenized_description.apply(
    lambda tokens: [
        token for token in tokens if len(token) > 2 and token not in stop_set
    ]
)

sw_removed_description

0       [chosen, software, developer, part, larger, en...
1       [position, lead, software, developer, location...
2       [senior, software, developer, hoboken, starts,...
3       [client, multinational, publishing, education,...
4       [position, lead, software, developer, location...
                              ...                        
9991    [position, description, position, description,...
9994    [job, description, researches, designs, develo...
9997    [job, description, candidate, experienced, mic...
9998    [apply, sponsorship, work, united, states, fut...
9999    [company, information, solid, reputation, pass...
Name: job_description, Length: 7583, dtype: object

### Stem the dataset

In [5]:
# A single stemmer instance is shared by every document.
ps = PorterStemmer()

# Replace each token list with the list of its Porter stems, order preserved.
stemmed_description = sw_removed_description.apply(
    lambda tokens: list(map(ps.stem, tokens))
)
stemmed_description

0       [chosen, softwar, develop, part, larger, engin...
1       [posit, lead, softwar, develop, locat, middlet...
2       [senior, softwar, develop, hoboken, start, 912...
3       [client, multin, publish, educ, compani, seek,...
4       [posit, lead, softwar, develop, locat, philade...
                              ...                        
9991    [posit, descript, posit, descript, cgi, experi...
9994    [job, descript, research, design, develop, and...
9997    [job, descript, candid, experienc, microsoft, ...
9998    [appli, sponsorship, work, unit, state, futur,...
9999    [compani, inform, solid, reput, passion, endle...
Name: job_description, Length: 7583, dtype: object

### Convert the dataset to a matrix of token counts

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Each document is already a list of stemmed tokens, so the analyzer simply
# hands the list back instead of tokenising raw text again.
cv = CountVectorizer(analyzer=lambda tokens: tokens)

# Rows are documents, columns are vocabulary terms, values are token counts.
X = cv.fit_transform(stemmed_description)
X

<7583x35944 sparse matrix of type '<class 'numpy.int64'>'
	with 1296957 stored elements in Compressed Sparse Row format>

#### Benchmark the performance of each method with multithreading

In [11]:
import timeit
import numpy as np

# Dense copy of the sparse count matrix, used by the np.matmul benchmark.
# At the displayed shape (7583 x 35944) with int64 entries this allocates
# roughly 2.2 GB, and it stays in kernel memory for as long as XX is bound.
XX = X.toarray()


def _run_benchmark(label, operation, number=3, limit=60):
    """Time ``operation`` and print/return its average duration.

    Args:
        label: Short name printed in the report line and returned to the caller.
        operation: Zero-argument callable to time.
        number: How many times ``operation`` is executed; the reported value
            is the total time divided by this count.
        limit: Average duration in seconds below which the run is tagged
            ``FAST``; anything else is tagged ``TOO LONG``.

    Returns:
        ``(label, average_seconds)``.
    """
    elapsed = timeit.timeit(operation, number=number) / number
    status = "FAST" if elapsed < limit else "TOO LONG"
    print("[{}] {}(): {:.2f}".format(status, label, elapsed))
    return label, elapsed


def timeit_matmul(matrix=None, number=3):
    """Benchmark the dense product ``matrix @ matrix.T`` via ``np.matmul``.

    Args:
        matrix: Dense 2-D array to multiply; defaults to the notebook-level
            dense matrix ``XX``.
        number: Number of timed repetitions.

    Returns:
        ``("matmul", average_seconds)``.
    """
    dense = XX if matrix is None else matrix
    return _run_benchmark("matmul", lambda: np.matmul(dense, dense.T), number)


def timeit_dok(matrix=None, number=3):
    """Benchmark ``matrix @ matrix.T`` with both operands converted to DOK.

    The format conversion happens inside the timed callable, so the reported
    time covers conversion plus multiplication.

    Args:
        matrix: Sparse matrix to multiply; defaults to the notebook-level ``X``.
        number: Number of timed repetitions.

    Returns:
        ``("dok", average_seconds)``.
    """
    sparse_matrix = X if matrix is None else matrix
    # "@" is matrix multiplication for both scipy sparse matrices and sparse
    # arrays, whereas "*" is element-wise on sparse arrays.
    return _run_benchmark(
        "dok", lambda: sparse_matrix.todok() @ sparse_matrix.T.todok(), number
    )


def timeit_lil(matrix=None, number=3):
    """Benchmark ``matrix @ matrix.T`` with both operands converted to LIL.

    The format conversion happens inside the timed callable, so the reported
    time covers conversion plus multiplication.

    Args:
        matrix: Sparse matrix to multiply; defaults to the notebook-level ``X``.
        number: Number of timed repetitions.

    Returns:
        ``("lil", average_seconds)``.
    """
    sparse_matrix = X if matrix is None else matrix
    return _run_benchmark(
        "lil", lambda: sparse_matrix.tolil() @ sparse_matrix.T.tolil(), number
    )


def timeit_coo(matrix=None, number=3):
    """Benchmark ``matrix @ matrix.T`` with both operands converted to COO.

    The format conversion happens inside the timed callable, so the reported
    time covers conversion plus multiplication.

    Args:
        matrix: Sparse matrix to multiply; defaults to the notebook-level ``X``.
        number: Number of timed repetitions.

    Returns:
        ``("coo", average_seconds)``.
    """
    sparse_matrix = X if matrix is None else matrix
    return _run_benchmark(
        "coo", lambda: sparse_matrix.tocoo() @ sparse_matrix.T.tocoo(), number
    )


def timeit_csc(matrix=None, number=3):
    """Benchmark ``matrix @ matrix.T`` with both operands converted to CSC.

    The format conversion happens inside the timed callable, so the reported
    time covers conversion plus multiplication.

    Args:
        matrix: Sparse matrix to multiply; defaults to the notebook-level ``X``.
        number: Number of timed repetitions.

    Returns:
        ``("csc", average_seconds)``.
    """
    sparse_matrix = X if matrix is None else matrix
    return _run_benchmark(
        "csc", lambda: sparse_matrix.tocsc() @ sparse_matrix.T.tocsc(), number
    )

### Results
1. csc() -> 13.43 seconds
2. coo() -> 13.44 seconds
3. lil() -> 13.62 seconds
4. dok() -> 16.37 seconds
5. matmul() -> 1 hour 14 minutes (4457.03 seconds) — **too long**

In [12]:
from concurrent import futures

# One worker per benchmark so the slow dense product does not hold up the
# sparse ones. NOTE(review): the benchmarks run at the same time and therefore
# compete for CPU and memory, which may inflate the individual timings.
benchmarks = [timeit_matmul, timeit_dok, timeit_lil, timeit_coo, timeit_csc]

with futures.ThreadPoolExecutor(max_workers=len(benchmarks)) as executor:
    pending = [executor.submit(benchmark) for benchmark in benchmarks]
    # result() re-raises anything a worker raised, so a failed benchmark is
    # reported instead of disappearing silently; the (name, seconds) pairs
    # are kept for later inspection. Leaving the "with" block already waits
    # for all workers, so no explicit shutdown() call is needed.
    benchmark_timings = dict(
        finished.result() for finished in futures.as_completed(pending)
    )

[FAST] csc(): 13.43
[FAST] coo(): 13.44
[FAST] lil(): 13.62
[FAST] dok(): 16.37
[TOO LONG] matmul(): 4457.03
