description

In [9]:
import numpy as np
from nltk.probability import *
from itertools import chain

def stats_print(tk_description):
    words = list(chain.from_iterable(tk_description)) # we put all the tokens in the corpus in a single list
    vocab = set(words) # compute the vocabulary by converting the list of words/tokens to a set, i.e., giving a set of unique words
    lexical_diversity = len(vocab)/len(words)
    print("Vocabulary size: ",len(vocab))
    print("Total number of tokens: ", len(words))
    print("Lexical diversity: ", lexical_diversity)
    print("Total number of description:", len(tk_description))
    lens = [len(article) for article in tk_description]
    print("Average description length:", np.mean(lens))
    print("Maximun description length:", np.max(lens))
    print("Minimun description length:", np.min(lens))
    print("Standard deviation of description length:", np.std(lens))


## Importing libraries 

In [2]:
from itertools import chain
import pandas as pd

# Code to import libraries as you need in this assessment, e.g.,
# Read job_ad.csv
job_ad = pd.read_csv('job_ad.csv')
# print first 3 rows
job_ad.head(3)
# get the description of the job ad
description = job_ad['Description']
# get the tokenized description of the job ad
tk_description = job_ad['Tokenized Description']
webindex = job_ad['Webindex']
vocab = sorted(list(chain.from_iterable(tk_description)))
print(tk_description)
len(vocab)



0      accountant partqualified south east london cli...
1      leading hedge funds london recruiting fund acc...
2      exciting opportunity arisen join establish pro...
3      onetwotrade expanding sales team recruiting ju...
4      rgn nurses hospitals fulltime part timehours s...
                             ...                        
771    apply today start tomorrow sales money career ...
772    main purpose job perform range mechanical asse...
773    sales associate hip knee orthopaedics market l...
774    mobile super optometrist required join uk lead...
775    title field sales executive office supplies so...
Name: Tokenized Description, Length: 776, dtype: object


899091

In [10]:
descriptionFile = './description.txt'
with open(descriptionFile) as f:
    tk_descriptions = f.read().splitlines() # read all the descriptions into a list

In [11]:
print(len(tk_descriptions))
emp = 10
tk_descriptions[emp]

776


'client based eastleigh investments treasury controller join team duties include responsibility transactional management analysis oversight company investment portfolio including compliance relevant sections relevant policies ensure working capital liquid resources cashflow managed efficiently deliver consistently relevant kpis kris analysing shortfalls putting action plans place remediate process issues manage day day relationships company outsourced investment managers custodians ensuring mutual understanding operations systems developments business transacted efficiently effectively endtoend investment processes ensuring processes procedures risks controls documented effective efficient regularly review test processes controls accordance finance control risk framework skills experience ability build relationships stakeholders levels internal external challenge assumptions positively bring people journey strong written communication skills fluent articulate confident spoken communica

#### Converting each description text string into list of tokens

In [12]:
tk_descriptions = [description.split(" ") for description in tk_descriptions] # note that we have to revert the join string into list of tokens
tk_descriptions[emp]

['client',
 'based',
 'eastleigh',
 'investments',
 'treasury',
 'controller',
 'join',
 'team',
 'duties',
 'include',
 'responsibility',
 'transactional',
 'management',
 'analysis',
 'oversight',
 'company',
 'investment',
 'portfolio',
 'including',
 'compliance',
 'relevant',
 'sections',
 'relevant',
 'policies',
 'ensure',
 'working',
 'capital',
 'liquid',
 'resources',
 'cashflow',
 'managed',
 'efficiently',
 'deliver',
 'consistently',
 'relevant',
 'kpis',
 'kris',
 'analysing',
 'shortfalls',
 'putting',
 'action',
 'plans',
 'place',
 'remediate',
 'process',
 'issues',
 'manage',
 'day',
 'day',
 'relationships',
 'company',
 'outsourced',
 'investment',
 'managers',
 'custodians',
 'ensuring',
 'mutual',
 'understanding',
 'operations',
 'systems',
 'developments',
 'business',
 'transacted',
 'efficiently',
 'effectively',
 'endtoend',
 'investment',
 'processes',
 'ensuring',
 'processes',
 'procedures',
 'risks',
 'controls',
 'documented',
 'effective',
 'efficient'

#### Explore the current statistics

In [13]:
stats_print(tk_descriptions)

Vocabulary size:  9423
Total number of tokens:  107751
Lexical diversity:  0.08745162457889022
Total number of description: 776
Average description length: 138.85438144329896
Maximun description length: 489
Minimun description length: 12
Standard deviation of description length: 73.42099464751045


#### Reading the corresponding category labels

In [14]:
categoryFile = './category.txt'
with open(categoryFile) as f:
    category = f.read().splitlines() # read all the category into a list

#### Making sure we done it right
Take an example, e.g., the 10th element

In [15]:
emp = 10
print(len(category))
category[emp]

776


'0'

Convert the loaded category labels to integers:

In [16]:
category = [int(s) for s in category]

In [17]:
sum(category) # seeing the total number of

1095

## Task 2. Processing the Most and Less Frequent Words¶

In task 2, you are required to write codes to explore and handle the most and less frequent words.

### Task 2.1 Most Frequent Words

In this subtask, you will write code to explore the most frequent words in the pre-processed tokenized review text corpus. You will need to:
* explore the most frequent words (top 25) based on term frequency and document frequency, respectively
* compare the results using different frequency measurements, which words are extracted based on both frequency measurements?
* think and decide on whether or not you would remove some of the most frequent words

In [55]:
from nltk.probability import *
from itertools import chain

words = list(chain.from_iterable(tk_description)) # we put all the tokens in the corpus in a single list

### Most frequent words w.r.t. Term Frequency
We first explore the most frequent words in terms of term frequency:

In [56]:
term_fd = FreqDist(words) # compute term frequency for each unique word/type

In [57]:
term_fd.most_common(25)

[('film', 10379),
 ('movie', 6622),
 ('one', 5709),
 ('not', 5531),
 ('character', 3716),
 ('like', 3658),
 ('time', 2851),
 ('get', 2785),
 ('scene', 2648),
 ('make', 2584),
 ('even', 2558),
 ('no', 2414),
 ('good', 2340),
 ('story', 2289),
 ('would', 2043),
 ('much', 2024),
 ('also', 1965),
 ('see', 1864),
 ('way', 1856),
 ('two', 1827),
 ('life', 1813),
 ('first', 1768),
 ('go', 1723),
 ('well', 1669),
 ('thing', 1651)]

### Most frequent words w.r.t. Document Frequency
We then explore the most frequent words in terms of document frequency:

In [58]:
words_2 = list(chain.from_iterable([set(review) for review in tk_description]))
doc_fd = FreqDist(words_2)  # compute document frequency for each unique word/type
doc_fd.most_common(25)

[('film', 1772),
 ('one', 1760),
 ('not', 1672),
 ('movie', 1630),
 ('like', 1491),
 ('character', 1412),
 ('time', 1382),
 ('get', 1317),
 ('make', 1309),
 ('even', 1291),
 ('no', 1189),
 ('scene', 1166),
 ('good', 1156),
 ('much', 1128),
 ('would', 1113),
 ('story', 1091),
 ('way', 1073),
 ('also', 1070),
 ('go', 1037),
 ('two', 1025),
 ('first', 1019),
 ('see', 1015),
 ('well', 991),
 ('take', 983),
 ('come', 967)]

The list seems quite similar, let's what are in common and what are different based on the two frequency measurements.

In [59]:
tf_words = set(w[0] for w in term_fd.most_common(25))
df_words = set(w[0] for w in doc_fd.most_common(25))

tf_words.union(df_words) # frequent words in both measurements

{'also',
 'character',
 'come',
 'even',
 'film',
 'first',
 'get',
 'go',
 'good',
 'life',
 'like',
 'make',
 'movie',
 'much',
 'no',
 'not',
 'one',
 'scene',
 'see',
 'story',
 'take',
 'thing',
 'time',
 'two',
 'way',
 'well',
 'would'}

In [60]:
# words are most frequent based on term frequence, but not document frequence
tf_words.difference(df_words)

{'life', 'thing'}

In [61]:
# words are most frequent based on document frequence, but not term frequence
df_words.difference(tf_words)

{'come', 'take'}

Indeed, most of thes words seems to have a bit of taste (indication on sentiment).
We decided not to remove them.

### Task 2.2 Less Frequent Words

Now, let's move on to the less frequent words. In this subtask, you are required to:
* find out the list of words that appear only once in the **entire corpus**
* remove these less frequent words from each tokenized review text


We first need to find out the set of less frequent words by using the `hapaxes` function applied on the **term frequency** dictionary.

In [62]:
lessFreqWords = set(term_fd.hapaxes())
lessFreqWords

{"assailant's",
 'top-down',
 'biloxi',
 'almost-subliminal',
 'optic',
 'informal',
 'mustard',
 'abstraction',
 'urinary',
 'urbanite',
 'fondest',
 'pleasingly',
 'genial',
 'lestercorp',
 'commensurately',
 "rabin's",
 'curry-spiced',
 'brio',
 'haise',
 'springfield',
 'dennison',
 'whiz-kid',
 'schwinn',
 'faddish',
 'assent',
 'moonlighted',
 'zophres',
 'action-oriented',
 'genovia',
 'prospecting',
 'someway',
 "debney's",
 "rafe's",
 'lodger',
 'unpardonable',
 'dexterous',
 'babe-a',
 'ice-cold',
 "tristen's",
 'beliefs-eloquently',
 'consciouness',
 'funniest-',
 'leathery',
 'discarged',
 'youull',
 'fronted',
 'trumpeting',
 'lesabre',
 'dully',
 "joker's",
 "biebe's",
 'live-',
 'schweig',
 'clemente',
 'miata',
 're-inforced',
 'felony-studded',
 'barby',
 "rule'",
 'douse',
 'enmity',
 'limbaugh',
 "in'",
 'femke',
 'goth-style',
 "epp's",
 'reserved-way',
 'detrimentally',
 'slogging',
 'nicjolson',
 'uta',
 'penile',
 'heart-felt',
 'vexatiousness',
 'stater',
 'back

In [63]:
len(lessFreqWords)

18933

Oh, a lot!!! Many of them appear to be quite ad hoc.
Let's remove them:

In [64]:
def removeLessFreqWords(review):
    return [w for w in review if w not in lessFreqWords]

tk_description = [removeLessFreqWords(review) for review in tk_description]

In [65]:
stats_print(tk_description)

Vocabulary size:  25197
Total number of tokens:  694956
Lexical diversity:  0.036256971664393144
Total number of description: 2002
Average description length: 347.13086913086914
Maximun description length: 1329
Minimun description length: 8
Standard deviation of description length: 150.9827244192884


## Task 3. Finding Bigrams

In this task, you are required to explore the bigrams (top 25) in the pre-processed review text. You are also required to write code to include the bigrams that you think make sense in to the vocabulary.

Finding the list of top 25 bigrams:

In [66]:
from nltk.util import ngrams
bigrams = ngrams(words, n = 2)
fdbigram = FreqDist(bigrams)

In [67]:
bigrams = fdbigram.most_common(25) # top 25 bigrams
bigrams

[(('special', 'effect'), 388),
 (('look', 'like'), 257),
 (('new', 'york'), 244),
 (('even', 'though'), 222),
 (('no', 'one'), 194),
 (('bad', 'guy'), 184),
 (('high', 'school'), 175),
 (('film', 'like'), 171),
 (("i'm", 'not'), 171),
 (('take', 'place'), 169),
 (('film', 'not'), 164),
 (('star', 'war'), 159),
 (('one', 'best'), 141),
 (('main', 'character'), 135),
 (('horror', 'film'), 133),
 (('movie', 'like'), 131),
 (('one', 'thing'), 130),
 (('action', 'film'), 128),
 (('science', 'fiction'), 126),
 (('not', 'even'), 126),
 (('year', 'old'), 124),
 (("don't", 'know'), 123),
 (('movie', 'not'), 121),
 (('action', 'sequence'), 120),
 (('year', 'ago'), 120)]

In [68]:
rep_patterns = [" ".join(bg[0]) for bg in bigrams]
rep_patterns

['special effect',
 'look like',
 'new york',
 'even though',
 'no one',
 'bad guy',
 'high school',
 'film like',
 "i'm not",
 'take place',
 'film not',
 'star war',
 'one best',
 'main character',
 'horror film',
 'movie like',
 'one thing',
 'action film',
 'science fiction',
 'not even',
 'year old',
 "don't know",
 'movie not',
 'action sequence',
 'year ago']

Most of them make sense and constructed meaningful phase, except `film like`,`film not`,`movie like`,`movie not`.
Therefore, we will include all the bigrams in the vocabulary, except the above mentioned ones.

In [69]:
filtered = ['film like','film not','movie like','movie not'] # define a list of bigrams that we won't include
rep_patterns = [bg for bg in rep_patterns if bg not in filtered] # create a list of bigrams that we want to include
rep_patterns

['special effect',
 'look like',
 'new york',
 'even though',
 'no one',
 'bad guy',
 'high school',
 "i'm not",
 'take place',
 'star war',
 'one best',
 'main character',
 'horror film',
 'one thing',
 'action film',
 'science fiction',
 'not even',
 'year old',
 "don't know",
 'action sequence',
 'year ago']

In [70]:
replacements = [bg.replace(" ","_") for bg in rep_patterns] # convert the format of bigram into word1_word2
replacements

['special_effect',
 'look_like',
 'new_york',
 'even_though',
 'no_one',
 'bad_guy',
 'high_school',
 "i'm_not",
 'take_place',
 'star_war',
 'one_best',
 'main_character',
 'horror_film',
 'one_thing',
 'action_film',
 'science_fiction',
 'not_even',
 'year_old',
 "don't_know",
 'action_sequence',
 'year_ago']

In the following, we join each tokenized review text, and replace the bigrams with the format 'word1_word2', and then we re-tokenized them again into list of tokens. As such, each bigram that we want to include in the vocabulary will become a single token.

In [71]:
import re
tk_description = [" ".join(review) for review in tk_description] # construct the review string

for i in range(0, len(tk_description)):
    for j in  range(0,len(rep_patterns)):
        tk_description[i] = re.sub(rep_patterns[j], replacements[j], tk_description[i]) # replace with bigram representation

tk_description = [review.split(" ") for review in tk_description] # convert back to tokenised review

Have a look at the stats again :)

In [72]:
stats_print(tk_description)

Vocabulary size:  25267
Total number of tokens:  691252
Lexical diversity:  0.036552516303750296
Total number of description: 2002
Average description length: 345.28071928071927
Maximun description length: 1327
Minimun description length: 7
Standard deviation of description length: 150.14967263929825


## Task 4. Constructing the Vocabulary

Now, we complete all the basic pre-process step and we are ready to move to feature generation! &#129321;
Before we start, in this task, you are required to construct the final vocabulary, e.g., `vocab`:

In [73]:
# generating the vocabulary

words = list(chain.from_iterable(tk_description)) # we put all the tokens in the corpus in a single list
vocab = sorted(list(set(words))) # compute the vocabulary by converting the list of words/tokens to a set, i.e., giving a set of unique words

len(vocab)

25267

## Task 5. Generating Feature Vectors

In this task, we are going to generate feature vectors from tokenized review text. We are going to explore different feature vectors, including binary, count, and tf-idf vectors.

### Task 5.1 Generating Binary Vectors
In this subtask, let's start with generating the binary vector representation for each review.

We need to first import the `CountVectorizer` and initialise it.

In [74]:
# binding the words together for each review
joined_reviews = [' '.join(review) for review in tk_description]

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
bVectorizer = CountVectorizer(analyzer = "word",binary = True,vocabulary = vocab) # initialise the CountVectorizer

In [76]:
binary_features = bVectorizer.fit_transform(joined_reviews)
binary_features.shape



(2002, 25267)

### Task 5.2 Generating Count Vectors

In this subtasks, you are required to generate the count vector features of review texts.

In [77]:
cVectorizer = CountVectorizer(analyzer = "word",vocabulary = vocab) # initialised the CountVectorizer
count_features = cVectorizer.fit_transform(joined_reviews)
count_features.shape

(2002, 25267)

### Task 5.3 Generating TF-IDF Vectors

In this subtasks, you are required to generate the count vector features of review texts.

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
tVectorizer = TfidfVectorizer(analyzer = "word",vocabulary = vocab) # initialised the TfidfVectorizer
tfidf_features = tVectorizer.fit_transform(joined_reviews) # generate the tfidf vector representation for all articles
tfidf_features.shape

(2002, 25267)

In [79]:
joined_reviews

["by-the number film introduces character situation dilemma development we've seen parade film film easily guessed end frame number one film packed cap predictability leading little tension excitement suspense interest part paying audience short clich ridden formula film welcome review general's daughter plot undercover army detective rape counselor find locked inside investigation bigwig general daughter's rape torture murder must delve unspoken army rule figure conspiracy behind shocking murder critique number see film sits big screen couple hour float around go away hopefully never heard predictable even blind man could see plot point coming mile away suspenseful leaf dropping tree action-packed canadian tournament get picture sure bad took friend le two minute figure entire plot break scene even completed easy pie unfortunate james wood john travolta actually one extremely enjoyable scene together near beginning film ala not scene alone scored two four point wood chew scene he's tr

In [80]:
tfidf_features

<2002x25267 sparse matrix of type '<class 'numpy.float64'>'
	with 501993 stored elements in Compressed Sparse Row format>

<h3 style="color:#ffc0cb;font-size:50px;font-family:Georgia;text-align:center;"><strong>Task 2. Generating Feature Representations</strong></h3>

So let's say we do binary feature representation but with 3 types of data, the title, the description, and title+description.

In [5]:
from collections import Counter


"""
Bag-of-words model:
Generate the Count vector representation for each job advertisement description, and save
them into a file (please refer to the required output). Note, the generated Count vector
representation must be based on the generated vocabulary in Task 1 (as saved in vocab.txt).
"""
# bag of words model
def bag_of_words(description, vocab):
    # create a list of 0s with the same length as the vocab
    bow = [0] * len(vocab)
    # count the number of times each word appears in the description
    word_counts = Counter(description)
    # update the bow list with the word counts
    for word, count in word_counts.items():
        bow[vocab.index(word)] = count
    return bow

# Generate the Count vector representation for each job advertisement description
bow = [bag_of_words(description, vocab) for description in tk_description]

In [6]:
bow

[[67,
  0,
  41,
  2,
  42,
  23,
  54,
  8,
  8,
  8,
  40,
  0,
  1,
  30,
  12,
  46,
  40,
  15,
  3,
  28,
  34,
  42,
  23,
  2,
  0,
  2,
  6,
  0],
 [66,
  0,
  35,
  5,
  29,
  23,
  46,
  13,
  11,
  5,
  33,
  2,
  1,
  15,
  4,
  54,
  28,
  10,
  2,
  27,
  27,
  37,
  31,
  7,
  4,
  0,
  2,
  0],
 [104,
  0,
  50,
  12,
  18,
  27,
  117,
  5,
  27,
  16,
  62,
  7,
  7,
  26,
  26,
  53,
  44,
  25,
  3,
  66,
  55,
  40,
  14,
  10,
  8,
  6,
  10,
  0],
 [49,
  0,
  24,
  5,
  17,
  14,
  44,
  3,
  8,
  2,
  31,
  1,
  3,
  16,
  7,
  34,
  27,
  10,
  2,
  28,
  27,
  28,
  11,
  2,
  4,
  1,
  4,
  0],
 [122,
  0,
  47,
  9,
  34,
  25,
  121,
  19,
  19,
  12,
  72,
  1,
  2,
  42,
  32,
  75,
  60,
  40,
  5,
  68,
  85,
  64,
  41,
  11,
  6,
  3,
  15,
  0],
 [281,
  2,
  135,
  84,
  82,
  62,
  183,
  15,
  33,
  22,
  144,
  1,
  13,
  85,
  44,
  207,
  110,
  121,
  6,
  127,
  189,
  138,
  48,
  12,
  12,
  9,
  19,
  0],
 [126,
  0,
  74,
  7,
  40,
  2

<h3 style="color:#ffc0cb;font-size:50px;font-family:Georgia;text-align:center;"><strong>2.1 Saving outputs</strong></h3>

Save the count vector representation as per spectification.
- `count_vectors.txt`

`count_vectors.txt` stores the sparse count vector representation of job advertisement descriptions in the following format. Each line of this file corresponds to one advertisement. It starts with a ‘#’ key followed by the webindex of the job advertisement, and a comma ‘,’. The rest of the line is the sparse representation of the corresponding description in the form of word_integer_index:word_freq separated by comma. Following is an example of the file format.

In [10]:
# save count vector representation of job advertisement descriptions
with open('count_vectors.txt', 'w') as f:
    for i, description in enumerate(tk_description):
        f.write('#' + str(webindex[i]) + ',')
        for word in description:
            f.write(str(vocab.index(word)) + ':' + str(bow[i][vocab.index(word)]) + ',')
        f.write('\n')
    print("Successfully write count vector representation of job advertisement descriptions into count_vectors.txt file")

Successfully write count vector representation of job advertisement descriptions in txt file


<h3 style="color:#ffc0cb;font-size:50px;font-family:Georgia;text-align:center;"><strong>Task 3. Job Advertisement Classification</strong></h3>

...... Sections and code blocks on buidling classification models based on different document feature represetations. 
Detailed comparsions and evaluations on different models to answer each question as per specification. 

<span style="color: red"> You might have complex notebook structure in this section, please feel free to create your own notebook structure. </span>

In [None]:
# Code to perform the task...


In [2]:
import os

# The .py format of the jupyter notebook
for fname in os.listdir():
    if fname.endswith('ipynb'):
        os.system(f'jupyter nbconvert {fname} --to python')

[NbConvertApp] Converting notebook task1.ipynb to python
[NbConvertApp] Writing 20584 bytes to task1.py
[NbConvertApp] Converting notebook task2_3.ipynb to python
[NbConvertApp] Writing 7821 bytes to task2_3.py


<h3 style="color:#ffc0cb;font-size:50px;font-family:Georgia;text-align:center;"><strong>Summary</strong></h3>
Give a short summary and anything you would like to talk about the assessment tasks here.

## Couple of notes for all code blocks in this notebook
- please provide proper comment on your code
- Please re-start and run all cells to make sure codes are runable and include your output in the submission.   
<span style="color: red"> This markdown block can be removed once the task is completed. </span>