# Keyword Extraction using TF_IDF

Import Packages

In [1]:
from nltk import tokenize

In [2]:
from operator import itemgetter

In [3]:
import math

Declare Variables

In [5]:
doc = 'Data Science Graduate with 2+ years of professional experience in building Machine Learning Models. Coursera certified associate and an enthusiastic team player adept at enhancing predictive modeling, data processing, data mining algorithms and passionate about using my analytical, statistical, and programming skills to collect, analyze, and interpret large data sets, so that by using this information to develop data-driven solutions to complicated business challenges. Worked with many machine learning frameworks such as TensorFlow, Keras, and Scikit-Learn.'

Remove stopwords

In [6]:
import nltk

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english'))

Finding Total words in document -- Term Frequency

In [8]:
total_words = doc.split()

In [9]:
total_words

['Data',
 'Science',
 'Graduate',
 'with',
 '2+',
 'years',
 'of',
 'professional',
 'experience',
 'in',
 'building',
 'Machine',
 'Learning',
 'Models.',
 'Coursera',
 'certified',
 'associate',
 'and',
 'an',
 'enthusiastic',
 'team',
 'player',
 'adept',
 'at',
 'enhancing',
 'predictive',
 'modeling,',
 'data',
 'processing,',
 'data',
 'mining',
 'algorithms',
 'and',
 'passionate',
 'about',
 'using',
 'my',
 'analytical,',
 'statistical,',
 'and',
 'programming',
 'skills',
 'to',
 'collect,',
 'analyze,',
 'and',
 'interpret',
 'large',
 'data',
 'sets,',
 'so',
 'that',
 'by',
 'using',
 'this',
 'information',
 'to',
 'develop',
 'data-driven',
 'solutions',
 'to',
 'complicated',
 'business',
 'challenges.',
 'Worked',
 'with',
 'many',
 'machine',
 'learning',
 'frameworks',
 'such',
 'as',
 'TensorFlow,',
 'Keras,',
 'and',
 'Scikit-Learn.']

In [10]:
total_word_length = len(total_words)

In [11]:
total_word_length

76

Find Total number of sentences -- Inverse Document Frequency

In [13]:
total_sentences = tokenize.sent_tokenize(doc)

In [15]:
total_sentences

['Data Science Graduate with 2+ years of professional experience in building Machine Learning Models.',
 'Coursera certified associate and an enthusiastic team player adept at enhancing predictive modeling, data processing, data mining algorithms and passionate about using my analytical, statistical, and programming skills to collect, analyze, and interpret large data sets, so that by using this information to develop data-driven solutions to complicated business challenges.',
 'Worked with many machine learning frameworks such as TensorFlow, Keras, and Scikit-Learn.']

In [16]:
total_sent_len = len(total_sentences)

In [17]:
total_sent_len

3

Calculate TF for each word

In [18]:
tf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in tf_score:
            tf_score[each_word] += 1
        else:
            tf_score[each_word] = 1

# Dividing by total_word_length for each dictionary element
tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
print(tf_score)

{'Data': 0.013157894736842105, 'Science': 0.013157894736842105, 'Graduate': 0.013157894736842105, '2+': 0.013157894736842105, 'years': 0.013157894736842105, 'professional': 0.013157894736842105, 'experience': 0.013157894736842105, 'building': 0.013157894736842105, 'Machine': 0.013157894736842105, 'Learning': 0.013157894736842105, 'Models': 0.013157894736842105, 'Coursera': 0.013157894736842105, 'certified': 0.013157894736842105, 'associate': 0.013157894736842105, 'enthusiastic': 0.013157894736842105, 'team': 0.013157894736842105, 'player': 0.013157894736842105, 'adept': 0.013157894736842105, 'enhancing': 0.013157894736842105, 'predictive': 0.013157894736842105, 'modeling,': 0.013157894736842105, 'data': 0.039473684210526314, 'processing,': 0.013157894736842105, 'mining': 0.013157894736842105, 'algorithms': 0.013157894736842105, 'passionate': 0.013157894736842105, 'using': 0.02631578947368421, 'analytical,': 0.013157894736842105, 'statistical,': 0.013157894736842105, 'programming': 0.01

Function to check if the word is present in a sentence list

In [19]:
def check_sent(word, sentences): 
    final = [all([w in x for w in word]) for x in sentences] 
    sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
    return int(len(sent_len))

Calculate IDF for each word

In [20]:
idf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in idf_score:
            idf_score[each_word] = check_sent(each_word, total_sentences)
        else:
            idf_score[each_word] = 1

# Performing a log and divide
idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

print(idf_score)

{'Data': 1.0986122886681098, 'Science': 1.0986122886681098, 'Graduate': 1.0986122886681098, '2+': 1.0986122886681098, 'years': 1.0986122886681098, 'professional': 1.0986122886681098, 'experience': 1.0986122886681098, 'building': 1.0986122886681098, 'Machine': 1.0986122886681098, 'Learning': 1.0986122886681098, 'Models': 1.0986122886681098, 'Coursera': 1.0986122886681098, 'certified': 1.0986122886681098, 'associate': 1.0986122886681098, 'enthusiastic': 1.0986122886681098, 'team': 1.0986122886681098, 'player': 1.0986122886681098, 'adept': 1.0986122886681098, 'enhancing': 1.0986122886681098, 'predictive': 1.0986122886681098, 'modeling,': 1.0986122886681098, 'data': 0.0, 'processing,': 1.0986122886681098, 'mining': 1.0986122886681098, 'algorithms': 1.0986122886681098, 'passionate': 1.0986122886681098, 'using': 0.0, 'analytical,': 1.0986122886681098, 'statistical,': 1.0986122886681098, 'programming': 1.0986122886681098, 'skills': 1.0986122886681098, 'collect,': 1.0986122886681098, 'analyze,

Calculate TF * IDF

In [21]:
tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
print(tf_idf_score)

{'Data': 0.014455424850896181, 'Science': 0.014455424850896181, 'Graduate': 0.014455424850896181, '2+': 0.014455424850896181, 'years': 0.014455424850896181, 'professional': 0.014455424850896181, 'experience': 0.014455424850896181, 'building': 0.014455424850896181, 'Machine': 0.014455424850896181, 'Learning': 0.014455424850896181, 'Models': 0.014455424850896181, 'Coursera': 0.014455424850896181, 'certified': 0.014455424850896181, 'associate': 0.014455424850896181, 'enthusiastic': 0.014455424850896181, 'team': 0.014455424850896181, 'player': 0.014455424850896181, 'adept': 0.014455424850896181, 'enhancing': 0.014455424850896181, 'predictive': 0.014455424850896181, 'modeling,': 0.014455424850896181, 'data': 0.0, 'processing,': 0.014455424850896181, 'mining': 0.014455424850896181, 'algorithms': 0.014455424850896181, 'passionate': 0.014455424850896181, 'using': 0.0, 'analytical,': 0.014455424850896181, 'statistical,': 0.014455424850896181, 'programming': 0.014455424850896181, 'skills': 0.014

Create a function to get N important words in the document

In [22]:
def get_top_n(dict_elem, n):
    result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
    return result

Get the top 5 words of significance

In [23]:
print(get_top_n(tf_idf_score, 5))

{'Data': 0.014455424850896181, 'Science': 0.014455424850896181, 'Graduate': 0.014455424850896181, '2+': 0.014455424850896181, 'years': 0.014455424850896181}
