In [116]:
import pandas as pd

In [117]:
# -*- coding: UTF-8 -*-
"""Resume Analysis Module."""

import os
import string

# Counter is used later in the program
from collections import Counter

# Location of the resume to analyse: a bare file name, so it is resolved
# against the current working directory when opened.
resume_path = "resume.md"


In [118]:
# Lower-case skill keywords that are intersected with the resume's word set.
# Must-have skills:
REQUIRED_SKILLS = {
    "excel",
    "python",
    "mysql",
    "statistics",
}
# Nice-to-have skills:
DESIRED_SKILLS = {
    "r",
    "git",
    "html",
    "css",
    "leaflet",
}

In [119]:
# Helper that loads a text file as a list of lower-cased tokens
def load_file(filepath):
    """Read a text file and return its lower-cased, whitespace-split tokens.

    Args:
        filepath: Path of the text file to read.

    Returns:
        list[str]: The file content converted to lower case and split on
        runs of whitespace. Punctuation attached to a word is kept
        (e.g. ``"python,"``). An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so the same file produces the same tokens on every machine.
    with open(filepath, "r", encoding="utf-8") as resume_file_handler:
        return resume_file_handler.read().lower().split()

In [120]:
# Tokenise the resume into lower-cased, whitespace-separated words
word_list = load_file(resume_path)

# Collect the unique words of the resume. Each token is cut at its first
# comma and then at its first period, keeping only the leading part
# (so "css," becomes "css" and "leaflet.js" becomes "leaflet").
resume = {token.partition(',')[0].partition('.')[0] for token in word_list}

In [121]:
# Discard entries that are exactly one punctuation character; longer
# punctuation-only entries such as "##" are not affected by this step.
punctuation = set(string.punctuation)
resume = resume.difference(punctuation)
print(resume)

{'advanced', 'css', 'excel', 'media', 'd3', 'hadoop', 'designing', 'working', 'javascript', 'from', 'boot', 'developing', 'mining', 'web', 'html/css', 'learning', 'experience', 'graduate', 'modeling', 'and', 'social', 'bootstrap', 'interests', 'leaflet', 'data', 'machine', 'in', 'open-source', 'sets', 'apis', 'statistics', 'writing', 'python', 'basic', 'pandas', 'cloud', 'aws', 'mongodb', 'sql', 'tableau', 'visualizations', 'creating', 'forecasting', 'business', 'dartling', 'git/github', 'scripts', 'pivot', 'camp', 'big', 'front-end', 'visualization', 'performing', 'interactions', 'with', 'd', 'to', 'microsoft', 'tables', 'using', 'algorithms', 'software', 'vba', 'the', 'intelligence', 'analytics', 'files', 'api', 'analyze', 'skills', 'education', '##', 'dartanion', 'mysql', 'html', 'r', 'contributing', 'databases', 'apps'}


In [122]:
# Report which required skills appear in the resume (set intersection)
print("REQUIRED SKILLS", "=============", resume & REQUIRED_SKILLS, sep="\n")

REQUIRED SKILLS
{'statistics', 'excel', 'mysql', 'python'}


In [123]:
# Report which desired skills appear in the resume (set intersection)
print("DESIRED SKILLS", "=============", resume & DESIRED_SKILLS, sep="\n")

DESIRED SKILLS
{'leaflet', 'css', 'r', 'html'}


In [124]:
# Resume Word Count
# ==========================
# Give every distinct token of the resume a starting count of zero
word_count = dict.fromkeys(word_list, 0)


In [125]:
# Tally one occurrence per token. Every token already has a zero entry in
# word_count, so the lookup cannot fail. The bare name on the last line
# makes the notebook display the finished tally.
for word in word_list:
    word_count[word] = word_count[word] + 1
word_count

{'#': 1,
 'dartanion': 1,
 'd.': 1,
 'dartling': 1,
 '##': 4,
 'education': 1,
 '*': 15,
 'data': 7,
 'analytics': 3,
 'and': 8,
 'visualization': 2,
 'boot': 1,
 'camp': 1,
 'graduate': 1,
 'experience': 1,
 'creating': 1,
 'pivot': 1,
 'tables': 1,
 'vba': 1,
 'scripts': 2,
 'in': 2,
 'excel.': 1,
 'modeling': 1,
 'forecasting': 1,
 'using': 5,
 'basic': 1,
 'statistics': 1,
 'writing': 1,
 'python': 3,
 'to': 2,
 'analyze': 1,
 'sets': 1,
 'from': 1,
 'files': 1,
 'apis.': 1,
 'social': 2,
 'media': 2,
 'mining': 1,
 'working': 3,
 'with': 6,
 'mysql': 1,
 'mongodb': 1,
 'databases': 1,
 'developing': 1,
 'front-end': 1,
 'web': 2,
 'visualizations': 1,
 'html,': 2,
 'css,': 2,
 'bootstrap,': 1,
 'd3,': 1,
 'leaflet.js': 1,
 'the': 2,
 'tableau': 1,
 'business': 1,
 'intelligence': 1,
 'software': 2,
 'performing': 1,
 'big': 2,
 'hadoop': 1,
 'machine': 2,
 'learning': 1,
 'algorithms': 1,
 'skills': 1,
 'microsoft': 1,
 'excel,': 1,
 'python,': 1,
 'javascript,': 2,
 'html/css,': 

In [126]:
# The same tally, this time delegated to collections.Counter
word_counter = Counter()
word_counter.update(word_list)
print(word_counter)

Counter({'*': 15, 'and': 8, 'data': 7, 'with': 6, 'using': 5, '##': 4, 'analytics': 3, 'python': 3, 'working': 3, 'visualization': 2, 'scripts': 2, 'in': 2, 'to': 2, 'social': 2, 'media': 2, 'web': 2, 'html,': 2, 'css,': 2, 'the': 2, 'software': 2, 'big': 2, 'machine': 2, 'javascript,': 2, '#': 1, 'dartanion': 1, 'd.': 1, 'dartling': 1, 'education': 1, 'boot': 1, 'camp': 1, 'graduate': 1, 'experience': 1, 'creating': 1, 'pivot': 1, 'tables': 1, 'vba': 1, 'excel.': 1, 'modeling': 1, 'forecasting': 1, 'basic': 1, 'statistics': 1, 'writing': 1, 'analyze': 1, 'sets': 1, 'from': 1, 'files': 1, 'apis.': 1, 'mining': 1, 'mysql': 1, 'mongodb': 1, 'databases': 1, 'developing': 1, 'front-end': 1, 'visualizations': 1, 'bootstrap,': 1, 'd3,': 1, 'leaflet.js': 1, 'tableau': 1, 'business': 1, 'intelligence': 1, 'performing': 1, 'hadoop': 1, 'learning': 1, 'algorithms': 1, 'skills': 1, 'microsoft': 1, 'excel,': 1, 'python,': 1, 'html/css,': 1, 'api': 1, 'interactions,': 1, 'mining,': 1, 'sql,': 1, 'h

In [127]:
# Sanity check: the hand-rolled tally and the Counter should hold equal counts
print(word_counter == word_count)

True


In [128]:
# Section header for the "Top 10 Words" report
print("Top 10 Words", "=============", sep="\n")

Top 10 Words


In [130]:
# Two-stage punctuation clean-up of the token list.

# Stage 1 - whole tokens: drop every token that occurs as a substring of
# string.punctuation (in practice lone marks such as "#" or "*").
# A token like "##" is not such a substring, so it survives this stage.
word_list = [token for token in word_list if token not in string.punctuation]
print('\nWORD LIST AFTER PUNCTUATION REMOVAL')
print(word_list)

# Stage 2 - characters: delete every punctuation character inside the
# remaining tokens ("excel." -> "excel", "front-end" -> "frontend").
# A token made up only of punctuation turns into the empty string "".
strip_punctuation = str.maketrans('', '', string.punctuation)
word_list = [token.translate(strip_punctuation) for token in word_list]
print('\nWORD LIST AFTER CHARACTER PUNCTUATION REMOVAL')
print(word_list)


WORD LIST AFTER PUNCTUATION REMOVAL
['dartanion', 'd.', 'dartling', '##', 'education', 'data', 'analytics', 'and', 'visualization', 'boot', 'camp', 'graduate', '##', 'experience', 'creating', 'pivot', 'tables', 'and', 'vba', 'scripts', 'in', 'excel.', 'modeling', 'and', 'forecasting', 'data', 'using', 'basic', 'statistics', 'writing', 'python', 'scripts', 'to', 'analyze', 'data', 'sets', 'from', 'files', 'and', 'apis.', 'social', 'media', 'mining', 'using', 'python', 'working', 'with', 'mysql', 'and', 'mongodb', 'databases', 'developing', 'front-end', 'web', 'visualizations', 'using', 'html,', 'css,', 'bootstrap,', 'd3,', 'and', 'leaflet.js', 'using', 'the', 'tableau', 'business', 'intelligence', 'software', 'performing', 'big', 'data', 'analytics', 'with', 'hadoop', 'working', 'with', 'machine', 'learning', 'algorithms', '##', 'skills', 'microsoft', 'excel,', 'python,', 'javascript,', 'html/css,', 'api', 'interactions,', 'social', 'media', 'mining,', 'sql,', 'hadoop,', 'tableau,', 'a

In [132]:
# Clean Stop Words
stop_words = ["and", "with", "using", "##", "working", "in", "to"]
# Keep a token only if it is non-empty and is not one of the stop words
word_list = [word for word in word_list if word and word not in stop_words]
print('\nWORD LIST AFTER STOP WORDS')
print(word_list)


WORD LIST AFTER STOP WORDS
['dartanion', 'd', 'dartling', 'education', 'data', 'analytics', 'visualization', 'boot', 'camp', 'graduate', 'experience', 'creating', 'pivot', 'tables', 'vba', 'scripts', 'excel', 'modeling', 'forecasting', 'data', 'basic', 'statistics', 'writing', 'python', 'scripts', 'analyze', 'data', 'sets', 'from', 'files', 'apis', 'social', 'media', 'mining', 'python', 'mysql', 'mongodb', 'databases', 'developing', 'frontend', 'web', 'visualizations', 'html', 'css', 'bootstrap', 'd3', 'leafletjs', 'the', 'tableau', 'business', 'intelligence', 'software', 'performing', 'big', 'data', 'analytics', 'hadoop', 'machine', 'learning', 'algorithms', 'skills', 'microsoft', 'excel', 'python', 'javascript', 'htmlcss', 'api', 'interactions', 'social', 'media', 'mining', 'sql', 'hadoop', 'tableau', 'advanced', 'statistics', 'machine', 'learning', 'r', 'gitgithub', 'interests', 'contributing', 'opensource', 'software', 'data', 'analytics', 'python', 'pandas', 'designing', 'data', 

In [133]:
# Recount the cleaned tokens, starting every distinct token at zero
word_count = {}.fromkeys(word_list, 0)

# Loop through the word list and count each word.
for word in word_list:
    word_count[word] += 1

# Rank the tokens by descending count and keep the ranking in sorted_words
# (previously initialised to an empty list and never filled). Python's sort
# is stable, so tokens with equal counts stay in the order in which they
# were inserted into word_count.
sorted_words = sorted(word_count, key=word_count.get, reverse=True)

# Print the ten most frequent tokens
for word in sorted_words[:10]:
    print(f"Token: {word:20} Count: {word_count[word]}")

Token: data                 Count: 7
Token: python               Count: 4
Token: analytics            Count: 3
Token: visualization        Count: 2
Token: scripts              Count: 2
Token: excel                Count: 2
Token: statistics           Count: 2
Token: social               Count: 2
Token: media                Count: 2
Token: mining               Count: 2
