## Resume Word Analysis
This notebook analyzes the frequency of words in a resume and visualizes the top terms before and after removing stop words.

In [None]:
import re
from collections import Counter
from pathlib import Path

# Output location for the generated SVG charts. Created up front (including
# any missing parents) so later cells can write into it without checking.
figures_dir = Path('Q2/figures')
figures_dir.mkdir(exist_ok=True, parents=True)

# Input text files, given relative to the notebook's working directory.
stop_words_path = Path('Q2/data/stop_words.txt')
resume_path = Path('Q2/data/suprawee_resume.txt')

def save_svg_bar(data, filename, title):
    """Write a vertical bar chart of ``(label, count)`` pairs to an SVG file.

    The chart is a fixed 800x400 canvas: a centred title at the top, one
    ``steelblue`` bar per pair scaled so the tallest bar is ``height - 50``
    pixels, and a 45-degree rotated label under each bar.

    Args:
        data: Sequence of ``(word, count)`` pairs (for example the result of
            ``Counter.most_common(n)``). May be empty, in which case only the
            title is drawn.
        filename: Destination path (``str`` or ``Path``). The file is
            overwritten if it already exists.
        title: Text drawn as the chart title.

    Returns:
        None. The SVG document is written to ``filename`` as UTF-8.
    """
    # Local import keeps this block self-contained; escape() guards against
    # '&', '<' and '>' in titles/labels producing malformed XML.
    from xml.sax.saxutils import escape

    width = 800
    height = 400
    svg_parts = [f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">']
    svg_parts.append(
        f'<text x="{width/2}" y="20" text-anchor="middle" font-size="16" fill="white">'
        f'{escape(str(title))}</text>'
    )

    # Guard: with no data there is nothing to scale or draw (the unguarded
    # version divided by len(data) == 0).
    if data:
        bar_width = width / len(data)
        max_count = max(count for _, count in data)
        # 50px of vertical space is reserved for the title and the labels.
        # If every count is zero, draw zero-height bars instead of dividing by 0.
        scale = (height - 50) / max_count if max_count else 0
        label_y = height - 5
        for i, (word, count) in enumerate(data):
            x = i * bar_width
            bar_height = count * scale
            # Bars sit 20px above the bottom edge to leave room for labels.
            y = height - bar_height - 20
            label_x = x + bar_width / 2
            svg_parts.append(
                f'<rect x="{x}" y="{y}" width="{bar_width-2}" height="{bar_height}" fill="steelblue"/>'
            )
            svg_parts.append(
                f'<text x="{label_x}" y="{label_y}" text-anchor="middle" font-size="10" '
                f'transform="rotate(45 {label_x},{label_y})" fill="white">{escape(str(word))}</text>'
            )

    svg_parts.append('</svg>')
    # One SVG element per line; the separator must be the escape sequence
    # '\n', not a literal line break inside the quotes.
    Path(filename).write_text('\n'.join(svg_parts), encoding='utf-8')

# Load the resume and split its lower-cased text into word tokens
# (runs of word characters delimited by word boundaries).
text = resume_path.read_text(encoding='utf-8')
words = [match.group() for match in re.finditer(r'\b\w+\b', text.lower())]

# Tally every token, stop words included, and keep the 20 most frequent.
word_counts = Counter()
word_counts.update(words)
top_resume = word_counts.most_common(20)

# Persist the chart, then show the first five entries as the cell output.
save_svg_bar(
    top_resume,
    figures_dir / 'resume_words.svg',
    'Top 20 Resume Words',
)
top_resume[:5]


In [None]:
# Read the whitespace-separated stop-word list into a set for fast lookups.
stop_words_text = stop_words_path.read_text(encoding='utf-8')
stop_words = set(stop_words_text.split())

# Keep only the resume tokens that are not stop words, preserving order.
specific_words = [token for token in words if token not in stop_words]

# Tally the remaining tokens and keep the 20 most frequent.
specific_counts = Counter()
specific_counts.update(specific_words)
top_specific = specific_counts.most_common(20)

# Persist the chart, then show the first five entries as the cell output.
save_svg_bar(
    top_specific,
    figures_dir / 'specific_words.svg',
    'Top 20 Specific Words',
)
top_specific[:5]


Removing common stop words surfaces keywords that highlight technical skills and experience, such as **developed**, **python**, and **project**, and gives a clearer picture of the resume's content than the raw word counts do.