In [1]:
import os
import json
import re
import pandas as pd
import seaborn as sns

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium
import matplotlib.pyplot as plt
import sys
# Add ./people to the module search path. The path is relative, so it is resolved
# against the kernel's current working directory.
sys.path.append('./people')

import importlib
# The parser is imported under two spellings. The second statement rebinds the name
# `resume_parser` that the first one bound, so everything below uses the top-level
# `resume_parser` module (presumably found through the ./people entry added above —
# TODO confirm that both spellings point at the same file).
from people import resume_parser
import resume_parser
# Re-execute the module so that edits made to it on disk are picked up without
# restarting the kernel.
importlib.reload(resume_parser)
# Imported after the reload so the name refers to the freshly reloaded function.
from resume_parser import parse_resume_text
plt.rcParams['figure.facecolor'] = 'white'  # Background of the whole figure
plt.rcParams['axes.facecolor'] = 'white'    # Background of the plot area


In [2]:
# Read the raw records from disk (UTF-8 JSON) and turn them into a DataFrame.
with open("gov_individuals.json", mode="r", encoding="utf-8") as fh:
    data = json.load(fh)

df = pd.DataFrame(data)

# Quick sanity check: dimensions first, then the first few rows.
print(df.shape)
print(df.head())

In [3]:

# Extract cabinet year range into a new column
import re
# Two four-digit years separated by a dash. Besides the ASCII hyphen the separator
# may be any of U+2010..U+2015 (hyphen, non-breaking hyphen, figure dash, en dash,
# em dash, horizontal bar) or U+2212 (minus sign), which are common in typeset text.
_YEAR_RANGE_RE = re.compile(r'(\d{4})\s*[-\u2010-\u2015\u2212]\s*(\d{4})')


def extract_year_range(text):
    """Return the ``(start_year, end_year)`` pair found in ``text``.

    Parameters
    ----------
    text : str
        Free text expected to contain a range such as ``"2016-2021"`` or
        ``"2016 – 2021"``. Only the first range in the string is used.

    Returns
    -------
    tuple
        ``(int, int)`` when a range is found, otherwise ``(None, None)``.
        Non-string input (``None``, ``NaN``, numbers, ...) also yields
        ``(None, None)`` instead of raising.
    """
    if not isinstance(text, str):
        return None, None
    match = _YEAR_RANGE_RE.search(text)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

def _cabinet_years(cabinet_label):
    """Wrap the (start, end) years of one cabinet label in a two-element Series.

    Missing labels produce a Series of two ``None`` values so that every row
    expands to exactly two columns.
    """
    if pd.notnull(cabinet_label):
        return pd.Series(extract_year_range(cabinet_label))
    return pd.Series([None, None])


# Expand each cabinet label into its start-year / end-year columns.
df[["cabinet_start_year", "cabinet_end_year"]] = df["cabinet"].apply(_cabinet_years)

# Tally the members of each cabinet, ordered chronologically by start year.
cabinet_counts = df["cabinet_start_year"].value_counts().sort_index()

# Bar chart of the tally; years are cast to int so the tick labels carry no decimals.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=cabinet_counts.index.astype(int),
    y=cabinet_counts.values,
    color="skyblue",
    ax=ax,
)
ax.set_title("Number of Government Members by Cabinet Start Year")
ax.set_xlabel("Cabinet Start Year")
ax.set_ylabel("Number of Members")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [4]:
# Hometown statistics over the complete dataset
def _hometown_of(resume):
    """Return the parser's "hometown" entry for one resume, or None when the resume is missing."""
    if pd.notnull(resume):
        return parse_resume_text(resume).get("hometown")
    return None


hometown_series = df["resume_text"].apply(_hometown_of)
df["parsed_hometown"] = hometown_series
parsed_hometowns = df["parsed_hometown"]

# 1. How many distinct hometowns were parsed
unique_hometowns = parsed_hometowns.dropna().unique()
print("Number of unique hometowns:", len(unique_hometowns))

# 2. How often each hometown occurs
hometown_counts = parsed_hometowns.value_counts()
print("\nTop 10 hometowns by frequency:")
print(hometown_counts.head(10))

# 3. How many rows have a parsed hometown at all
total_hometowns = parsed_hometowns.notna().sum()
print("\nTotal parsed hometown entries:", total_hometowns)

sns.set_style("whitegrid")

# Keep only the rows that carry a resume and run the parser on each of them.
df_with_resumes = df[df["resume_text"].notnull()].copy()
parsed_data = df_with_resumes["resume_text"].apply(parse_resume_text)

# Hand json_normalize a plain list so the resulting frame always gets a fresh
# 0..n-1 index. The concat below aligns on index labels against a reset index,
# so a parsed frame that kept the filtered labels would be joined to the wrong rows.
parsed_df = pd.json_normalize(parsed_data.tolist())
df_with_resumes = pd.concat([df_with_resumes.reset_index(drop=True), parsed_df], axis=1)


def _simplify_hometown(raw):
    """Reduce a hometown string to its last comma-separated segment.

    A leading "tỉnh" (any letter case) is removed from that segment. The prefix
    is stripped whether or not the string contains a comma, so "Tỉnh X" and
    "..., tỉnh X" both end up as "X". Non-string values (e.g. NaN) are
    returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    last_segment = raw.split(",")[-1].strip()
    return re.sub(r"(?i)^tỉnh\s+", "", last_segment)


# Simplify 'hometown'
df_with_resumes["hometown_simplified"] = df_with_resumes["hometown"].apply(_simplify_hometown)

# Most frequent simplified hometowns. The cut-off lives in one place and the chart
# title is derived from the number of bars actually drawn, so the label can no
# longer disagree with the data (the title used to say "Top 10" while 20 were shown).
TOP_N_HOMETOWNS = 20
top_simplified_hometowns = (
    df_with_resumes["hometown_simplified"].value_counts().nlargest(TOP_N_HOMETOWNS)
)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=top_simplified_hometowns.values, y=top_simplified_hometowns.index, palette="crest")
plt.title(f"Top {len(top_simplified_hometowns)} Simplified Hometowns (Last Segment)")
plt.xlabel("Number of Individuals")
plt.ylabel("Simplified Hometown")
plt.tight_layout()
plt.show()

In [5]:
# Show the raw resume text of one named individual (Nguyễn Thiện Nhân)
target_name = "Nguyễn Thiện Nhân"

# Select every row whose name matches exactly
name_matches = df_with_resumes["name"] == target_name
target_row = df_with_resumes[name_matches]

# Print the first match, or a notice when nobody matched
if target_row.empty:
    print("Person not found.")
else:
    print(target_row.iloc[0]["resume_text"])

In [6]:
# Parse every resume again and flatten the result into a frame.
parsed_data = df_with_resumes["resume_text"].apply(parse_resume_text)
parsed_df = pd.json_normalize(parsed_data)

# Keep only the first occurrence of each column name in both frames.
unique_parsed_cols = ~parsed_df.columns.duplicated()
parsed_df = parsed_df.loc[:, unique_parsed_cols]

df_with_resumes = df_with_resumes.reset_index(drop=True)
unique_resume_cols = ~df_with_resumes.columns.duplicated()
df_with_resumes = df_with_resumes.loc[:, unique_resume_cols]

# NOTE: parsed_df is not concatenated onto df_with_resumes here; that merge
# (followed by another column de-duplication) is left disabled in this cell.

# Rows that have an education value
has_education = df_with_resumes["education"].notnull()
df_edu = df_with_resumes[has_education]

# Show the education field untruncated for up to 100 individuals
pd.set_option("display.max_colwidth", None)
print(df_edu[["name", "education"]].head(100))



In [7]:
# How many individuals fall into each education level, most common first
education_counts = df_with_resumes["education"].value_counts().sort_values(ascending=False)

# Horizontal bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=education_counts.values,
    y=education_counts.index,
    palette="viridis",
    ax=ax,
)
ax.set_title("Distribution of Education Levels")
ax.set_xlabel("Number of Individuals")
ax.set_ylabel("Education Level")
plt.tight_layout()
plt.show()



In [8]:
# Keep individuals that have both a cabinet start year and an education value
edu_by_cabinet = df_with_resumes[
    df_with_resumes["cabinet_start_year"].notnull() &
    df_with_resumes["education"].notnull()
]

# Count individuals per (cabinet year, education level); one column per level
edu_cabinet_counts = (
    edu_by_cabinet.groupby(["cabinet_start_year", "education"])
    .size()
    .unstack(fill_value=0)
)

# Sort by year
edu_cabinet_counts = edu_cabinet_counts.sort_index()

# Missing years were filtered out above, so the index can be cast to int. This keeps
# the tick labels free of decimals, matching the cabinet-start-year chart earlier on.
edu_cabinet_counts.index = edu_cabinet_counts.index.astype(int)

# Plot. DataFrame.plot opens its own figure when no `ax` is given, so no separate
# plt.figure() call is made here; doing both left an empty extra figure in the output.
ax = edu_cabinet_counts.plot(kind="bar", stacked=True, colormap="tab20", figsize=(14, 7))
ax.set_title("Distribution of Education Levels by Cabinet Start Year")
ax.set_xlabel("Cabinet Start Year")
ax.set_ylabel("Number of Individuals")
plt.xticks(rotation=45)
ax.legend(title="Education Level")
plt.tight_layout()
plt.show()

In [10]:
!pip install plotly
import plotly.express as px
# Rows that have both a simplified hometown and an education value
has_town_and_edu = (
    df_with_resumes["hometown_simplified"].notnull()
    & df_with_resumes["education"].notnull()
)
edu_by_hometown = df_with_resumes[has_town_and_edu]

# The 15 hometowns with the most individuals
town_sizes = edu_by_hometown["hometown_simplified"].value_counts()
top_hometowns = town_sizes.nlargest(15).index

in_top_hometowns = edu_by_hometown["hometown_simplified"].isin(top_hometowns)
edu_filtered = edu_by_hometown[in_top_hometowns]

# Long-format counts: one row per (hometown, education level)
grouped_sizes = edu_filtered.groupby(["hometown_simplified", "education"]).size()
edu_counts_long = grouped_sizes.reset_index(name="count")

# Interactive horizontal bar chart, one coloured segment per education level
bar_options = dict(
    x="count",
    y="hometown_simplified",
    color="education",
    orientation="h",
    title="Interactive Chart: Education Level by Hometown",
    labels={"count": "Number of Individuals", "hometown_simplified": "Hometown"},
    hover_data={"count": True, "education": True},
)
fig = px.bar(edu_counts_long, **bar_options)

# Stack the segments and order the hometowns by their total
fig.update_layout(barmode="stack", yaxis={"categoryorder": "total ascending"})
fig.show()

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import pandas as pd
import folium

import geopy.geocoders
import geopy.adapters
import urllib3


# Prepare hometown counts: one row per simplified hometown with its frequency
hometown_counts = df_with_resumes["hometown_simplified"].value_counts().reset_index()
hometown_counts.columns = ["hometown", "count"]

# Set up a single geocoder for the whole cell; calls are spaced at least one second apart
geolocator = Nominatim(user_agent="vnw-hometown-mapper", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def safe_geocode(place):
    """Geocode ``"<place>, Vietnam"`` and return the result, or None on any error.

    Kept for ad-hoc lookups. The bulk pass below goes through ``get_coords`` so
    that every hometown is requested at most once and cached results are reused.
    """
    try:
        return geocode(f"{place}, Vietnam")
    except Exception as e:
        print(f"Error geocoding {place}: {e}")
        return None


# --- Coordinate cache on disk -------------------------------------------------
cache_file = "geocode_cache.csv"
CACHE_COLUMNS = ["hometown", "latitude", "longitude"]

# A zero-byte cache file cannot be parsed by read_csv, so treat it like a missing one.
if os.path.exists(cache_file) and os.path.getsize(cache_file) > 0:
    cache_df = pd.read_csv(cache_file)
else:
    cache_df = pd.DataFrame(columns=CACHE_COLUMNS)

# Only complete rows are trusted; places with a missing coordinate are looked up again.
cached_coords = {
    row["hometown"]: (row["latitude"], row["longitude"])
    for _, row in cache_df.dropna(subset=CACHE_COLUMNS).iterrows()
}


def get_coords(place):
    """Return ``(latitude, longitude)`` for ``place``, consulting the cache first.

    A successful lookup is stored in ``cached_coords``. When the geocoder finds
    nothing or raises, ``(None, None)`` is returned and nothing is cached.
    """
    if place in cached_coords:
        return cached_coords[place]
    try:
        location = geocode(f"{place}, Vietnam")
        if location:
            coords = (location.latitude, location.longitude)
            cached_coords[place] = coords
            return coords
    except Exception as e:
        print(f"Geocoding error for {place}: {e}")
    return (None, None)


# Resolve every hometown exactly once. The earlier uncached pass over the same
# places (which also filled a "location" column) is gone: its coordinates were
# overwritten by this pass anyway, and it doubled the number of requests.
# Building the columns from a list also works when there are no hometowns at all,
# where unpacking zip(*...) would raise.
resolved_coords = [get_coords(place) for place in hometown_counts["hometown"]]
hometown_counts["latitude"] = [pair[0] for pair in resolved_coords]
hometown_counts["longitude"] = [pair[1] for pair in resolved_coords]


# Update and save cache to CSV. The explicit column list guarantees a header row
# even when nothing has been resolved yet, so the next run can read the file back.
cache_df = pd.DataFrame(
    [
        {"hometown": place, "latitude": lat, "longitude": lon}
        for place, (lat, lon) in cached_coords.items()
        if pd.notnull(lat) and pd.notnull(lon)
    ],
    columns=CACHE_COLUMNS,
)
cache_df.to_csv(cache_file, index=False)


# Keep only the hometowns that have both coordinates
hometown_geo = hometown_counts.dropna(subset=["latitude", "longitude"])

# Base map centred on Vietnam
vietnam_map = folium.Map(location=[16.5, 107.5], zoom_start=6.5, max_bounds=True)

# Ask the map to fit the bounding box [[south, west], [north, east]].
# NOTE(review): fit_bounds appears to only frame the view; confirm whether
# panning is actually meant to be limited to this box.
vietnam_bbox = [[8.2, 102.1], [23.4, 109.5]]
vietnam_map.fit_bounds(vietnam_bbox)

# One circle per hometown; the radius is the number of individuals from there
for _, town in hometown_geo.iterrows():
    marker = folium.CircleMarker(
        location=[town["latitude"], town["longitude"]],
        radius=town["count"],
        popup=f'{town["hometown"]}: {town["count"]} people',
        color="blue",
        fill=True,
        fill_opacity=0.6,
    )
    marker.add_to(vietnam_map)

# Write the finished map to an HTML file
vietnam_map.save("vietnam_hometown_map_vn.html")