In [27]:
import os
import re
import hashlib
import pandas as pd
from html import unescape
# from sqlalchemy import create_engine

raw_path = r"C:\Users\JKK4V3PX\healthcare_job_analyze\healthcare_job\data\emed_careers_eu.csv"
df = pd.read_csv(raw_path)

In [28]:
print("Rows, Columns:", df.shape)
display(df.head(5))
print("--- Info ---")
print(df.info())
print("\n--- Missing per column ---")
print(df.isnull().sum().sort_values(ascending=False))

Rows, Columns: (39774, 8)


Unnamed: 0,category,company_name,job_description,job_title,job_type,location,post_date,salary_offered
0,Clinical Research,PPD GLOBAL LTD,"As part of our on-going growth, we are current...",Senior / Medical Writer (Regulatory),Permanent,Cambridge,4/14/2018,Competitive
1,Science,AL Solutions,Manager of Biometrics – Italy\nAL Solutions ar...,Manager of Biometrics,Permanent,Europe,4/16/2018,
2,Science,Seltek Consultants Ltd,A fantastic opportunity has arisen for an expe...,Field Service Engineer | Chromatography,Permanent,UK,4/16/2018,
3,Data Management and Statistics,Docs International UK Limited,Job Details\n:\nUtilise extensive clinical dat...,Data Manager of Project Management,Permanent,M4 Corridor,4/11/2018,On Application
4,Science,Hyper Recruitment Solutions Ltd,Hyper Recruitment Solutions are currently look...,Strategic Market Analyst,Permanent,Cambridge,4/13/2018,


--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   category         30000 non-null  object
 1   company_name     30000 non-null  object
 2   job_description  30000 non-null  object
 3   job_title        30000 non-null  object
 4   job_type         30000 non-null  object
 5   location         30000 non-null  object
 6   post_date        30000 non-null  object
 7   salary_offered   22685 non-null  object
dtypes: object(8)
memory usage: 2.4+ MB
None

--- Missing per column ---
salary_offered     17089
category            9774
job_description     9774
company_name        9774
job_title           9774
job_type            9774
location            9774
post_date           9774
dtype: int64


In [29]:
# Cell 3: 定义若干辅助函数，用于后续清理数据

# define keywords for searching
SKILL_KEYWORDS = [
    "python","sql","sas","r","cdisc","adverse","pharmacovigilance","spss",
    "machine learning","data analysis","biostatistics","clinical","gcp","cdash"
]

# 清理字符串的前后空格,并把空/NaN标准化为 None
def safe_strip(x):
    if pd.isna(x):
        return None
    s = str(x)
    s = s.strip()
    return s if s != "" else None


# 日期转换
def parse_date(s):
    try:
        return pd.to_datetime(s, errors='coerce', dayfirst=True)
    except:
        return pd.NaT


# 把location拆分成地区和国家
def parse_location(loc):
    # 依据逗号或竖线或短横拆分：最后一段通常为country
    if pd.isna(loc):
        return (None, None)
    s = str(loc).strip()
    parts = [p.strip() for p in re.split(r',|\||-', s) if p.strip()]
    if len(parts) == 0:
        return (None, None)
    if len(parts) == 1:
        # 只有一个字段：可能是 "Europe" 或 "UK" 或 "Cambridge"
        # 简单规则：若单词长度<=3并全大写，可能是国家缩写 -> 视为 country 否则 city
        token = parts[0]
        if token.isupper() and len(token) <= 3:
            return (None, token)
        # 若是常见 continent/country词
        if token.lower() in ["europe","uk","usa","us","UK","germany","france"]:
            return (None, token)
        return (token, None)
    # 多段：最后一段当作 country，其余合并为 city
    city = ", ".join(parts[:-1])
    country = parts[-1]
    return (city if city else None, country if country else None)

#拆分薪资
def parse_salary(text):
    # 返回 (min, max, currency, parsed_flag)
    if pd.isna(text):
        return (None, None, None, False)
    s = str(text).strip()
    s_lower = s.lower()
    # 常见非结构化文本，判断特殊词
    if any(term in s_lower for term in ["competitive","on application","negotiable","not disclosed","tbd"]):
        return (None, None, None, False)
    # 找货币符号或货币文本
    cur = None
    cur_search = re.search(r'€|£|\$|usd|eur|gbp', s, flags=re.IGNORECASE)
    if cur_search:
        cur = cur_search.group(0)
    # 提取所有数字串（忽略小数点分隔）
    nums = re.findall(r'\d{3,}', s.replace(',', ''))
    if len(nums) == 0:
        # 尝试找较短数字如 50k, 45k
        k_match = re.findall(r'(\d{2,3})k', s_lower)
        if k_match:
            val = int(k_match[0]) * 1000
            return (val, val, cur, True)
        return (None, None, cur, False)
    if len(nums) == 1:
        v = int(nums[0])
        return (v, v, cur, True)
    # 2个以上，取前两个为区间
    vmin = int(nums[0])
    vmax = int(nums[1])
    if vmin > vmax:
        vmin, vmax = vmax, vmin
    return (vmin, vmax, cur, True)


# 去除 HTML 标签、转义符，归一化空白，返回清洁的描述字符串。避免描述里有换行或 HTML 导致展示/统计问题。
def clean_description(text):
    if pd.isna(text):
        return ""
    t = unescape(str(text))
    # 去掉HTML标签
    t = re.sub(r'<[^>]+>', ' ', t)
    # 转换连续空白为单空格
    t = re.sub(r'\s+', ' ', t).strip()
    return t


#把描述转小写，检查关键词列表，生成一个简单的技能标签，用于后续筛选或可视化
def extract_skill_flags(text):
    t = (text or "").lower()
    found = [kw for kw in SKILL_KEYWORDS if kw in t]
    return ",".join(found) if found else None


# 生成job id
def make_job_id(row):
    # 用 company+title+date 做hash，确保稳定
    key = f"{row.get('company_name','') or ''}|{row.get('job_title','') or ''}|{row.get('post_date_parsed_str','') or ''}"
    return hashlib.md5(key.encode('utf-8')).hexdigest()


In [30]:
# Cell 4: 实际清洗

# 1. 复制一份操作，保留原始 df
df_clean = df.copy()

# 2. 统一缺失值格式并消除无意义的空格
for c in ['category','company_name','job_description','job_title','job_type','location','post_date','salary_offered']:
    if c in df_clean.columns:
        df_clean[c] = df_clean[c].apply(safe_strip)

# 3. 去重
before = df_clean.shape[0]
df_clean = df_clean.drop_duplicates()
after = df_clean.shape[0]
print(f"Removed exact duplicates: {before - after}")

# 4. 解析日期
if 'post_date' in df_clean.columns:
    df_clean['post_date_parsed'] = df_clean['post_date'].apply(parse_date)
    # 也保留一个字符串格式（便于做id）
    df_clean['post_date_parsed_str'] = df_clean['post_date_parsed'].dt.strftime('%Y-%m-%d')

# 5. 解析location
if 'location' in df_clean.columns:
    loc_parsed = df_clean['location'].apply(parse_location)
    df_clean['city'] = loc_parsed.apply(lambda x: x[0])
    df_clean['country'] = loc_parsed.apply(lambda x: x[1])

# 6. 标准化job_type
def norm_job_type(s):
    if not s: return "Unknown"
    s = s.lower()
    if 'perm' in s: return 'Permanent'
    if 'contract' in s: return 'Contract'
    if 'temp' in s or 'temporary' in s: return 'Temporary'
    if 'intern' in s or 'graduate' in s: return 'Internship'
    return 'Other'
if 'job_type' in df_clean.columns:
    df_clean['job_type_std'] = df_clean['job_type'].apply(norm_job_type)

# 7. 清理job_description
if 'job_description' in df_clean.columns:
    df_clean['job_description_clean'] = df_clean['job_description'].apply(clean_description)
    df_clean['desc_word_count'] = df_clean['job_description_clean'].apply(lambda t: len(t.split()) if t else 0)
    df_clean['skill_flags'] = df_clean['job_description_clean'].apply(extract_skill_flags)

# 8. 解析salary
if 'salary_offered' in df_clean.columns:
    sal_parsed = df_clean['salary_offered'].apply(parse_salary)
    df_clean['salary_min'] = sal_parsed.apply(lambda x: x[0])
    df_clean['salary_max'] = sal_parsed.apply(lambda x: x[1])
    df_clean['salary_currency'] = sal_parsed.apply(lambda x: x[2])
    df_clean['salary_parsed'] = sal_parsed.apply(lambda x: x[3])

# 9. 生成 job_id 并根据 job_id 去重
df_clean['job_id'] = df_clean.apply(make_job_id, axis=1)
before = df_clean.shape[0]
df_clean = df_clean.drop_duplicates(subset=['job_id'])
after = df_clean.shape[0]
print(f"Removed duplicates by job_id: {before - after}")


Removed exact duplicates: 23022


  return pd.to_datetime(s, errors='coerce', dayfirst=True)


Removed duplicates by job_id: 13108


In [31]:
df_clean = df_clean[['job_id',
                     'post_date_parsed',
                     'category',
                     'job_type_std', 
                     'job_title',                                         
                     'job_description_clean',
                     'desc_word_count',                    
                     'skill_flags',
                   
                     'company_name',
                     'city',
                     'country',

                     'salary_offered',
                     'salary_min',
                     'salary_max',
                     'salary_currency',
                     'salary_parsed'
]]


In [32]:
country_labels = ['France', 'Germany', 'Italy', 'Spain', 'Switzerland', 'UK']
df_clean['category'] = df_clean['category'].replace({'science': 'Science'})
df_clean.loc[df_clean['category'].isin(country_labels), 'category'] = None

city_to_country = {
    "Basel": "Switzerland",
    "Birmingham": "UK",
    "Cambridge": "UK",
    "cambridge": "UK",
    "Italy": "Italy",
    "italy": "Italy",
    "London": "UK",
    "M4 Corridor": "UK",
    "Manchester": "UK",
    "North West": "UK",
    "Oxford": "UK",
    "Paris": "France",
    "Scotland": "UK",
    "South East": "UK",
    "Spain": "Spain",
    "spain": "Spain",
    "Switzerland": "Switzerland",
    "switzerland": "Switzerland",
}
df_clean['country'] = df_clean['country'].fillna(df_clean['city'].map(city_to_country))

country_standard_map = {
    "France": "France",
    "france": "France",
    "Germany": "Germany",
    "germany": "Germany",
    "UK": "UK",
    "uk": "UK",
    "Europe": "Europe",  
}
df_clean['country'] = df_clean['country'].map(country_standard_map).fillna(df_clean['country'])

currency_map = {
    "$": "USD",
    "EUR": "EUR",
    "Eur": "EUR",
    "€": "EUR",
    "£": "GBP"
}
df_clean['salary_currency'] = df_clean['salary_currency'].map(currency_map)



In [33]:
print("-----isnull------")
print(df_clean.isnull().sum()) 

-----isnull------
job_id                      0
post_date_parsed            1
category                   56
job_type_std                0
job_title                   1
job_description_clean       0
desc_word_count             0
skill_flags                 1
company_name                1
city                     2055
country                     1
salary_offered            859
salary_min               3089
salary_max               3089
salary_currency          3349
salary_parsed               0
dtype: int64


In [34]:
print("-----unique------")
print(df_clean.nunique() )

-----unique------
job_id                   3644
post_date_parsed           15
category                   11
job_type_std                5
job_title                1895
job_description_clean    2910
desc_word_count           604
skill_flags                66
company_name              164
city                       18
country                     7
salary_offered            669
salary_min                 72
salary_max                 80
salary_currency             3
salary_parsed               2
dtype: int64


In [35]:
display(df_clean.head(5))
df_clean.info()

Unnamed: 0,job_id,post_date_parsed,category,job_type_std,job_title,job_description_clean,desc_word_count,skill_flags,company_name,city,country,salary_offered,salary_min,salary_max,salary_currency,salary_parsed
0,15cc819a5d9a4b298bcc5fd839bd65a3,2018-04-14,Clinical Research,Permanent,Senior / Medical Writer (Regulatory),"As part of our on-going growth, we are current...",359,"r,biostatistics,clinical",PPD GLOBAL LTD,Cambridge,UK,Competitive,,,,False
1,31c1a5eac509952eeb8cfdccbeb2fcc4,2018-04-16,Science,Permanent,Manager of Biometrics,Manager of Biometrics – Italy AL Solutions are...,291,"sas,r,clinical",AL Solutions,,Europe,,,,,False
2,7b6abf608611f7c6b97be78b5914d26c,2018-04-16,Science,Permanent,Field Service Engineer | Chromatography,A fantastic opportunity has arisen for an expe...,374,r,Seltek Consultants Ltd,,UK,,,,,False
3,63542ffb0393034e35a3f9753b274b73,2018-11-04,Data Management and Statistics,Permanent,Data Manager of Project Management,Job Details : Utilise extensive clinical data ...,279,"r,pharmacovigilance,clinical",Docs International UK Limited,M4 Corridor,UK,On Application,,,,False
4,7c9de23d13d4dd71f917385947fe5b81,2018-04-13,Science,Permanent,Strategic Market Analyst,Hyper Recruitment Solutions are currently look...,210,r,Hyper Recruitment Solutions Ltd,Cambridge,UK,,,,,False


<class 'pandas.core.frame.DataFrame'>
Index: 3644 entries, 0 to 30000
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   job_id                 3644 non-null   object        
 1   post_date_parsed       3643 non-null   datetime64[ns]
 2   category               3588 non-null   object        
 3   job_type_std           3644 non-null   object        
 4   job_title              3643 non-null   object        
 5   job_description_clean  3644 non-null   object        
 6   desc_word_count        3644 non-null   int64         
 7   skill_flags            3643 non-null   object        
 8   company_name           3643 non-null   object        
 9   city                   1589 non-null   object        
 10  country                3643 non-null   object        
 11  salary_offered         2785 non-null   object        
 12  salary_min             555 non-null    float64       
 13  salary_

In [36]:
clean_path = "cleaned_healthcare_jobs.csv"
df_clean.to_csv(clean_path, index=False)
print(f"Cleaned data saved to {clean_path}")

Cleaned data saved to cleaned_healthcare_jobs.csv
