In [4]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import json

# Load the labelled development set into a DataFrame.
data_path = '../raw_data/salary_labelled_development_set.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Show the first five rows to check the column layout.
print(df.head(n=5))

     job_id                                          job_title  \
0  72000415  Financial Account - Call Center Agent - Up to 34k   
1  69481519  Aspiring Call Center Agents - Work from Home -...   
2  55838599  Production Staff Required - Afternoon & Night-...   
3  64369104                                      Payer Analyst   
4  54861511            Solicitor, Restructuring (ID: 2100013K)   

                                      job_ad_details nation_short_desc  \
0  <div><div><div>\n \n Job Opening \n \n <p>\n F...                PH   
1  <div><div>\n <div>\n <p><b>Job Opening</b></p>...                PH   
2  <p>Original Foods Baking Co. is one of New Zea...                NZ   
3  <div> </div><div> </div>The Payer Analyst indi...                PH   
4  <p>The DLA Piper team operates across more tha...               AUS   

  salary_additional_text                   y_true  
0                    NaN  17500-17500-PHP-MONTHLY  
1                    NaN  16000-16000-PHP-MONTHLY  
2 

In [5]:
def extract_text(html_content):
    """Convert an HTML fragment into plain text.

    Args:
        html_content: HTML markup to parse. ``None`` and float NaN (the
            value pandas uses for a missing cell) are accepted and treated
            as "no content".

    Returns:
        str: The document's text nodes, each stripped of surrounding
        whitespace and joined with a single space. An empty string when the
        input is missing.
    """
    # NaN is the only float that is not equal to itself. A missing cell would
    # otherwise be handed to the parser, which expects markup text, and a
    # single empty row would abort the whole column-wise apply.
    is_nan = isinstance(html_content, float) and html_content != html_content
    if html_content is None or is_nan:
        return ''

    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

# Strip the HTML markup from every job ad, showing a progress bar while
# the column is processed.
tqdm.pandas(desc="Extracting Text")
df['job_ad_details_clean'] = (
    df['job_ad_details']
    .progress_apply(extract_text)
)

# Show the first five cleaned texts.
print(df['job_ad_details_clean'].head(n=5))

Extracting Text: 100%|███████████████████████████████████████████████████████████| 2267/2267 [00:01<00:00, 1400.50it/s]

0    Job Opening Financial Account - Call Center Ag...
1    Job Opening Aspiring Call Center Agents - Work...
2    Original Foods Baking Co. is one of New Zealan...
3    The Payer Analyst individual is assigned to th...
4    The DLA Piper team operates across more than 4...
Name: job_ad_details_clean, dtype: object





In [6]:
from googletrans import Translator
# Module-level Translator instance; translate_text() below calls its
# translate() method for every text it is given.
translator = Translator()

def _split_for_translation(text, max_chars):
    """Split ``text`` into pieces of at most ``max_chars`` characters.

    Pieces are cut at whitespace where possible so words stay intact; a
    single word longer than ``max_chars`` is hard-split.
    """
    if len(text) <= max_chars:
        return [text]

    import textwrap  # local import keeps this cell self-contained

    return textwrap.wrap(
        text,
        width=max_chars,
        expand_tabs=False,
        replace_whitespace=False,
        break_on_hyphens=False,  # do not cut hyphenated words such as "full-time"
    )


def translate_text(text, dest='en', max_chars=4500):
    """Translate ``text`` into ``dest``, falling back to the input on failure.

    Args:
        text: Text to translate. Non-string values (``None``, NaN, ...) and
            blank strings are returned unchanged without any request.
        dest: Target language code passed to ``translator.translate``.
        max_chars: Upper bound on the size of a single request. Longer texts
            are split at whitespace, translated piece by piece, and the
            translations re-joined with single spaces.
            NOTE(review): the recurring "the JSON object must be str, bytes
            or bytearray, not NoneType" failures look like oversized
            requests being rejected; confirm the real limit for the
            installed googletrans version.

    Returns:
        The translated text, or ``text`` itself when there is nothing to
        translate or any request fails (best effort: a failure is printed,
        never raised).
    """
    # Nothing to translate: avoid a pointless network round-trip.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        translated_parts = [
            translator.translate(chunk, dest=dest).text
            for chunk in _split_for_translation(text, max_chars)
        ]
        # A text that fits in one request yields exactly one part, so the
        # result is that part unchanged.
        return ' '.join(translated_parts)
    except Exception as e:
        print(f"Error translating text: {e}")
        return text  # on failure, keep the original text

# Caution: translating the whole column can be very slow; try it on a
# small batch first.
tqdm.pandas(desc="Translating Text")
df['job_ad_details_translated'] = (
    df['job_ad_details_clean']
    .progress_apply(lambda ad_text: translate_text(ad_text, dest='en'))
)

# Show the first five translated texts.
print(df['job_ad_details_translated'].head(n=5))

Translating Text:   1%|▍                                                             | 15/2267 [00:05<17:22,  2.16it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   1%|▊                                                             | 30/2267 [00:15<22:53,  1.63it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   3%|█▉                                                            | 70/2267 [00:31<25:22,  1.44it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   5%|███▏                                                         | 119/2267 [01:00<16:08,  2.22it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   6%|███▌                                                         | 134/2267 [01:10<13:06,  2.71it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   6%|███▊                                                         | 143/2267 [01:15<11:39,  3.04it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   7%|████▏                                                        | 157/2267 [01:24<23:40,  1.49it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   7%|████▌                                                        | 168/2267 [01:28<14:52,  2.35it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   8%|████▋                                                        | 172/2267 [01:29<11:32,  3.03it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:   9%|█████▌                                                       | 207/2267 [01:44<09:19,  3.68it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  11%|██████▉                                                      | 258/2267 [02:08<10:45,  3.11it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  14%|████████▎                                                    | 307/2267 [02:29<14:33,  2.24it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  15%|████████▊                                                    | 329/2267 [02:39<13:07,  2.46it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  15%|█████████▎                                                   | 344/2267 [02:45<09:48,  3.27it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  15%|█████████▎                                                   | 347/2267 [02:46<08:38,  3.71it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  16%|█████████▋                                                   | 361/2267 [02:52<10:44,  2.96it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  16%|█████████▊                                                   | 366/2267 [02:55<14:29,  2.19it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  22%|█████████████▍                                               | 500/2267 [03:59<16:36,  1.77it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  23%|██████████████                                               | 521/2267 [04:08<13:26,  2.16it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  24%|██████████████▌                                              | 540/2267 [04:17<15:57,  1.80it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  24%|██████████████▋                                              | 545/2267 [04:18<09:51,  2.91it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  26%|███████████████▋                                             | 584/2267 [04:36<09:50,  2.85it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  33%|████████████████████▎                                        | 753/2267 [06:01<07:32,  3.35it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  33%|████████████████████▎                                        | 754/2267 [06:01<07:16,  3.47it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  34%|████████████████████▊                                        | 774/2267 [06:10<08:38,  2.88it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  34%|████████████████████▉                                        | 779/2267 [06:12<08:39,  2.87it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  35%|█████████████████████▎                                       | 793/2267 [06:19<13:16,  1.85it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  35%|█████████████████████▍                                       | 797/2267 [06:21<08:24,  2.92it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  38%|███████████████████████▏                                     | 863/2267 [06:51<14:18,  1.64it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  39%|███████████████████████▌                                     | 876/2267 [06:56<06:21,  3.65it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  40%|████████████████████████▍                                    | 910/2267 [07:11<08:33,  2.64it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  40%|████████████████████████▌                                    | 913/2267 [07:12<07:38,  2.95it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  41%|████████████████████████▉                                    | 927/2267 [07:18<07:25,  3.01it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  42%|█████████████████████████▋                                   | 955/2267 [07:34<14:22,  1.52it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  43%|██████████████████████████▍                                  | 982/2267 [07:47<08:19,  2.57it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  43%|██████████████████████████▌                                  | 986/2267 [07:48<07:21,  2.90it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  44%|██████████████████████████▌                                  | 988/2267 [07:50<11:54,  1.79it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  46%|███████████████████████████▊                                | 1052/2267 [08:17<08:36,  2.35it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  48%|████████████████████████████▋                               | 1084/2267 [08:36<06:54,  2.86it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  49%|█████████████████████████████▏                              | 1104/2267 [08:48<07:22,  2.63it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  49%|█████████████████████████████▌                              | 1115/2267 [08:52<09:24,  2.04it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  50%|█████████████████████████████▊                              | 1126/2267 [09:00<15:12,  1.25it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  51%|██████████████████████████████▎                             | 1147/2267 [09:10<07:49,  2.38it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  52%|███████████████████████████████▍                            | 1186/2267 [09:26<04:43,  3.82it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  54%|████████████████████████████████▋                           | 1235/2267 [09:53<15:05,  1.14it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  55%|████████████████████████████████▊                           | 1239/2267 [09:56<10:28,  1.64it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  56%|█████████████████████████████████▎                          | 1259/2267 [10:03<05:00,  3.36it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  57%|█████████████████████████████████▉                          | 1281/2267 [10:10<05:51,  2.80it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  57%|█████████████████████████████████▉                          | 1282/2267 [10:10<05:20,  3.08it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  57%|██████████████████████████████████                          | 1285/2267 [10:12<06:20,  2.58it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  57%|██████████████████████████████████▏                         | 1293/2267 [10:15<06:09,  2.64it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  57%|██████████████████████████████████▎                         | 1296/2267 [10:16<05:01,  3.22it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  59%|███████████████████████████████████▎                        | 1336/2267 [10:32<05:14,  2.96it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  59%|███████████████████████████████████▋                        | 1348/2267 [10:36<04:32,  3.38it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  61%|████████████████████████████████████▎                       | 1372/2267 [10:45<05:39,  2.63it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  61%|████████████████████████████████████▋                       | 1386/2267 [10:50<04:11,  3.50it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  62%|█████████████████████████████████████▍                      | 1413/2267 [11:06<05:21,  2.66it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  63%|█████████████████████████████████████▋                      | 1425/2267 [11:11<04:30,  3.12it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  64%|██████████████████████████████████████▍                     | 1452/2267 [11:23<07:48,  1.74it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  65%|██████████████████████████████████████▉                     | 1469/2267 [11:29<04:53,  2.72it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  65%|███████████████████████████████████████                     | 1474/2267 [11:31<03:40,  3.59it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  66%|███████████████████████████████████████▎                    | 1486/2267 [11:36<04:56,  2.63it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  66%|███████████████████████████████████████▋                    | 1499/2267 [11:41<08:18,  1.54it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  67%|████████████████████████████████████████                    | 1514/2267 [11:49<06:03,  2.07it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  70%|█████████████████████████████████████████▊                  | 1582/2267 [12:18<05:37,  2.03it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  71%|██████████████████████████████████████████▍                 | 1604/2267 [12:28<03:45,  2.94it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  71%|██████████████████████████████████████████▌                 | 1610/2267 [12:30<03:58,  2.75it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  72%|███████████████████████████████████████████▏                | 1631/2267 [12:39<03:32,  2.99it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  72%|███████████████████████████████████████████▍                | 1640/2267 [12:43<03:29,  2.99it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  73%|███████████████████████████████████████████▌                | 1644/2267 [12:45<05:06,  2.03it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  73%|███████████████████████████████████████████▋                | 1652/2267 [12:47<03:07,  3.28it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  76%|█████████████████████████████████████████████▍              | 1719/2267 [13:20<05:46,  1.58it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  77%|█████████████████████████████████████████████▉              | 1736/2267 [13:29<06:00,  1.47it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  77%|██████████████████████████████████████████████▎             | 1748/2267 [13:36<04:52,  1.77it/s]

Error translating text: list index out of range


Translating Text:  79%|███████████████████████████████████████████████▌            | 1797/2267 [13:59<02:48,  2.80it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  84%|██████████████████████████████████████████████████          | 1893/2267 [14:48<02:11,  2.85it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  84%|██████████████████████████████████████████████████▏         | 1896/2267 [14:49<02:23,  2.58it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  84%|██████████████████████████████████████████████████▎         | 1900/2267 [14:50<01:58,  3.10it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  84%|██████████████████████████████████████████████████▍         | 1908/2267 [14:52<01:35,  3.77it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  84%|██████████████████████████████████████████████████▋         | 1914/2267 [14:54<01:28,  3.97it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  87%|███████████████████████████████████████████████████▉        | 1963/2267 [15:18<01:52,  2.70it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  87%|████████████████████████████████████████████████████▏       | 1973/2267 [15:22<01:35,  3.07it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  87%|████████████████████████████████████████████████████▎       | 1978/2267 [15:26<04:00,  1.20it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  89%|█████████████████████████████████████████████████████▋      | 2027/2267 [15:50<01:57,  2.04it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  92%|███████████████████████████████████████████████████████▏    | 2084/2267 [16:15<00:50,  3.59it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  92%|███████████████████████████████████████████████████████▎    | 2090/2267 [16:16<00:44,  3.94it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  92%|███████████████████████████████████████████████████████▍    | 2095/2267 [16:18<01:10,  2.43it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  93%|███████████████████████████████████████████████████████▋    | 2103/2267 [16:21<00:46,  3.50it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  96%|█████████████████████████████████████████████████████████▍  | 2168/2267 [16:49<00:29,  3.37it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  96%|█████████████████████████████████████████████████████████▉  | 2187/2267 [16:57<00:36,  2.19it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  97%|██████████████████████████████████████████████████████████▏ | 2199/2267 [17:01<00:21,  3.13it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  98%|██████████████████████████████████████████████████████████▊ | 2222/2267 [17:10<00:31,  1.43it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  98%|███████████████████████████████████████████████████████████ | 2231/2267 [17:14<00:14,  2.52it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  99%|███████████████████████████████████████████████████████████▍| 2246/2267 [17:24<00:11,  1.90it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text:  99%|███████████████████████████████████████████████████████████▌| 2252/2267 [17:25<00:04,  3.54it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text: 100%|███████████████████████████████████████████████████████████▊| 2259/2267 [17:28<00:02,  3.22it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType


Translating Text: 100%|████████████████████████████████████████████████████████████| 2267/2267 [17:30<00:00,  2.16it/s]

Error translating text: the JSON object must be str, bytes or bytearray, not NoneType
0    Job Opening Financial Account - Call Center Ag...
1    Job Opening Aspiring Call Center Agents - Work...
2    Original Foods Baking Co. is one of New Zealan...
3    The Payer Analyst individual is assigned to th...
4    The DLA Piper team operates across more than 4...
Name: job_ad_details_translated, dtype: object





In [7]:
# Keep only the raw, cleaned and translated ad text, one dict per row.
processed_data = df[
    ['job_ad_details', 'job_ad_details_clean', 'job_ad_details_translated']
].to_dict(orient='records')

# Write the records as indented JSON, leaving non-ASCII characters unescaped.
with open('processed_job_ad_details.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(processed_data, ensure_ascii=False, indent=4))

print("Data has been successfully stored in processed_job_ad_details.json")


Data has been successfully stored in processed_job_ad_details.json
