In [1]:
from pathlib import Path
import os
from tqdm.notebook import tqdm
import json
import json5
import pandas as pd

def load_lang(path:Path) -> dict:
    """
    将旧版lang文件转换为dict
    """
    res = {}
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith("#"): # 跳过空行和注释
                key, value = line.split("=", 1) # 以第一个等号分割键值对
                res[key] = value
    return res


In [None]:
path = Path("./projects")
errorList = []
for version in tqdm(os.listdir(path)):
    out_dir = Path(f'patched/{version}')
    out_dir.mkdir(parents=True, exist_ok=True)

    data = [] # [{key, src, tgt}]

    if version == "1.12.2":
        version = path.joinpath(version)
        for en_file in tqdm(version.glob("**/en_us.lang"), leave=True): # 低版本用json文件做语言文件
            zh_file = en_file.parent.joinpath('zh_cn.lang')
            if not zh_file.exists():
                continue
            try:
                en_lang = load_lang(en_file)
                zh_lang = load_lang(zh_file)
            except Exception as e:
                errorList.append((en_file,zh_file,e))
                continue
            
            for key,src in en_lang.items():
                target = zh_lang.get(key, False)
                if target:
                    data.append({
                        "key":key,
                        "src":src,
                        "target": zh_lang[key]
                    })
    else:
        version = path.joinpath(version)
        for en_file in tqdm(version.glob("**/en_us.json"), leave=True): # 高版本用json文件做语言文件
            zh_file = en_file.parent.joinpath('zh_cn.json')
            if not zh_file.exists():
                continue
            try:
                with open(en_file, 'r') as en:
                    en_lang = json5.load(en)
                with open(zh_file, 'r') as zh:
                    zh_lang = json5.load(zh)
            except Exception as e:
                errorList.append((en_file,zh_file,e))
                continue
            
            for key,src in en_lang.items():
                target = zh_lang.get(key, False)
                if target:
                    data.append({
                        "key":key,
                        "src":src,
                        "target": zh_lang[key]
                    })
    df = pd.DataFrame(data)
    df.to_csv(out_dir.joinpath("data.csv"), index=False, errors='ignore')


  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [3]:
path = Path("patched")

result = pd.DataFrame()
for csv in path.glob('**/data.csv'):
    result = pd.concat((pd.read_csv(csv),result),axis=0)


In [None]:
# orgin = pd.read_csv("/fastone/users/shiny2/jobs/datas/my_data.csv", index_col=0)
# result = pd.concat((result,orgin), axis=0) # 合并其他来源的数据
print("总数据条数",result.shape)
result = result.drop_duplicates() # 去重
print("去重", result.shape)
result = result.dropna(axis=0, how='any') # 删除有空值的行
print("去空值", result.shape)

总数据条数 (761421, 3)
去重 (540230, 3)
去空值 (539955, 3)


In [5]:
test = result.sample(n=10000, replace= False)
train = result.drop(test.index).reset_index(drop=True)
print("测试集",test.shape)
print("训练集", train.shape)

测试集 (10000, 3)
训练集 (509580, 3)


In [7]:
test.to_csv('test.csv', index=False)
train.to_csv('train.csv', index=False)

# 加载原版数据


In [None]:
import requests
from bs4 import BeautifulSoup

with open("wiki.html") as f: #wiki界面分析
    page = f.read()
page = BeautifulSoup(page)

In [38]:
tables = page.find_all("table", class_="data-table")

from tqdm.notebook import tqdm
result = []
for table in tqdm(tables):
    table = table.find("tbody")
    for row in table.find_all("tr"): # 遍历每一行
        datas = row.find_all("td")
        if len(datas) == 3:
            src = datas[1].text
            target = datas[2].text
            result.append({
                "key": "-",
                "src": src.strip(),
                "target": target.strip()
            })
        elif len(datas) == 2:
            src = datas[0].text
            target = datas[1].text
            result.append({
                "key": "-",
                "src": src.strip(),
                "target": target.strip()
            })


  0%|          | 0/49 [00:00<?, ?it/s]

In [40]:
tables = page.find_all("table", class_="sortable")

from tqdm.notebook import tqdm
for table in tqdm(tables):
    table = table.find("tbody")
    for row in table.find_all("tr"): # 遍历每一行
        datas = row.find_all("td")
        if len(datas) < 2:
            continue
        src = datas[0].text
        target = datas[1].text
        result.append({
            "key": "-",
            "src": src.strip(),
            "target": target.strip()
        })

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
result = pd.DataFrame(result)
train = pd.concat([result, pd.read_csv("train.csv")], axis=0)
train.to_csv("train.csv")

In [5]:
print(pd.read_csv("train.csv").shape)
print(pd.read_csv("test.csv").shape)

(512046, 3)
(10000, 3)
