From 454b7816359fdd56fd7c927896f5acc66ad3b562 Mon Sep 17 00:00:00 2001 From: tomwillow Date: Tue, 5 Dec 2023 22:20:08 +0800 Subject: [PATCH] bugfix: crash at big files; fix filter function --- README.md | 5 +++++ src/Common/tstring.h | 4 ++++ src/Core.cpp | 37 ++++++++++++++++++++++++++++--------- src/Core.h | 4 +++- src/DialogMain.cpp | 40 ++++++++++++++++++++++++++-------------- src/DialogMain.h | 6 ++++++ 6 files changed, 72 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index d8b2f98..6c47fa5 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,11 @@ v0.62 现在支持拖拽文件/文件夹到程序图标上了。 v0.7 支持命令行。使用 $ ./SmartCharsetConverter --help 查看命令行参数。 +v0.71 修复命令行用不了的bug。 + +v0.72 解决添加大文件会卡死的问题(只探测文件前100KB)。 + 后缀过滤模式现在支持更多的模式了(支持以`*.` `.` `空格` `|`分隔)。修复后缀过滤模式的其他问题。 + # TODO * 转换前再次检查一次字符集,已免出现加载后用户更改了字符集后转换出错的情况。 diff --git a/src/Common/tstring.h b/src/Common/tstring.h index c8a2b27..f51378a 100644 --- a/src/Common/tstring.h +++ b/src/Common/tstring.h @@ -108,5 +108,9 @@ std::wstring to_hex(std::wstring s); std::tistream &safeGetline(std::tistream &is, std::tstring &t); +/** + * 切分字符串。 + * dep填入分隔符,可以支持多种分隔符。例如"\n\t"。 + */ std::vector Split(std::tstring_view s, const std::tstring &dep) noexcept; void Split_UnitTest(); \ No newline at end of file diff --git a/src/Core.cpp b/src/Core.cpp index 202a611..db58bed 100644 --- a/src/Core.cpp +++ b/src/Core.cpp @@ -13,6 +13,8 @@ using namespace std; +constexpr uint64_t tryReadSize = 100Ui64 * KB; + std::unordered_set Configuration::normalCharset = {CharsetCode::UTF8, CharsetCode::UTF8BOM, CharsetCode::GB18030}; @@ -587,7 +589,7 @@ std::tuple, int32_t> Core::GetEncoding(con return {CharsetCode::EMPTY, unique_ptr(new UChar[1]{L'\0'}), 0}; } - bufSize = std::min(bufSize, static_cast(100 * KB)); + bufSize = std::min(bufSize, static_cast(tryReadSize)); auto [ucsdetResult, ucsdetConfidence] = DetectByUCSDet(buf, bufSize); @@ -618,9 +620,13 @@ std::tuple, int32_t> Core::GetEncoding(con Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unordered_set &filterDotExts) { // 如果是只包括指定后缀的模式,且文件后缀不符合,则忽略掉,且不提示 - if (GetConfig().filterMode == Configuration::FilterMode::ONLY_SOME_EXTANT && - filterDotExts.find(TEXT(".") + GetExtend(filename)) == filterDotExts.end()) { - return {}; + if (GetConfig().filterMode == Configuration::FilterMode::ONLY_SOME_EXTANT) { + auto ext = GetExtend(filename); + auto dotExt = TEXT(".") + tolower(ext); + + if (filterDotExts.find(dotExt) == filterDotExts.end()) { + return {}; + } } // 如果重复了 @@ -628,7 +634,8 @@ Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unord throw runtime_error("重复添加"); } - auto [buf, bufSize] = ReadFileToBuffer(filename); + // 读入文件,只读入部分。因为读入大文件会占用太长时间。 + auto [buf, bufSize] = ReadFileToBuffer(filename, tryReadSize); // 识别字符集 auto [charsetCode, content, contentSize] = GetEncoding(buf.get(), bufSize); @@ -649,7 +656,7 @@ Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unord // 成功添加 listFileNames.insert(filename); - return AddItemResult{fileSize, charsetCode, Configuration::LineBreaks::UNKNOWN, L""}; + return AddItemResult{false, fileSize, charsetCode, Configuration::LineBreaks::UNKNOWN, L""}; } } } @@ -658,7 +665,13 @@ Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unord auto charsetName = ToViewCharsetName(charsetCode); + // 重新读入整个文件,因为之前只读入了部分,换行符可能判断不彻底 + if (bufSize < fileSize) { + std::tie(buf, bufSize) = ReadFileToBuffer(filename); + } auto [wholeUtfStr, wholeUtfStrSize] = Decode(buf.get(), bufSize, charsetCode); + + // 检查换行符 auto lineBreak = GetLineBreaks(wholeUtfStr.get(), wholeUtfStrSize); // 到达这里不会再抛异常了 @@ -666,18 +679,24 @@ Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unord // 成功添加 listFileNames.insert(filename); - return AddItemResult{fileSize, charsetCode, lineBreak, reinterpret_cast(content.get())}; + return AddItemResult{false, fileSize, charsetCode, lineBreak, reinterpret_cast(content.get())}; } void Core::SpecifyItemCharset(int index, const std::tstring &filename, CharsetCode charsetCode) { assert(listFileNames.find(filename) != listFileNames.end()); - auto [buf, bufSize] = ReadFileToBuffer(filename); + // 读入文件,只读入部分。因为读入大文件会占用太长时间。 + auto [buf, bufSize] = ReadFileToBuffer(filename, tryReadSize); - auto fileSizeStr = FileSizeToTString(GetFileSize(filename)); + auto fileSize = GetFileSize(filename); + auto fileSizeStr = FileSizeToTString(fileSize); auto charsetName = ToViewCharsetName(charsetCode); + // 重新读入整个文件,因为之前只读入了部分,换行符可能判断不彻底 + if (bufSize < fileSize) { + std::tie(buf, bufSize) = ReadFileToBuffer(filename); + } auto [wholeUtfStr, wholeUtfStrSize] = Decode(buf.get(), bufSize, charsetCode); auto lineBreak = GetLineBreaks(wholeUtfStr.get(), wholeUtfStrSize); diff --git a/src/Core.h b/src/Core.h index 09f135b..9c44405 100644 --- a/src/Core.h +++ b/src/Core.h @@ -127,7 +127,8 @@ struct Configuration { FilterMode filterMode; OutputTarget outputTarget; - std::tstring includeRule, excludeRule; + std::tstring includeRule = TEXT("h hpp c cpp cxx txt"); + std::tstring excludeRule; std::tstring outputDir; CharsetCode outputCharset; bool enableConvertLineBreaks; @@ -193,6 +194,7 @@ class Core { std::tuple, int> GetEncoding(const char *buf, int bufSize) const; struct AddItemResult { + bool isIgnore = true; // 是否应该忽略掉 uint64_t filesize; CharsetCode srcCharset; Configuration::LineBreaks srcLineBreak; diff --git a/src/DialogMain.cpp b/src/DialogMain.cpp index f37f058..52e51b1 100644 --- a/src/DialogMain.cpp +++ b/src/DialogMain.cpp @@ -15,7 +15,7 @@ #undef min #undef max -const std::tstring appTitle = TEXT("智能编码集转换器 v0.7 by Tom Willow"); +const std::tstring appTitle = TEXT("智能编码集转换器 v0.72 by Tom Willow"); using namespace std; @@ -59,7 +59,7 @@ BOOL DialogMain::OnInitDialog(CWindow wndFocus, LPARAM lInitParam) { // 包含/排除指定后缀 SetFilterMode(core->GetConfig().filterMode); - // GetDlgItem(IDC_EDIT_INCLUDE_TEXT).SetWindowTextW(core->GetConfig().includeRule); + GetDlgItem(IDC_EDIT_INCLUDE_TEXT).SetWindowTextW(core->GetConfig().includeRule.c_str()); // target SetOutputTarget(core->GetConfig().outputTarget); @@ -193,9 +193,14 @@ std::vector DialogMain::AddItems(const std::vector & break; case Configuration::FilterMode::ONLY_SOME_EXTANT: // 只包括指定后缀 - CheckAndTraversalIncludeRule([&](const tstring &dotExt) { - filterDotExts.insert(dotExt); - }); + try { + CheckAndTraversalIncludeRule([&](const tstring &dotExt) { + filterDotExts.insert(dotExt); + }); + } catch (const std::runtime_error &err) { + MessageBox(to_tstring(err.what()).c_str(), TEXT("出错"), MB_OK | MB_ICONERROR); + return {}; + } break; default: assert(0); @@ -207,7 +212,12 @@ std::vector DialogMain::AddItems(const std::vector & auto AddItemNoException = [&](const std::tstring &filename) { try { Core::AddItemResult ret = core->AddItem(filename, filterDotExts); - AppendListViewItem(filename, ret.filesize, ret.srcCharset, ret.srcLineBreak, ret.strPiece); + if (ret.isIgnore) { + return; + } + PostUIFunc([filename, ret, this]() { + AppendListViewItem(filename, ret.filesize, ret.srcCharset, ret.srcLineBreak, ret.strPiece); + }); } catch (io_error_ignore) { ignored.push_back(filename); } catch (runtime_error &e) { failed.push_back({filename, to_tstring(e.what())}); } @@ -474,25 +484,27 @@ void DialogMain::CheckAndTraversalIncludeRule(std::functionGetConfig().includeRule; // 切分 - auto exts = Split(extsStr, TEXT(" ")); + auto exts = Split(extsStr, TEXT(" ,|")); + + string filterExampleStr = "支持以下格式:\r\n*.h *.hpp *.c *.cpp *.txt\r\nh hpp c cpp " + "txt\r\nh|hpp|c|cpp\r\n(分隔符允许空格、逗号、竖线,后缀允许带*.或者不带)"; // 如果为空 if (exts.empty()) { - throw runtime_error("指定的后缀无效。\r\n\r\n例子:*.h *.hpp *.c *.cpp *.txt"); + throw runtime_error("没有指定要过滤的后缀。\r\n\r\n" + filterExampleStr); } // 逐个检查 - for (auto ext : exts) { - tstring extStr(ext); - wstring pattern = TEXT(R"(\*(\.\w+))"); // 匹配 *.xxx 的正则 + for (auto s : exts) { + tstring extStr(s); + wstring pattern = TEXT(R"((\*\.|\.|)(\w+))"); // 匹配*.xxx/.xxx/xxx的正则 wregex r(pattern); wsmatch results; if (regex_match(extStr, results, r) == false) { - throw runtime_error("指定的后缀无效:" + to_string(extStr) + - "。\r\n\r\n例子: * .h * .hpp * .c * .cpp * .txt"); + throw runtime_error("指定的后缀过滤器无效:" + to_string(extStr) + "\r\n\r\n" + filterExampleStr); } - fn(results.str(1)); + fn(tolower(TEXT(".") + results.str(2))); } } diff --git a/src/DialogMain.h b/src/DialogMain.h index 39e800a..684b58e 100644 --- a/src/DialogMain.h +++ b/src/DialogMain.h @@ -132,6 +132,8 @@ class DialogMain : public CDialogImpl { NOTIFY_HANDLER(IDC_LISTVIEW, NM_RCLICK, OnNMRclickListview) COMMAND_ID_HANDLER(ID_OPEN_WITH_NOTEPAD, OnOpenWithNotepad) COMMAND_ID_HANDLER(ID_REMOVE_ITEM, OnRemoveItem) + + // 指定原编码 COMMAND_RANGE_HANDLER(SPECIFY_ORIGIN_CHARSET_ID_START, SPECIFY_ORIGIN_CHARSET_ID_END, OnSpecifyOriginCharset) COMMAND_HANDLER(IDC_EDIT_INCLUDE_TEXT, EN_CHANGE, OnEnChangeEditIncludeText) @@ -153,6 +155,10 @@ class DialogMain : public CDialogImpl { LRESULT OnBnClickedRadioUtf8bom(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL & /*bHandled*/); LRESULT OnBnClickedRadioGb18030(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL & /*bHandled*/); + /** + * 检查 include过滤器。 + * @exception runtime_error 过滤器字符串不合法 + */ void CheckAndTraversalIncludeRule(std::function fn); LRESULT OnBnClickedRadioOther(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL & /*bHandled*/);