Skip to content

Commit

Permalink
bugfix: crash at big files; fix filter function
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwillow committed Dec 5, 2023
1 parent b0ed6d1 commit 454b781
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 24 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ v0.62 现在支持拖拽文件/文件夹到程序图标上了。

v0.7 支持命令行。使用 $ ./SmartCharsetConverter --help 查看命令行参数。

v0.71 修复命令行用不了的bug。

v0.72 解决添加大文件会卡死的问题(只探测文件前100KB)。
后缀过滤模式现在支持更多的写法了(后缀可以写成 `*.xxx`、`.xxx` 或 `xxx`,分隔符支持空格、逗号 `,`、竖线 `|`)。修复后缀过滤模式的其他问题。

# TODO

* 转换前再次检查一次字符集,以免出现加载后用户更改了字符集后转换出错的情况。
Expand Down
4 changes: 4 additions & 0 deletions src/Common/tstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,9 @@ std::wstring to_hex(std::wstring s);

std::tistream &safeGetline(std::tistream &is, std::tstring &t);

/**
* Splits a string into pieces.
* `dep` holds the delimiter characters; several different delimiters may be
* supplied at once, for example "\n\t".
*/
std::vector<std::tstring_view> Split(std::tstring_view s, const std::tstring &dep) noexcept;
void Split_UnitTest();
37 changes: 28 additions & 9 deletions src/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

using namespace std;

constexpr uint64_t tryReadSize = 100Ui64 * KB;

std::unordered_set<CharsetCode> Configuration::normalCharset = {CharsetCode::UTF8, CharsetCode::UTF8BOM,
CharsetCode::GB18030};

Expand Down Expand Up @@ -587,7 +589,7 @@ std::tuple<CharsetCode, std::unique_ptr<UChar[]>, int32_t> Core::GetEncoding(con
return {CharsetCode::EMPTY, unique_ptr<UChar[]>(new UChar[1]{L'\0'}), 0};
}

bufSize = std::min(bufSize, static_cast<int>(100 * KB));
bufSize = std::min(bufSize, static_cast<int>(tryReadSize));

auto [ucsdetResult, ucsdetConfidence] = DetectByUCSDet(buf, bufSize);

Expand Down Expand Up @@ -618,17 +620,22 @@ std::tuple<CharsetCode, std::unique_ptr<UChar[]>, int32_t> Core::GetEncoding(con

Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unordered_set<std::tstring> &filterDotExts) {
// 如果是只包括指定后缀的模式,且文件后缀不符合,则忽略掉,且不提示
if (GetConfig().filterMode == Configuration::FilterMode::ONLY_SOME_EXTANT &&
filterDotExts.find(TEXT(".") + GetExtend(filename)) == filterDotExts.end()) {
return {};
if (GetConfig().filterMode == Configuration::FilterMode::ONLY_SOME_EXTANT) {
auto ext = GetExtend(filename);
auto dotExt = TEXT(".") + tolower(ext);

if (filterDotExts.find(dotExt) == filterDotExts.end()) {
return {};
}
}

// 如果重复了
if (listFileNames.find(filename) != listFileNames.end()) {
throw runtime_error("重复添加");
}

auto [buf, bufSize] = ReadFileToBuffer(filename);
// 读入文件,只读入部分。因为读入大文件会占用太长时间。
auto [buf, bufSize] = ReadFileToBuffer(filename, tryReadSize);

// 识别字符集
auto [charsetCode, content, contentSize] = GetEncoding(buf.get(), bufSize);
Expand All @@ -649,7 +656,7 @@ Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unord
// 成功添加
listFileNames.insert(filename);

return AddItemResult{fileSize, charsetCode, Configuration::LineBreaks::UNKNOWN, L""};
return AddItemResult{false, fileSize, charsetCode, Configuration::LineBreaks::UNKNOWN, L""};
}
}
}
Expand All @@ -658,26 +665,38 @@ Core::AddItemResult Core::AddItem(const std::tstring &filename, const std::unord

auto charsetName = ToViewCharsetName(charsetCode);

// 重新读入整个文件,因为之前只读入了部分,换行符可能判断不彻底
if (bufSize < fileSize) {
std::tie(buf, bufSize) = ReadFileToBuffer(filename);
}
auto [wholeUtfStr, wholeUtfStrSize] = Decode(buf.get(), bufSize, charsetCode);

// 检查换行符
auto lineBreak = GetLineBreaks(wholeUtfStr.get(), wholeUtfStrSize);

// 到达这里不会再抛异常了

// 成功添加
listFileNames.insert(filename);

return AddItemResult{fileSize, charsetCode, lineBreak, reinterpret_cast<wchar_t *>(content.get())};
return AddItemResult{false, fileSize, charsetCode, lineBreak, reinterpret_cast<wchar_t *>(content.get())};
}

void Core::SpecifyItemCharset(int index, const std::tstring &filename, CharsetCode charsetCode) {
assert(listFileNames.find(filename) != listFileNames.end());

auto [buf, bufSize] = ReadFileToBuffer(filename);
// 读入文件,只读入部分。因为读入大文件会占用太长时间。
auto [buf, bufSize] = ReadFileToBuffer(filename, tryReadSize);

auto fileSizeStr = FileSizeToTString(GetFileSize(filename));
auto fileSize = GetFileSize(filename);
auto fileSizeStr = FileSizeToTString(fileSize);

auto charsetName = ToViewCharsetName(charsetCode);

// 重新读入整个文件,因为之前只读入了部分,换行符可能判断不彻底
if (bufSize < fileSize) {
std::tie(buf, bufSize) = ReadFileToBuffer(filename);
}
auto [wholeUtfStr, wholeUtfStrSize] = Decode(buf.get(), bufSize, charsetCode);
auto lineBreak = GetLineBreaks(wholeUtfStr.get(), wholeUtfStrSize);

Expand Down
4 changes: 3 additions & 1 deletion src/Core.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ struct Configuration {

FilterMode filterMode;
OutputTarget outputTarget;
std::tstring includeRule, excludeRule;
std::tstring includeRule = TEXT("h hpp c cpp cxx txt");
std::tstring excludeRule;
std::tstring outputDir;
CharsetCode outputCharset;
bool enableConvertLineBreaks;
Expand Down Expand Up @@ -193,6 +194,7 @@ class Core {
std::tuple<CharsetCode, std::unique_ptr<UChar[]>, int> GetEncoding(const char *buf, int bufSize) const;

struct AddItemResult {
bool isIgnore = true; // 是否应该忽略掉
uint64_t filesize;
CharsetCode srcCharset;
Configuration::LineBreaks srcLineBreak;
Expand Down
40 changes: 26 additions & 14 deletions src/DialogMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#undef min
#undef max

const std::tstring appTitle = TEXT("智能编码集转换器 v0.7 by Tom Willow");
const std::tstring appTitle = TEXT("智能编码集转换器 v0.72 by Tom Willow");

using namespace std;

Expand Down Expand Up @@ -59,7 +59,7 @@ BOOL DialogMain::OnInitDialog(CWindow wndFocus, LPARAM lInitParam) {

// 包含/排除指定后缀
SetFilterMode(core->GetConfig().filterMode);
// GetDlgItem(IDC_EDIT_INCLUDE_TEXT).SetWindowTextW(core->GetConfig().includeRule);
GetDlgItem(IDC_EDIT_INCLUDE_TEXT).SetWindowTextW(core->GetConfig().includeRule.c_str());

// target
SetOutputTarget(core->GetConfig().outputTarget);
Expand Down Expand Up @@ -193,9 +193,14 @@ std::vector<std::tstring> DialogMain::AddItems(const std::vector<std::tstring> &
break;
case Configuration::FilterMode::ONLY_SOME_EXTANT:
// 只包括指定后缀
CheckAndTraversalIncludeRule([&](const tstring &dotExt) {
filterDotExts.insert(dotExt);
});
try {
CheckAndTraversalIncludeRule([&](const tstring &dotExt) {
filterDotExts.insert(dotExt);
});
} catch (const std::runtime_error &err) {
MessageBox(to_tstring(err.what()).c_str(), TEXT("出错"), MB_OK | MB_ICONERROR);
return {};
}
break;
default:
assert(0);
Expand All @@ -207,7 +212,12 @@ std::vector<std::tstring> DialogMain::AddItems(const std::vector<std::tstring> &
auto AddItemNoException = [&](const std::tstring &filename) {
try {
Core::AddItemResult ret = core->AddItem(filename, filterDotExts);
AppendListViewItem(filename, ret.filesize, ret.srcCharset, ret.srcLineBreak, ret.strPiece);
if (ret.isIgnore) {
return;
}
PostUIFunc([filename, ret, this]() {
AppendListViewItem(filename, ret.filesize, ret.srcCharset, ret.srcLineBreak, ret.strPiece);
});
} catch (io_error_ignore) { ignored.push_back(filename); } catch (runtime_error &e) {
failed.push_back({filename, to_tstring(e.what())});
}
Expand Down Expand Up @@ -474,25 +484,27 @@ void DialogMain::CheckAndTraversalIncludeRule(std::function<void(const std::tstr
auto &extsStr = core->GetConfig().includeRule;

// 切分
auto exts = Split(extsStr, TEXT(" "));
auto exts = Split(extsStr, TEXT(" ,|"));

string filterExampleStr = "支持以下格式:\r\n*.h *.hpp *.c *.cpp *.txt\r\nh hpp c cpp "
"txt\r\nh|hpp|c|cpp\r\n(分隔符允许空格、逗号、竖线,后缀允许带*.或者不带)";

// 如果为空
if (exts.empty()) {
throw runtime_error("指定的后缀无效\r\n\r\n例子:*.h *.hpp *.c *.cpp *.txt");
throw runtime_error("没有指定要过滤的后缀\r\n\r\n" + filterExampleStr);
}

// 逐个检查
for (auto ext : exts) {
tstring extStr(ext);
wstring pattern = TEXT(R"(\*(\.\w+))"); // 匹配 *.xxx 的正则
for (auto s : exts) {
tstring extStr(s);
wstring pattern = TEXT(R"((\*\.|\.|)(\w+))"); // 匹配*.xxx/.xxx/xxx的正则
wregex r(pattern);
wsmatch results;
if (regex_match(extStr, results, r) == false) {
throw runtime_error("指定的后缀无效:" + to_string(extStr) +
"\r\n\r\n例子: * .h * .hpp * .c * .cpp * .txt");
throw runtime_error("指定的后缀过滤器无效:" + to_string(extStr) + "\r\n\r\n" + filterExampleStr);
}

fn(results.str(1));
fn(tolower(TEXT(".") + results.str(2)));
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/DialogMain.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ class DialogMain : public CDialogImpl<DialogMain> {
NOTIFY_HANDLER(IDC_LISTVIEW, NM_RCLICK, OnNMRclickListview)
COMMAND_ID_HANDLER(ID_OPEN_WITH_NOTEPAD, OnOpenWithNotepad)
COMMAND_ID_HANDLER(ID_REMOVE_ITEM, OnRemoveItem)

// 指定原编码
COMMAND_RANGE_HANDLER(SPECIFY_ORIGIN_CHARSET_ID_START, SPECIFY_ORIGIN_CHARSET_ID_END, OnSpecifyOriginCharset)

COMMAND_HANDLER(IDC_EDIT_INCLUDE_TEXT, EN_CHANGE, OnEnChangeEditIncludeText)
Expand All @@ -153,6 +155,10 @@ class DialogMain : public CDialogImpl<DialogMain> {
LRESULT OnBnClickedRadioUtf8bom(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL & /*bHandled*/);
LRESULT OnBnClickedRadioGb18030(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL & /*bHandled*/);

/**
* Checks the include filter and traverses its entries.
* @param fn callback that receives each extension taken from the filter
*           (named `dotExt`, i.e. passed in ".ext" form).
* @exception runtime_error the filter string is not valid
*/
void CheckAndTraversalIncludeRule(std::function<void(const std::tstring &dotExt)> fn);

LRESULT OnBnClickedRadioOther(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL & /*bHandled*/);
Expand Down

0 comments on commit 454b781

Please sign in to comment.