From c5b0c2f421ff92e697f4a6f66878a450fd8ee7fe Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 23 May 2024 23:27:50 +0200 Subject: [PATCH] Replace strcpy and strncpy by new inline helper function Signed-off-by: Stefan Weil --- src/api/altorenderer.cpp | 6 ++---- src/api/baseapi.cpp | 15 ++++----------- src/api/hocrrenderer.cpp | 6 ++---- src/api/lstmboxrenderer.cpp | 5 ++--- src/api/pagerenderer.cpp | 10 ++-------- src/api/pdfrenderer.cpp | 7 ++----- src/api/wordstrboxrenderer.cpp | 5 ++--- src/ccmain/ltrresultiterator.cpp | 17 ++++------------- src/ccmain/resultiterator.cpp | 6 ++---- src/ccutil/helpers.h | 11 +++++++++++ 10 files changed, 33 insertions(+), 55 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index c3fa14f4d2..4a17a24820 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -14,6 +14,7 @@ // limitations under the License. #include "errcode.h" // for ASSERT_HOST +#include "helpers.h" // for copy_string #ifdef _WIN32 # include "host.h" // windows.h for MultiByteToWideChar, ... #endif @@ -270,12 +271,9 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { alto_str << "\t\t\t\n" << "\t\t\n"; - const std::string &text = alto_str.str(); - char *result = new char[text.length() + 1]; - strcpy(result, text.c_str()); delete res_it; - return result; + return copy_string(alto_str.str()); } } // namespace tesseract diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 06f8331138..72503636c0 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -33,7 +33,7 @@ #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_ #endif // ndef DISABLED_LEGACY_ENGINE #include "errcode.h" // for ASSERT_HOST -#include "helpers.h" // for IntCastRounded, chomp_string +#include "helpers.h" // for IntCastRounded, chomp_string, copy_string #include "host.h" // for MAX_PATH #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ... 
#ifndef DISABLED_LEGACY_ENGINE @@ -1378,9 +1378,7 @@ char *TessBaseAPI::GetUTF8Text() { const std::unique_ptr para_text(it->GetUTF8Text(RIL_PARA)); text += para_text.get(); } while (it->Next(RIL_PARA)); - char *result = new char[text.length() + 1]; - strncpy(result, text.c_str(), text.length() + 1); - return result; + return copy_string(text); } static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) { @@ -1509,9 +1507,7 @@ char *TessBaseAPI::GetTSVText(int page_number) { #endif } - char *ret = new char[tsv_str.length() + 1]; - strcpy(ret, tsv_str.c_str()); - return ret; + return copy_string(tsv_str); } /** The 5 numbers output for each box (the usual 4 and a page number.) */ @@ -1759,10 +1755,7 @@ char *TessBaseAPI::GetOsdText(int page_number) { << "Orientation confidence: " << orient_conf << "\n" << "Script: " << script_name << "\n" << "Script confidence: " << script_conf << "\n"; - const std::string &text = stream.str(); - char *result = new char[text.length() + 1]; - strcpy(result, text.c_str()); - return result; + return copy_string(stream.str()); } #endif // ndef DISABLED_LEGACY_ENGINE diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index 6744a8780f..ea9d7cef40 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -25,6 +25,7 @@ # include "host.h" // windows.h for MultiByteToWideChar, ... 
#endif #include +#include "helpers.h" // for copy_string #include "tesseractclass.h" // for Tesseract namespace tesseract { @@ -480,10 +481,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { } hocr_str << " \n"; - const std::string &text = hocr_str.str(); - char *result = new char[text.length() + 1]; - strcpy(result, text.c_str()); - return result; + return copy_string(hocr_str.str()); } /********************************************************************** diff --git a/src/api/lstmboxrenderer.cpp b/src/api/lstmboxrenderer.cpp index d666b2d7b3..1514eeeda3 100644 --- a/src/api/lstmboxrenderer.cpp +++ b/src/api/lstmboxrenderer.cpp @@ -18,6 +18,7 @@ #include // for TessBaseAPI #include +#include "helpers.h" // for copy_string #include "tesseractclass.h" // for Tesseract namespace tesseract { @@ -81,10 +82,8 @@ char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) { AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str); lstm_box_str += "\n"; // end of PAGE } - char *ret = new char[lstm_box_str.length() + 1]; - strcpy(ret, lstm_box_str.c_str()); delete res_it; - return ret; + return copy_string(lstm_box_str); } /********************************************************************** diff --git a/src/api/pagerenderer.cpp b/src/api/pagerenderer.cpp index 597f86fc01..a611341628 100644 --- a/src/api/pagerenderer.cpp +++ b/src/api/pagerenderer.cpp @@ -14,6 +14,7 @@ // limitations under the License. #include "errcode.h" // for ASSERT_HOST +#include "helpers.h" // for copy_string #ifdef _WIN32 # include "host.h" // windows.h for MultiByteToWideChar, ... 
#endif @@ -1143,15 +1144,8 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) { const std::string &text = reading_order_str.str(); reading_order_str.str(""); - // Allocate memory for result to hold text.length() characters plus a null - // terminator Safely copy the string into result, ensuring no overflow strncpy - // does not necessarily null-terminate the destination, so do it manually - char *result = new char[text.length() + 1]; - strncpy(result, text.c_str(), text.length()); - result[text.length()] = '\0'; - delete res_it; - return result; + return copy_string(text); } } // namespace tesseract diff --git a/src/api/pdfrenderer.cpp b/src/api/pdfrenderer.cpp index e84b063a64..3f6290152f 100644 --- a/src/api/pdfrenderer.cpp +++ b/src/api/pdfrenderer.cpp @@ -22,7 +22,7 @@ #include "pdf_ttf.h" #include "tprintf.h" -#include "helpers.h" // for Swap +#include "helpers.h" // for Swap, copy_string #include #include @@ -497,10 +497,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double pdf_str << "ET\n"; // end the text object } } - const std::string &text = pdf_str.str(); - char *result = new char[text.length() + 1]; - strcpy(result, text.c_str()); - return result; + return copy_string(pdf_str.str()); } bool TessPDFRenderer::BeginDocumentHandler() { diff --git a/src/api/wordstrboxrenderer.cpp b/src/api/wordstrboxrenderer.cpp index fa8c2cd358..e4ffcedba8 100644 --- a/src/api/wordstrboxrenderer.cpp +++ b/src/api/wordstrboxrenderer.cpp @@ -18,6 +18,7 @@ #include // for TessBaseAPI #include +#include "helpers.h" // for copy_string #include "tesseractclass.h" // for Tesseract namespace tesseract { @@ -80,10 +81,8 @@ char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) { wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL wordstr_box_str += "\n"; } - char *ret = new char[wordstr_box_str.length() + 1]; - strcpy(ret, wordstr_box_str.c_str()); delete res_it; - return ret; + return 
copy_string(wordstr_box_str); } /********************************************************************** diff --git a/src/ccmain/ltrresultiterator.cpp b/src/ccmain/ltrresultiterator.cpp index 0073d3dd81..4ff498fa37 100644 --- a/src/ccmain/ltrresultiterator.cpp +++ b/src/ccmain/ltrresultiterator.cpp @@ -19,6 +19,7 @@ #include +#include "helpers.h" // for copy_string #include "pageres.h" #include "tesseractclass.h" @@ -76,10 +77,7 @@ char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const { } } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block()); } - int length = text.length() + 1; - char *result = new char[length]; - strncpy(result, text.c_str(), length); - return result; + return copy_string(text); } // Set the string inserted at the end of each text line. "\n" by default. @@ -310,11 +308,7 @@ char *LTRResultIterator::WordTruthUTF8Text() const { if (!HasTruthString()) { return nullptr; } - std::string truth_text = it_->word()->blamer_bundle->TruthString(); - int length = truth_text.length() + 1; - char *result = new char[length]; - strncpy(result, truth_text.c_str(), length); - return result; + return copy_string(it_->word()->blamer_bundle->TruthString()); } // Returns the null terminated UTF-8 encoded normalized OCR string for the @@ -330,10 +324,7 @@ char *LTRResultIterator::WordNormedUTF8Text() const { for (unsigned i = 0; i < best_choice->length(); ++i) { ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i)); } - auto length = ocr_text.length() + 1; - char *result = new char[length]; - strncpy(result, ocr_text.c_str(), length); - return result; + return copy_string(ocr_text); } // Returns a pointer to serialized choice lattice. 
diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp index c2b3d7082a..1fe4584298 100644 --- a/src/ccmain/resultiterator.cpp +++ b/src/ccmain/resultiterator.cpp @@ -20,6 +20,7 @@ #include +#include "helpers.h" // for copy_string #include "pageres.h" #include "tesseractclass.h" #include "unicharset.h" @@ -681,10 +682,7 @@ char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const { } } break; } - int length = text.length() + 1; - char *result = new char[length]; - strncpy(result, text.c_str(), length); - return result; + return copy_string(text); } std::vector>>> *ResultIterator::GetRawLSTMTimesteps() const { diff --git a/src/ccutil/helpers.h b/src/ccutil/helpers.h index f252f6a9d5..212415020b 100644 --- a/src/ccutil/helpers.h +++ b/src/ccutil/helpers.h @@ -35,6 +35,17 @@ namespace tesseract { +// Copy a std::string to a newly allocated, NUL-terminated char * (caller must delete[] it). +// TODO: Remove this function once the related code has been converted +// to use std::string. +inline char *copy_string(const std::string &from) { + auto length = from.length(); + char *target_string = new char[length + 1]; + from.copy(target_string, length); + target_string[length] = '\0'; + return target_string; +} + template inline bool contains(const std::vector &data, const T &value) { return std::find(data.begin(), data.end(), value) != data.end();