diff --git a/README.md b/README.md index 2b26962..bd52d82 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # [ztracy](https://github.com/zig-gamedev/ztracy) -Performance markers for [Tracy 0.12.2](https://github.com/wolfpld/tracy) in Zig +Performance markers for [Tracy 0.13.0](https://github.com/wolfpld/tracy) in Zig Initial Zig bindings created by [Martin Wickham](https://github.com/SpexGuy/Zig-Tracy) diff --git a/libs/tracy/TracyClient.cpp b/libs/tracy/TracyClient.cpp index 6224f48..8e66975 100644 --- a/libs/tracy/TracyClient.cpp +++ b/libs/tracy/TracyClient.cpp @@ -32,27 +32,11 @@ #include "client/TracyOverride.cpp" #include "client/TracyKCore.cpp" -#if defined(TRACY_HAS_CALLSTACK) -# if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 -# include "libbacktrace/alloc.cpp" -# include "libbacktrace/dwarf.cpp" -# include "libbacktrace/fileline.cpp" -# include "libbacktrace/mmapio.cpp" -# include "libbacktrace/posix.cpp" -# include "libbacktrace/sort.cpp" -# include "libbacktrace/state.cpp" -# if TRACY_HAS_CALLSTACK == 4 -# include "libbacktrace/macho.cpp" -# else -# include "libbacktrace/elf.cpp" -# endif -# include "common/TracyStackFrames.cpp" -# endif +#ifdef TRACY_ROCPROF +# include "client/TracyRocprof.cpp" #endif - #ifdef _MSC_VER # pragma comment(lib, "ws2_32.lib") -# pragma comment(lib, "dbghelp.lib") # pragma comment(lib, "advapi32.lib") # pragma comment(lib, "user32.lib") # pragma warning(pop) diff --git a/libs/tracy/client/TracyCallstack.cpp b/libs/tracy/client/TracyCallstack.cpp index bd32906..7ab6b1c 100644 --- a/libs/tracy/client/TracyCallstack.cpp +++ b/libs/tracy/client/TracyCallstack.cpp @@ -24,15 +24,33 @@ # pragma warning( disable : 4091 ) # endif # include +# pragma comment( lib, "dbghelp.lib" ) # ifdef _MSC_VER # pragma warning( pop ) # endif -#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 +#elif 
defined(TRACY_USE_LIBBACKTRACE) + # include "../libbacktrace/backtrace.hpp" # include # include # include # include + +// Implementation files +# include "../libbacktrace/alloc.cpp" +# include "../libbacktrace/dwarf.cpp" +# include "../libbacktrace/fileline.cpp" +# include "../libbacktrace/mmapio.cpp" +# include "../libbacktrace/posix.cpp" +# include "../libbacktrace/sort.cpp" +# include "../libbacktrace/state.cpp" +# if TRACY_HAS_CALLSTACK == 4 +# include "../libbacktrace/macho.cpp" +# else +# include "../libbacktrace/elf.cpp" +# endif +# include "../common/TracyStackFrames.cpp" + #elif TRACY_HAS_CALLSTACK == 5 # include # include @@ -53,7 +71,7 @@ extern "C" }; #endif -#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 5 || TRACY_HAS_CALLSTACK == 6 +#if defined(TRACY_USE_LIBBACKTRACE) || TRACY_HAS_CALLSTACK == 5 // If you want to use your own demangling functionality (e.g. for another language), // define TRACY_DEMANGLE and provide your own implementation of the __tracy_demangle // function. The input parameter is a function name. The demangle function must @@ -91,94 +109,147 @@ extern "C" const char* ___tracy_demangle( const char* mangled ) #endif #endif -#if TRACY_HAS_CALLSTACK == 3 -# define TRACY_USE_IMAGE_CACHE +#if defined(TRACY_USE_LIBBACKTRACE) && TRACY_HAS_CALLSTACK != 4 // dl_iterate_phdr is required for the current image cache. Need to move it to libbacktrace? 
+# define TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE # include #endif namespace tracy { -#ifdef TRACY_USE_IMAGE_CACHE +static bool IsKernelAddress(uint64_t addr) { + return (addr >> 63) != 0; +} + +void DestroyImageEntry( ImageEntry& entry ) +{ + tracy_free( entry.m_path ); + tracy_free( entry.m_name ); +} + +class ImageCache +{ +public: + + ImageCache( size_t imageCacheCapacity = 512 ) + : m_images( imageCacheCapacity ) + { + } + + ~ImageCache() + { + Clear(); + } + + ImageEntry* AddEntry( const ImageEntry& entry ) + { + if( m_sorted ) m_sorted = m_images.empty() || ( entry.m_startAddress < m_images.back().m_startAddress ); + ImageEntry* newEntry = m_images.push_next(); + *newEntry = entry; + return newEntry; + } + + const ImageEntry* GetImageForAddress( uint64_t address ) + { + Sort(); + + auto it = std::lower_bound( m_images.begin(), m_images.end(), address, + []( const ImageEntry& lhs, const uint64_t rhs ) { return lhs.m_startAddress > rhs; } ); + + if( it != m_images.end() && address < it->m_endAddress ) + { + return it; + } + return nullptr; + } + + void Sort() + { + if( m_sorted ) return; + + std::sort( m_images.begin(), m_images.end(), + []( const ImageEntry& lhs, const ImageEntry& rhs ) { return lhs.m_startAddress > rhs.m_startAddress; } ); + m_sorted = true; + } + + void Clear() + { + for( ImageEntry& entry : m_images ) + { + DestroyImageEntry( entry ); + } + + m_sorted = true; + m_images.clear(); + } + + bool ContainsImage( uint64_t startAddress ) const + { + return std::any_of( m_images.begin(), m_images.end(), [startAddress]( const ImageEntry& entry ) { return startAddress == entry.m_startAddress; } ); + } +protected: + tracy::FastVector m_images; + bool m_sorted = true; +}; + +#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE // when we have access to dl_iterate_phdr(), we can build a cache of address ranges to image paths // so we can quickly determine which image an address falls into. 
// We refresh this cache only when we hit an address that doesn't fall into any known range. -class ImageCache +class ImageCacheDlIteratePhdr : public ImageCache { public: - struct ImageEntry - { - void* m_startAddress = nullptr; - void* m_endAddress = nullptr; - char* m_name = nullptr; - }; - ImageCache() - : m_images( 512 ) + ImageCacheDlIteratePhdr() { Refresh(); } - ~ImageCache() + ~ImageCacheDlIteratePhdr() { - Clear(); } - const ImageEntry* GetImageForAddress( void* address ) + const ImageEntry* GetImageForAddress( uint64_t address ) { - const ImageEntry* entry = GetImageForAddressImpl( address ); + const ImageEntry* entry = ImageCache::GetImageForAddress( address ); if( !entry ) { Refresh(); - return GetImageForAddressImpl( address ); + return ImageCache::GetImageForAddress( address ); } return entry; } private: - tracy::FastVector m_images; bool m_updated = false; bool m_haveMainImageName = false; static int Callback( struct dl_phdr_info* info, size_t size, void* data ) { - ImageCache* cache = reinterpret_cast( data ); + ImageCacheDlIteratePhdr* cache = reinterpret_cast( data ); - const auto startAddress = reinterpret_cast( info->dlpi_addr ); - if( cache->Contains( startAddress ) ) return 0; + const auto startAddress = static_cast( info->dlpi_addr ); + if( cache->ContainsImage( startAddress ) ) return 0; const uint32_t headerCount = info->dlpi_phnum; assert( headerCount > 0); - const auto endAddress = reinterpret_cast( info->dlpi_addr + + const auto endAddress = static_cast( info->dlpi_addr + info->dlpi_phdr[info->dlpi_phnum - 1].p_vaddr + info->dlpi_phdr[info->dlpi_phnum - 1].p_memsz); - ImageEntry* image = cache->m_images.push_next(); - image->m_startAddress = startAddress; - image->m_endAddress = endAddress; + ImageEntry image{}; + image.m_startAddress = startAddress; + image.m_endAddress = endAddress; // the base executable name isn't provided when iterating with dl_iterate_phdr, // we will have to patch the executable image name outside this callback - 
if( info->dlpi_name && info->dlpi_name[0] != '\0' ) - { - size_t sz = strlen( info->dlpi_name ) + 1; - image->m_name = (char*)tracy_malloc( sz ); - memcpy( image->m_name, info->dlpi_name, sz ); - } - else - { - image->m_name = nullptr; - } + image.m_name = info->dlpi_name && info->dlpi_name[0] != '\0' ? CopyStringFast( info->dlpi_name ) : nullptr; + cache->AddEntry( image ); cache->m_updated = true; return 0; } - bool Contains( void* startAddress ) const - { - return std::any_of( m_images.begin(), m_images.end(), [startAddress]( const ImageEntry& entry ) { return startAddress == entry.m_startAddress; } ); - } - void Refresh() { m_updated = false; @@ -186,9 +257,7 @@ class ImageCache if( m_updated ) { - std::sort( m_images.begin(), m_images.end(), - []( const ImageEntry& lhs, const ImageEntry& rhs ) { return lhs.m_startAddress > rhs.m_startAddress; } ); - + Sort(); // patch the main executable image name here, as calling dl_* functions inside the dl_iterate_phdr callback might cause deadlocks UpdateMainImageName(); } @@ -223,31 +292,45 @@ class ImageCache m_haveMainImageName = true; } - - const ImageEntry* GetImageForAddressImpl( void* address ) const + void Clear() { - auto it = std::lower_bound( m_images.begin(), m_images.end(), address, - []( const ImageEntry& lhs, const void* rhs ) { return lhs.m_startAddress > rhs; } ); - - if( it != m_images.end() && address < it->m_endAddress ) - { - return it; - } - return nullptr; + ImageCache::Clear(); + m_haveMainImageName = false; } +}; +using UserlandImageCache = ImageCacheDlIteratePhdr; +#else +using UserlandImageCache = ImageCache; +#endif //#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE - void Clear() +static UserlandImageCache* s_imageCache; +static ImageCache* s_krnlCache; + +void CreateImageCaches() +{ + assert( s_imageCache == nullptr && s_krnlCache == nullptr ); + s_imageCache = new ( tracy_malloc( sizeof( UserlandImageCache ) ) ) UserlandImageCache(); + s_krnlCache = new ( tracy_malloc( sizeof( 
ImageCache ) ) ) ImageCache(); +} + +void DestroyImageCaches() +{ + if( s_krnlCache != nullptr ) { - for( ImageEntry& entry : m_images ) - { - tracy_free( entry.m_name ); - } + s_krnlCache->~ImageCache(); + tracy_free( s_krnlCache ); + s_krnlCache = nullptr; + } - m_images.clear(); - m_haveMainImageName = false; + if( s_imageCache != nullptr ) + { + s_imageCache->~UserlandImageCache(); + tracy_free( s_imageCache ); + s_imageCache = nullptr; } -}; -#endif //#ifdef TRACY_USE_IMAGE_CACHE + +} + // when "TRACY_SYMBOL_OFFLINE_RESOLVE" is set, instead of fully resolving symbols at runtime, // simply resolve the offset and image name (which will be enough the resolving to be done offline) @@ -290,26 +373,6 @@ extern "C" } } -struct ModuleCache -{ - uint64_t start; - uint64_t end; - char* name; -}; - -static FastVector* s_modCache; - - -struct KernelDriver -{ - uint64_t addr; - const char* mod; - const char* path; -}; - -KernelDriver* s_krnlCache = nullptr; -size_t s_krnlCacheCnt; - void InitCallstackCritical() { ___tracy_RtlWalkFrameChainPtr = (___tracy_t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); @@ -343,75 +406,57 @@ DWORD64 DbgHelpLoadSymbolsForModule( const char* imageName, uint64_t baseOfDll, return SymLoadModuleEx( GetCurrentProcess(), nullptr, imageName, nullptr, baseOfDll, bllSize, nullptr, 0 ); } -ModuleCache* LoadSymbolsForModuleAndCache( const char* imageName, uint32_t imageNameLength, uint64_t baseOfDll, uint32_t dllSize ) +char* FormatImageName( const char* imageName, uint32_t imageNameLength ) { - DbgHelpLoadSymbolsForModule( imageName, baseOfDll, dllSize ); - - ModuleCache* cachedModule = s_modCache->push_next(); - cachedModule->start = baseOfDll; - cachedModule->end = baseOfDll + dllSize; - // when doing offline symbol resolution, we must store the full path of the dll for the resolving to work if( s_shouldResolveSymbolsOffline ) { - cachedModule->name = (char*)tracy_malloc_fast(imageNameLength + 1); - 
memcpy(cachedModule->name, imageName, imageNameLength); - cachedModule->name[imageNameLength] = '\0'; + return CopyStringFast( imageName, imageNameLength ); } else { - auto ptr = imageName + imageNameLength; - while (ptr > imageName && *ptr != '\\' && *ptr != '/') ptr--; - if (ptr > imageName) ptr++; + const char* ptr = imageName + imageNameLength; + while( ptr > imageName && *ptr != '\\' && *ptr != '/' ) ptr--; + if( ptr > imageName ) ptr++; const auto namelen = imageName + imageNameLength - ptr; - cachedModule->name = (char*)tracy_malloc_fast(namelen + 3); - cachedModule->name[0] = '['; - memcpy(cachedModule->name + 1, ptr, namelen); - cachedModule->name[namelen + 1] = ']'; - cachedModule->name[namelen + 2] = '\0'; - } - return cachedModule; + char* alloc = (char*)tracy_malloc_fast( namelen + 3 ); + alloc[0] = '['; + memcpy( alloc + 1, ptr, namelen ); + alloc[namelen + 1] = ']'; + alloc[namelen + 2] = '\0'; + return alloc; + } } -void InitCallstack() +ImageEntry* CacheModuleInfo( const char* imagePath, uint32_t imageNameLength, uint64_t baseOfDll, uint32_t dllSize ) { -#ifndef TRACY_SYMBOL_OFFLINE_RESOLVE - s_shouldResolveSymbolsOffline = ShouldResolveSymbolsOffline(); -#endif //#ifndef TRACY_SYMBOL_OFFLINE_RESOLVE - if( s_shouldResolveSymbolsOffline ) - { - TracyDebug("TRACY: enabling offline symbol resolving!\n"); - } + ImageEntry moduleEntry = {}; + moduleEntry.m_startAddress = baseOfDll; + moduleEntry.m_endAddress = baseOfDll + dllSize; + moduleEntry.m_path = CopyStringFast( imagePath, imageNameLength ); + moduleEntry.m_name = FormatImageName( imagePath, imageNameLength ); - DbgHelpInit(); - -#ifdef TRACY_DBGHELP_LOCK - DBGHELP_LOCK; -#endif + return s_imageCache->AddEntry( moduleEntry ); +} - // use TRACY_NO_DBGHELP_INIT_LOAD=1 to disable preloading of driver - // and process module symbol loading at startup time - they will be loaded on demand later - // Sometimes this process can take a very long time and prevent resolving callstack frames - // symbols 
during that time. - const char* noInitLoadEnv = GetEnvVar( "TRACY_NO_DBGHELP_INIT_LOAD" ); - const bool initTimeModuleLoad = !( noInitLoadEnv && noInitLoadEnv[0] == '1' ); - if ( !initTimeModuleLoad ) - { - TracyDebug("TRACY: skipping init time dbghelper module load\n"); - } +ImageEntry* LoadSymbolsForModuleAndCache( const char* imagePath, uint32_t imageNameLength, uint64_t baseOfDll, uint32_t dllSize ) +{ + DbgHelpLoadSymbolsForModule( imagePath, baseOfDll, dllSize ); + return CacheModuleInfo( imagePath, imageNameLength, baseOfDll, dllSize ); +} +static void CacheProcessDrivers() +{ DWORD needed; LPVOID dev[4096]; - if( initTimeModuleLoad && EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0 ) + if( EnumDeviceDrivers( dev, sizeof(dev), &needed ) != 0 ) { char windir[MAX_PATH]; if( !GetWindowsDirectoryA( windir, sizeof( windir ) ) ) memcpy( windir, "c:\\windows", 11 ); const auto windirlen = strlen( windir ); const auto sz = needed / sizeof( LPVOID ); - s_krnlCache = (KernelDriver*)tracy_malloc( sizeof(KernelDriver) * sz ); - int cnt = 0; for( size_t i=0; i", 2 ); - s_krnlCache[cnt] = KernelDriver { (uint64_t)dev[i], buf }; + + ImageEntry kernelDriver{}; + kernelDriver.m_startAddress = (uint64_t)dev[i]; + kernelDriver.m_endAddress = 0; + kernelDriver.m_name = buf; + kernelDriver.m_path = nullptr; const auto len = GetDeviceDriverFileNameA( dev[i], fn, sizeof( fn ) ); if( len != 0 ) @@ -438,27 +488,23 @@ void InitCallstack() } DbgHelpLoadSymbolsForModule( path, (DWORD64)dev[i], 0 ); - - const auto psz = strlen( path ); - auto pptr = (char*)tracy_malloc_fast( psz+1 ); - memcpy( pptr, path, psz ); - pptr[psz] = '\0'; - s_krnlCache[cnt].path = pptr; + + kernelDriver.m_path = CopyString( path ); } - cnt++; + s_krnlCache->AddEntry(kernelDriver); } } - s_krnlCacheCnt = cnt; - std::sort( s_krnlCache, s_krnlCache + s_krnlCacheCnt, []( const KernelDriver& lhs, const KernelDriver& rhs ) { return lhs.addr > rhs.addr; } ); + s_krnlCache->Sort(); } +} - s_modCache = 
(FastVector*)tracy_malloc( sizeof( FastVector ) ); - new(s_modCache) FastVector( 512 ); - +static void CacheProcessModules() +{ + DWORD needed; HANDLE proc = GetCurrentProcess(); HMODULE mod[1024]; - if( initTimeModuleLoad && EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) + if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) { const auto sz = needed / sizeof( HMODULE ); for( size_t i=0; i> 63 != 0 ); + assert( IsKernelAddress( addr ) ); if( !s_krnlCache ) return nullptr; - auto it = std::lower_bound( s_krnlCache, s_krnlCache + s_krnlCacheCnt, addr, []( const KernelDriver& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); - if( it == s_krnlCache + s_krnlCacheCnt ) return nullptr; - return it->path; + const ImageEntry* imageEntry = s_krnlCache->GetImageForAddress( addr ); + if( imageEntry ) return imageEntry->m_path; + return nullptr; } struct ModuleNameAndBaseAddress @@ -534,51 +616,38 @@ struct ModuleNameAndBaseAddress ModuleNameAndBaseAddress GetModuleNameAndPrepareSymbols( uint64_t addr ) { - if( ( addr >> 63 ) != 0 ) + if( IsKernelAddress( addr ) ) { - if( s_krnlCache ) - { - auto it = std::lower_bound( s_krnlCache, s_krnlCache + s_krnlCacheCnt, addr, []( const KernelDriver& lhs, const uint64_t& rhs ) { return lhs.addr > rhs; } ); - if( it != s_krnlCache + s_krnlCacheCnt ) - { - return ModuleNameAndBaseAddress{ it->mod, it->addr }; - } - } + const ImageEntry* entry = s_krnlCache->GetImageForAddress( addr ); + if( entry != nullptr ) return ModuleNameAndBaseAddress{ entry->m_name, entry->m_startAddress }; return ModuleNameAndBaseAddress{ "", addr }; } - for( auto& v : *s_modCache ) - { - if( addr >= v.start && addr < v.end ) - { - return ModuleNameAndBaseAddress{ v.name, v.start }; - } - } + const ImageEntry* entry = s_imageCache->GetImageForAddress( addr ); + if( entry != nullptr ) return ModuleNameAndBaseAddress{ entry->m_name, entry->m_startAddress }; - HMODULE mod[1024]; - DWORD needed; HANDLE proc = GetCurrentProcess(); + 
// Do not use FreeLibrary because we set the flag GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT + // see https://learn.microsoft.com/en-us/windows/win32/api/libloaderapi/nf-libloaderapi-getmodulehandleexa to get more information + constexpr DWORD flag = GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT; + HMODULE mod = NULL; InitRpmalloc(); - if( EnumProcessModules( proc, mod, sizeof( mod ), &needed ) != 0 ) + if( GetModuleHandleExA( flag, (char*)addr, &mod ) != 0 ) { - const auto sz = needed / sizeof( HMODULE ); - for( size_t i=0; i= base && addr < ( base + info.SizeOfImage ) ) { - const auto base = uint64_t( info.lpBaseOfDll ); - if( addr >= base && addr < base + info.SizeOfImage ) + char name[1024]; + const auto nameLength = GetModuleFileNameA( mod, name, sizeof( name ) ); + if( nameLength > 0 ) { - char name[1024]; - const auto nameLength = GetModuleFileNameA( mod[i], name, 1021 ); - if( nameLength > 0 ) - { - // since this is the first time we encounter this module, load its symbols (needed for modules loaded after SymInitialize) - ModuleCache* cachedModule = LoadSymbolsForModuleAndCache( name, nameLength, (DWORD64)info.lpBaseOfDll, info.SizeOfImage ); - return ModuleNameAndBaseAddress{ cachedModule->name, cachedModule->start }; - } + // since this is the first time we encounter this module, load its symbols (needed for modules loaded after SymInitialize) + ImageEntry* cachedModule = LoadSymbolsForModuleAndCache( name, nameLength, (DWORD64)info.lpBaseOfDll, info.SizeOfImage ); + return ModuleNameAndBaseAddress{ cachedModule->m_name, cachedModule->m_startAddress }; } } } @@ -758,7 +827,7 @@ CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) return { cb_data, uint8_t( cb_num ), moduleNameAndAddress.name }; } -#elif TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 +#elif defined(TRACY_USE_LIBBACKTRACE) enum { MaxCbTrace = 64 }; @@ -767,9 +836,6 @@ struct 
backtrace_state* cb_bts = nullptr; int cb_num; CallstackEntry cb_data[MaxCbTrace]; int cb_fixup; -#ifdef TRACY_USE_IMAGE_CACHE -static ImageCache* s_imageCache = nullptr; -#endif //#ifdef TRACY_USE_IMAGE_CACHE #ifdef TRACY_DEBUGINFOD debuginfod_client* s_debuginfod; @@ -964,10 +1030,9 @@ void InitCallstack() { InitRpmalloc(); -#ifdef TRACY_USE_IMAGE_CACHE - s_imageCache = (ImageCache*)tracy_malloc( sizeof( ImageCache ) ); - new(s_imageCache) ImageCache(); -#endif //#ifdef TRACY_USE_IMAGE_CACHE +#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE + CreateImageCaches(); +#endif //#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE #ifndef TRACY_SYMBOL_OFFLINE_RESOLVE s_shouldResolveSymbolsOffline = ShouldResolveSymbolsOffline(); @@ -1061,13 +1126,9 @@ debuginfod_client* GetDebuginfodClient() void EndCallstack() { -#ifdef TRACY_USE_IMAGE_CACHE - if( s_imageCache ) - { - s_imageCache->~ImageCache(); - tracy_free( s_imageCache ); - } -#endif //#ifdef TRACY_USE_IMAGE_CACHE +#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE + DestroyImageCaches(); +#endif //#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE #ifndef TRACY_DEMANGLE ___tracy_free_demangle_buffer(); #endif @@ -1257,17 +1318,17 @@ void GetSymbolForOfflineResolve(void* address, uint64_t imageBaseAddress, Callst CallstackEntryData DecodeCallstackPtr( uint64_t ptr ) { InitRpmalloc(); - if( ptr >> 63 == 0 ) + if ( !IsKernelAddress( ptr ) ) { const char* imageName = nullptr; uint64_t imageBaseAddress = 0x0; -#ifdef TRACY_USE_IMAGE_CACHE - const auto* image = s_imageCache->GetImageForAddress((void*)ptr); +#ifdef TRACY_HAS_DL_ITERATE_PHDR_TO_REFRESH_IMAGE_CACHE + const auto* image = s_imageCache->GetImageForAddress( ptr ); if( image ) { imageName = image->m_name; - imageBaseAddress = uint64_t(image->m_startAddress); + imageBaseAddress = uint64_t( image->m_startAddress ); } #else Dl_info dlinfo; diff --git a/libs/tracy/client/TracyCallstack.h b/libs/tracy/client/TracyCallstack.h index 
2c7ecad..2df1542 100644 --- a/libs/tracy/client/TracyCallstack.h +++ b/libs/tracy/client/TracyCallstack.h @@ -8,8 +8,8 @@ # endif # if defined _WIN32 -# include "../common/TracyUwp.hpp" -# ifndef TRACY_UWP +# include "../common/TracyWinFamily.hpp" +# if !defined TRACY_WIN32_NO_DESKTOP # define TRACY_HAS_CALLSTACK 1 # endif # elif defined __ANDROID__ @@ -30,6 +30,10 @@ # define TRACY_HAS_CALLSTACK 6 # endif +#if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 +#define TRACY_USE_LIBBACKTRACE +#endif + #endif #endif diff --git a/libs/tracy/client/TracyCallstack.hpp b/libs/tracy/client/TracyCallstack.hpp index 1d8cd65..7d8ed6e 100644 --- a/libs/tracy/client/TracyCallstack.hpp +++ b/libs/tracy/client/TracyCallstack.hpp @@ -1,10 +1,25 @@ #ifndef __TRACYCALLSTACK_HPP__ #define __TRACYCALLSTACK_HPP__ +#include + #include "../common/TracyApi.h" #include "../common/TracyForceInline.hpp" #include "TracyCallstack.h" +namespace tracy +{ + +struct ImageEntry +{ + uint64_t m_startAddress = 0; + uint64_t m_endAddress = 0; + char* m_name = nullptr; + char* m_path = nullptr; +}; + +} + #ifndef TRACY_HAS_CALLSTACK namespace tracy diff --git a/libs/tracy/client/TracyLock.hpp b/libs/tracy/client/TracyLock.hpp index d12a3c1..e00b344 100644 --- a/libs/tracy/client/TracyLock.hpp +++ b/libs/tracy/client/TracyLock.hpp @@ -219,8 +219,9 @@ class Lockable m_ctx.CustomName( name, size ); } -private: T m_lockable; + +private: LockableCtx m_ctx; }; @@ -535,8 +536,9 @@ class SharedLockable m_ctx.CustomName( name, size ); } -private: T m_lockable; + +private: SharedLockableCtx m_ctx; }; diff --git a/libs/tracy/client/TracyProfiler.cpp b/libs/tracy/client/TracyProfiler.cpp index 2283076..b4413cb 100644 --- a/libs/tracy/client/TracyProfiler.cpp +++ b/libs/tracy/client/TracyProfiler.cpp @@ -9,7 +9,7 @@ # include # include # include -# include "../common/TracyUwp.hpp" +# include "../common/TracyWinFamily.hpp" # ifndef _MSC_VER # include # 
endif @@ -327,7 +327,13 @@ static inline void CpuId( uint32_t* regs, uint32_t leaf ) static void InitFailure( const char* msg ) { -#if defined _WIN32 +#if defined TRACY_GDK + const char* format = "Tracy Profiler initialization failure: %s\n"; + const int length = snprintf( nullptr, 0, format, msg ); + char* buffer = (char*)alloca( length + 1 ); + snprintf( buffer, length + 1, format, msg ); + OutputDebugStringA( buffer ); +#elif defined _WIN32 bool hasConsole = false; bool reopen = false; const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); @@ -510,7 +516,7 @@ static const char* GetHostInfo() static char buf[1024]; auto ptr = buf; #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP auto GetVersion = &::GetVersionEx; # else auto GetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); @@ -593,7 +599,7 @@ static const char* GetHostInfo() char hostname[512]; gethostname( hostname, 512 ); -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP const char* user = ""; # else DWORD userSz = UNLEN+1; @@ -804,7 +810,7 @@ static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, return msg; } -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER static DWORD s_profilerThreadId = 0; static DWORD s_symbolThreadId = 0; static char s_crashText[1024]; @@ -1165,6 +1171,54 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) } #endif +#ifdef TRACY_HAS_SYSTEM_TRACING +static void StartSystemTracing( int64_t& samplingPeriod ) +{ + assert( s_sysTraceThread == nullptr ); + + // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing (even if available in the underlying system) + // as it can have significant impact on the size of the traces + const char* noSysTrace = GetEnvVar( "TRACY_NO_SYS_TRACE" ); + const bool disableSystrace = (noSysTrace && 
noSysTrace[0] == '1'); + if( disableSystrace ) + { + TracyDebug("TRACY: Sys Trace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); + } + else if( SysTraceStart( samplingPeriod ) ) + { + s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); + std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); + } +} + +static void StopSystemTracing() +{ + if( s_sysTraceThread ) + { + SysTraceStop(); + s_sysTraceThread->~Thread(); + tracy_free( s_sysTraceThread ); + s_sysTraceThread = nullptr; + } +} +#endif + +bool Profiler::BeginSamplingProfiling() +{ +#if !defined(TRACY_HAS_SYSTEM_TRACING) + return false; +#elif defined(TRACY_SAMPLING_PROFILER_MANUAL_START) + StartSystemTracing( m_samplingPeriod ); +#endif + return true; +} +void Profiler::EndSamplingProfiling() +{ +#if defined(TRACY_HAS_SYSTEM_TRACING) && defined(TRACY_SAMPLING_PROFILER_MANUAL_START) + StopSystemTracing(); +#endif +} enum { QueuePrealloc = 256 * 1024 }; @@ -1404,6 +1458,9 @@ TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } TRACY_API bool ProfilerAllocatorAvailable() { return !RpThreadShutdown; } +TRACY_API bool BeginSamplingProfiling() { return GetProfiler().BeginSamplingProfiling(); } +TRACY_API void EndSamplingProfiling() { return GetProfiler().EndSamplingProfiling(); } + constexpr static size_t SafeSendBufferSize = 65536; Profiler::Profiler() @@ -1435,6 +1492,7 @@ Profiler::Profiler() , m_isConnected( false ) #ifdef TRACY_ON_DEMAND , m_connectionId( 0 ) + , m_symbolsBusy( false ) , m_deferredQueue( 64*1024 ) #endif , m_paramCallback( nullptr ) @@ -1518,7 +1576,7 @@ void Profiler::InstallCrashHandler() sigaction( SIGABRT, &crashHandler, &m_prevSignal.abrt ); #endif -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER // We cannot use 
Vectored Exception handling because it catches application-wide frame-based SEH blocks. We only // want to catch unhandled exceptions. m_prevHandler = reinterpret_cast( SetUnhandledExceptionFilter( CrashFilter ) ); @@ -1532,7 +1590,7 @@ void Profiler::InstallCrashHandler() void Profiler::RemoveCrashHandler() { -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER if( m_crashHandlerInstalled ) { auto prev = SetUnhandledExceptionFilter( (LPTOP_LEVEL_EXCEPTION_FILTER)m_prevHandler ); @@ -1562,21 +1620,8 @@ void Profiler::RemoveCrashHandler() void Profiler::SpawnWorkerThreads() { -#ifdef TRACY_HAS_SYSTEM_TRACING - // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing (even if available in the underlying system) - // as it can have significant impact on the size of the traces - const char* noSysTrace = GetEnvVar( "TRACY_NO_SYS_TRACE" ); - const bool disableSystrace = (noSysTrace && noSysTrace[0] == '1'); - if( disableSystrace ) - { - TracyDebug("TRACY: Sys Trace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); - } - else if( SysTraceStart( m_samplingPeriod ) ) - { - s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); - new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); - std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); - } +#if defined(TRACY_HAS_SYSTEM_TRACING) && !defined(TRACY_SAMPLING_PROFILER_MANUAL_START) + StartSystemTracing( m_samplingPeriod ); #endif s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); @@ -1592,7 +1637,7 @@ void Profiler::SpawnWorkerThreads() new(s_symbolThread) Thread( LaunchSymbolWorker, this ); #endif -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER s_profilerThreadId = GetThreadId( s_thread->Handle() ); # ifdef TRACY_HAS_CALLSTACK s_symbolThreadId = GetThreadId( 
s_symbolThread->Handle() ); @@ -1613,12 +1658,7 @@ Profiler::~Profiler() RemoveCrashHandler(); #ifdef TRACY_HAS_SYSTEM_TRACING - if( s_sysTraceThread ) - { - SysTraceStop(); - s_sysTraceThread->~Thread(); - tracy_free( s_sysTraceThread ); - } + StopSystemTracing(); #endif #ifdef TRACY_HAS_CALLSTACK @@ -1771,7 +1811,6 @@ void Profiler::Worker() MemWrite( &welcome.timerMul, m_timerMul ); MemWrite( &welcome.initBegin, GetInitTime() ); MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) ); - MemWrite( &welcome.delay, m_delay ); MemWrite( &welcome.resolution, m_resolution ); MemWrite( &welcome.epoch, m_epoch ); MemWrite( &welcome.exectime, m_exectime ); @@ -1939,6 +1978,8 @@ void Profiler::Worker() } #ifdef TRACY_ON_DEMAND + while( m_symbolsBusy.load( std::memory_order_acquire ) ) { YieldThread(); } + m_symbolsBusy.store( true, std::memory_order_release ); const auto currentTime = GetTime(); ClearQueues( token ); m_connectionId.fetch_add( 1, std::memory_order_release ); @@ -2011,7 +2052,6 @@ void Profiler::Worker() } else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) { - if( ShouldExit() ) break; if( m_bufferOffset != m_bufferStart ) { if( !CommitData() ) break; @@ -2042,7 +2082,7 @@ void Profiler::Worker() connActive = HandleServerQuery(); if( !connActive ) break; } - if( !connActive ) break; + if( !connActive || ShouldExit() ) break; } if( ShouldExit() ) break; @@ -2108,7 +2148,13 @@ void Profiler::Worker() while( s_symbolThreadGone.load() == false ) { YieldThread(); } #endif - // Client is exiting. Send items remaining in queues. + // Client is exiting. +#ifdef TRACY_HAS_SYSTEM_TRACING + // Stop filling queues with new data. + StopSystemTracing(); +#endif + + // Send items remaining in queues. 
for(;;) { const auto status = Dequeue( token ); @@ -2359,6 +2405,10 @@ static void FreeAssociatedMemory( const QueueItem& item ) tracy_free( (void*)ptr ); break; #endif + case QueueType::GpuAnnotationName: + ptr = MemRead( &item.gpuAnnotationNameFat.ptr ); + tracy_free( (void*)ptr ); + break; #ifdef TRACY_ON_DEMAND case QueueType::MessageAppInfo: case QueueType::GpuContextName: @@ -2574,6 +2624,12 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) tracy_free_fast( (void*)ptr ); #endif break; + case QueueType::GpuAnnotationName: + ptr = MemRead( &item->gpuAnnotationNameFat.ptr ); + size = MemRead( &item->gpuAnnotationNameFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; case QueueType::PlotDataInt: case QueueType::PlotDataFloat: case QueueType::PlotDataDouble: @@ -2793,9 +2849,13 @@ Profiler::DequeueStatus Profiler::DequeueSerial() } } + DequeueStatus dequeueStatus = DequeueStatus::QueueEmpty; + const auto sz = m_serialDequeue.size(); if( sz > 0 ) { + dequeueStatus = DequeueStatus::DataDequeued; + InitRpmalloc(); int64_t refSerial = m_refTimeSerial; int64_t refGpu = m_refTimeGpu; @@ -2932,6 +2992,14 @@ Profiler::DequeueStatus Profiler::DequeueSerial() #endif break; } + case QueueType::GpuAnnotationName: + { + ptr = MemRead( &item->gpuAnnotationNameFat.ptr ); + uint16_t size = MemRead( &item->gpuAnnotationNameFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } #ifdef TRACY_FIBERS case QueueType::ZoneBegin: case QueueType::ZoneBeginCallstack: @@ -3084,7 +3152,10 @@ Profiler::DequeueStatus Profiler::DequeueSerial() } } #endif - if( !AppendData( item, QueueDataSize[idx] ) ) return DequeueStatus::ConnectionLost; + if( dequeueStatus != DequeueStatus::ConnectionLost && !AppendData( item, QueueDataSize[idx] ) ) + { + dequeueStatus = DequeueStatus::ConnectionLost; + } item++; } m_refTimeSerial = refSerial; @@ -3094,11 +3165,7 @@ 
Profiler::DequeueStatus Profiler::DequeueSerial() #endif m_serialDequeue.clear(); } - else - { - return DequeueStatus::QueueEmpty; - } - return DequeueStatus::DataDequeued; + return dequeueStatus; } Profiler::ThreadCtxStatus Profiler::ThreadCtxCheck( uint32_t threadId ) @@ -3559,6 +3626,7 @@ void Profiler::SymbolWorker() } while( m_symbolQueue.front() ) m_symbolQueue.pop(); std::this_thread::sleep_for( std::chrono::milliseconds( 20 ) ); + m_symbolsBusy.store( false, std::memory_order_release ); continue; } #endif @@ -3811,43 +3879,6 @@ void Profiler::CalibrateDelay() if( dti > 0 && dti < mindiff ) mindiff = dti; } m_resolution = mindiff; - -#ifdef TRACY_DELAYED_INIT - m_delay = m_resolution; -#else - constexpr int Events = Iterations * 2; // start + end - static_assert( Events < QueuePrealloc, "Delay calibration loop will allocate memory in queue" ); - - static const tracy::SourceLocationData __tracy_source_location { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; - const auto t0 = GetTime(); - for( int i=0; izoneBegin.time, Profiler::GetTime() ); - MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location ); - TracyLfqCommit; - } - { - TracyLfqPrepare( QueueType::ZoneEnd ); - MemWrite( &item->zoneEnd.time, GetTime() ); - TracyLfqCommit; - } - } - const auto t1 = GetTime(); - const auto dt = t1 - t0; - m_delay = dt / Events; - - moodycamel::ConsumerToken token( GetQueue() ); - int left = Events; - while( left != 0 ) - { - const auto sz = GetQueue().try_dequeue_bulk_single( token, [](const uint64_t&){}, [](QueueItem* item, size_t sz){} ); - assert( sz > 0 ); - left -= (int)sz; - } - assert( GetQueue().size_approx() == 0 ); -#endif } void Profiler::ReportTopology() @@ -3862,7 +3893,7 @@ void Profiler::ReportTopology() }; #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = &::GetLogicalProcessorInformationEx; # else t_GetLogicalProcessorInformationEx 
_GetLogicalProcessorInformationEx = (t_GetLogicalProcessorInformationEx)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetLogicalProcessorInformationEx" ); @@ -4767,6 +4798,11 @@ TRACY_API void ___tracy_emit_gpu_new_context_serial( ___tracy_gpu_new_context_da tracy::MemWrite( &item->gpuNewContext.context, data.context ); tracy::MemWrite( &item->gpuNewContext.flags, data.flags ); tracy::MemWrite( &item->gpuNewContext.type, data.type ); + +#ifdef TRACY_ON_DEMAND + tracy::GetProfiler().DeferItem( *item ); +#endif + tracy::Profiler::QueueSerialFinish(); } @@ -4780,6 +4816,11 @@ TRACY_API void ___tracy_emit_gpu_context_name_serial( const struct ___tracy_gpu_ tracy::MemWrite( &item->gpuContextNameFat.context, data.context ); tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); tracy::MemWrite( &item->gpuContextNameFat.size, data.len ); + +#ifdef TRACY_ON_DEMAND + tracy::GetProfiler().DeferItem( *item ); +#endif + tracy::Profiler::QueueSerialFinish(); } @@ -5000,6 +5041,14 @@ TRACY_API int32_t ___tracy_profiler_started( void ) } # endif +TRACY_API int ___tracy_begin_sampling_profiling( void ) { + return tracy::BeginSamplingProfiling() ? 
1 : 0; +} + +TRACY_API void ___tracy_end_sampling_profiling( void ) { + tracy::EndSamplingProfiling(); +} + #ifdef __cplusplus } #endif diff --git a/libs/tracy/client/TracyProfiler.hpp b/libs/tracy/client/TracyProfiler.hpp index 8d16905..aacfa16 100644 --- a/libs/tracy/client/TracyProfiler.hpp +++ b/libs/tracy/client/TracyProfiler.hpp @@ -58,6 +58,9 @@ TRACY_API bool IsProfilerStarted(); # define TracyIsStarted true #endif +TRACY_API bool BeginSamplingProfiling(); +TRACY_API void EndSamplingProfiling(); + class GpuCtx; class Profiler; class Socket; @@ -252,6 +255,9 @@ class Profiler #endif } + bool BeginSamplingProfiling(); + void EndSamplingProfiling(); + tracy_force_inline uint32_t GetNextZoneId() { return m_zoneId.fetch_add( 1, std::memory_order_relaxed ); @@ -720,6 +726,9 @@ class Profiler #ifdef TRACY_FIBERS static tracy_force_inline void EnterFiber( const char* fiber, int32_t groupHint ) { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif TracyQueuePrepare( QueueType::FiberEnter ); MemWrite( &item->fiberEnter.time, GetTime() ); MemWrite( &item->fiberEnter.fiber, (uint64_t)fiber ); @@ -729,6 +738,9 @@ class Profiler static tracy_force_inline void LeaveFiber() { +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; +#endif TracyQueuePrepare( QueueType::FiberLeave ); MemWrite( &item->fiberLeave.time, GetTime() ); TracyQueueCommit( fiberLeave ); @@ -991,7 +1003,6 @@ class Profiler double m_timerMul; uint64_t m_resolution; - uint64_t m_delay; std::atomic m_timeBegin; uint32_t m_mainThread; uint64_t m_epoch, m_exectime; @@ -1032,6 +1043,7 @@ class Profiler std::atomic m_isConnected; #ifdef TRACY_ON_DEMAND std::atomic m_connectionId; + std::atomic m_symbolsBusy; TracyMutex m_deferredLock; FastVector m_deferredQueue; diff --git a/libs/tracy/client/TracyRocprof.cpp b/libs/tracy/client/TracyRocprof.cpp new file mode 100644 index 0000000..370e42e --- /dev/null +++ b/libs/tracy/client/TracyRocprof.cpp @@ -0,0 +1,556 @@ +#include 
"../server/tracy_robin_hood.h" +#include "TracyProfiler.hpp" +#include "TracyThread.hpp" +#include "tracy/TracyC.h" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL( result, msg ) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if( CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS ) \ + { \ + std::string status_msg = rocprofiler_get_status_string( CHECKSTATUS ); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg << " failed with error code " \ + << CHECKSTATUS << ": " << status_msg << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" << status_msg \ + << ")"; \ + throw std::runtime_error( errmsg.str() ); \ + } \ + } + +namespace +{ + +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; + +struct DispatchData +{ + int64_t launch_start; + int64_t launch_end; + uint32_t thread_id; + uint16_t query_id; +}; + +struct ToolData +{ + uint32_t version; + const char* runtime_version; + uint32_t priority; + rocprofiler_client_id_t client_id; + uint8_t context_id; + bool init; + uint64_t query_id; + int64_t previous_cpu_time; + tracy::unordered_map client_kernels; + tracy::unordered_map dispatch_data; + tracy::unordered_set counter_names = { "SQ_WAVES", "GL2C_MISS", "GL2C_HIT" }; + std::unique_ptr cal_thread; + std::mutex mut{}; +}; + +using namespace tracy; + +rocprofiler_context_id_t& get_client_ctx() +{ + static rocprofiler_context_id_t ctx{ 0 }; + return ctx; +} + +const char* CTX_NAME = "rocprofv3"; + +uint8_t gpu_context_allocate( ToolData* data ) +{ + + timespec ts; + clock_gettime( CLOCK_BOOTTIME, &ts ); + uint64_t cpu_timestamp = Profiler::GetTime(); + uint64_t gpu_timestamp = ( (uint64_t)ts.tv_sec * 1000000000 ) + ts.tv_nsec; + float timestamp_period = 1.0f; + data->previous_cpu_time = cpu_timestamp; + + // Allocate the 
process-unique GPU context ID. There's a max of 255 available; + // if we are recreating devices a lot we may exceed that. Don't do that, or + // wrap around and get weird (but probably still usable) numbers. + uint8_t context_id = tracy::GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ); + if( context_id >= 255 ) + { + context_id %= 255; + } + + uint8_t context_flags = 0; +#ifdef TRACY_ROCPROF_CALIBRATION + // Tell tracy we'll be passing calibrated timestamps and not to mess with + // the times. We'll periodically send GpuCalibration events in case the + // times drift. + context_flags |= tracy::GpuContextCalibration; +#endif + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuNewContext ); + tracy::MemWrite( &item->gpuNewContext.cpuTime, cpu_timestamp ); + tracy::MemWrite( &item->gpuNewContext.gpuTime, gpu_timestamp ); + memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); + tracy::MemWrite( &item->gpuNewContext.period, timestamp_period ); + tracy::MemWrite( &item->gpuNewContext.context, context_id ); + tracy::MemWrite( &item->gpuNewContext.flags, context_flags ); + tracy::MemWrite( &item->gpuNewContext.type, tracy::GpuContextType::Rocprof ); + tracy::Profiler::QueueSerialFinish(); + } + + // Send the name of the context along. + // NOTE: Tracy will unconditionally free the name so we must clone it here. + // Since internally Tracy will use its own rpmalloc implementation we must + // make sure we allocate from the same source. 
+ size_t name_length = strlen( CTX_NAME ); + char* cloned_name = (char*)tracy::tracy_malloc( name_length ); + memcpy( cloned_name, CTX_NAME, name_length ); + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuContextName ); + tracy::MemWrite( &item->gpuContextNameFat.context, context_id ); + tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)cloned_name ); + tracy::MemWrite( &item->gpuContextNameFat.size, name_length ); + tracy::Profiler::QueueSerialFinish(); + } + + return context_id; +} + +uint64_t kernel_src_loc( ToolData* data, uint64_t kernel_id ) +{ + uint64_t src_loc = 0; + auto _lk = std::unique_lock{ data->mut }; + rocprofiler_kernel_id_t kid = kernel_id; + if( data->client_kernels.count( kid ) ) + { + auto& sym_data = data->client_kernels[kid]; + const char* name = sym_data.kernel_name; + size_t name_len = strlen( name ); + uint32_t line = 0; + src_loc = tracy::Profiler::AllocSourceLocation( line, NULL, 0, name, name_len, NULL, 0 ); + } + return src_loc; +} + +void record_interval( ToolData* data, rocprofiler_timestamp_t start_timestamp, rocprofiler_timestamp_t end_timestamp, + uint64_t src_loc, rocprofiler_dispatch_id_t dispatch_id ) +{ + + uint16_t query_id = 0; + uint8_t context_id = data->context_id; + + { + auto _lk = std::unique_lock{ data->mut }; + query_id = data->query_id; + data->query_id++; + if( dispatch_id != UINT64_MAX ) + { + DispatchData& dispatch_data = data->dispatch_data[dispatch_id]; + dispatch_data.query_id = query_id; + dispatch_data.thread_id = tracy::GetThreadHandle(); + } + } + + uint64_t cpu_start_time = 0, cpu_end_time = 0; + if( dispatch_id == UINT64_MAX ) + { + cpu_start_time = tracy::Profiler::GetTime(); + cpu_end_time = tracy::Profiler::GetTime(); + } + else + { + auto _lk = std::unique_lock{ data->mut }; + DispatchData& dispatch_data = data->dispatch_data[dispatch_id]; + cpu_start_time = dispatch_data.launch_start; + cpu_end_time = dispatch_data.launch_end; + } + 
+ if( src_loc != 0 ) + { + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)src_loc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id ); + tracy::MemWrite( &item->gpuZoneBegin.context, context_id ); + tracy::Profiler::QueueSerialFinish(); + } + } + else + { + static const ___tracy_source_location_data src_loc = { NULL, NULL, NULL, 0, 0 }; + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)&src_loc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id ); + tracy::MemWrite( &item->gpuZoneBegin.context, context_id ); + tracy::Profiler::QueueSerialFinish(); + } + } + + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime ); + tracy::MemWrite( &item->gpuTime.gpuTime, start_timestamp ); + tracy::MemWrite( &item->gpuTime.queryId, query_id ); + tracy::MemWrite( &item->gpuTime.context, context_id ); + tracy::Profiler::QueueSerialFinish(); + } + + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneEndSerial ); + tracy::MemWrite( &item->gpuZoneEnd.cpuTime, cpu_end_time ); + tracy::MemWrite( &item->gpuZoneEnd.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneEnd.queryId, query_id ); + tracy::MemWrite( &item->gpuZoneEnd.context, context_id ); + tracy::Profiler::QueueSerialFinish(); + } + + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, 
tracy::QueueType::GpuTime ); + tracy::MemWrite( &item->gpuTime.gpuTime, end_timestamp ); + tracy::MemWrite( &item->gpuTime.queryId, query_id ); + tracy::MemWrite( &item->gpuTime.context, context_id ); + tracy::Profiler::QueueSerialFinish(); + } +} + +void record_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data, + rocprofiler_record_counter_t* record_data, size_t record_count, + rocprofiler_user_data_t /*user_data*/, void* callback_data ) +{ + assert( callback_data != nullptr ); + ToolData* data = static_cast( callback_data ); + if( !data->init ) return; + + std::unordered_map sums; + for( size_t i = 0; i < record_count; ++i ) + { + auto _counter_id = rocprofiler_counter_id_t{}; + ROCPROFILER_CALL( rocprofiler_query_record_counter_id( record_data[i].id, &_counter_id ), + "query record counter id" ); + sums[_counter_id.handle] += record_data[i].counter_value; + } + + uint16_t query_id = 0; + uint32_t thread_id = 0; + { + auto _lk = std::unique_lock{ data->mut }; + // An assumption is made here that the counter values are supplied after the dispatch + // complete callback. + assert( data->dispatch_data.count( dispatch_data.dispatch_info.dispatch_id ) ); + DispatchData& ddata = data->dispatch_data[dispatch_data.dispatch_info.dispatch_id]; + query_id = ddata.query_id; + thread_id = ddata.thread_id; + } + + for( auto& p : sums ) + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneAnnotation ); + tracy::MemWrite( &item->zoneAnnotation.noteId, p.first ); + tracy::MemWrite( &item->zoneAnnotation.queryId, query_id ); + tracy::MemWrite( &item->zoneAnnotation.thread, thread_id ); + tracy::MemWrite( &item->zoneAnnotation.value, p.second ); + tracy::MemWrite( &item->zoneAnnotation.context, data->context_id ); + tracy::Profiler::QueueSerialFinish(); + } +} + +/** + * Callback from rocprofiler when an kernel dispatch is enqueued into the HSA queue. 
+ * rocprofiler_counter_config_id_t* is a return to specify what counters to collect + * for this dispatch (dispatch_packet). + */ +void dispatch_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data, + rocprofiler_profile_config_id_t* config, rocprofiler_user_data_t* /*user_data*/, + void* callback_data ) +{ + assert( callback_data != nullptr ); + ToolData* data = static_cast( callback_data ); + if( !data->init ) return; + + /** + * This simple example uses the same profile counter set for all agents. + * We store this in a cache to prevent constructing many identical profile counter + * sets. We first check the cache to see if we have already constructed a counter" + * set for the agent. If we have, return it. Otherwise, construct a new profile counter + * set. + */ + static std::shared_mutex m_mutex = {}; + static std::unordered_map profile_cache = {}; + + auto search_cache = [&]() + { + if( auto pos = profile_cache.find( dispatch_data.dispatch_info.agent_id.handle ); pos != profile_cache.end() ) + { + *config = pos->second; + return true; + } + return false; + }; + + { + auto rlock = std::shared_lock{ m_mutex }; + if( search_cache() ) return; + } + + auto wlock = std::unique_lock{ m_mutex }; + if( search_cache() ) return; + + // GPU Counter IDs + std::vector gpu_counters; + + // Iterate through the agents and get the counters available on that agent + ROCPROFILER_CALL( + rocprofiler_iterate_agent_supported_counters( + dispatch_data.dispatch_info.agent_id, + []( rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, void* user_data ) + { + std::vector* vec = + static_cast*>( user_data ); + for( size_t i = 0; i < num_counters; i++ ) + { + vec->push_back( counters[i] ); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast( &gpu_counters ) ), + "Could not fetch supported counters" ); + + std::vector collect_counters; + collect_counters.reserve( data->counter_names.size() ); + // Look for the counters contained in 
counters_to_collect in gpu_counters + for( auto& counter : gpu_counters ) + { + rocprofiler_counter_info_v0_t info; + ROCPROFILER_CALL( + rocprofiler_query_counter_info( counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast( &info ) ), + "Could not query info" ); + if( data->counter_names.count( std::string( info.name ) ) > 0 ) + { + collect_counters.push_back( counter ); + + size_t name_length = strlen( info.name ); + char* cloned_name = (char*)tracy::tracy_malloc( name_length ); + memcpy( cloned_name, info.name, name_length ); + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuAnnotationName ); + tracy::MemWrite( &item->gpuAnnotationNameFat.context, data->context_id ); + tracy::MemWrite( &item->gpuAnnotationNameFat.noteId, counter.handle ); + tracy::MemWrite( &item->gpuAnnotationNameFat.ptr, (uint64_t)cloned_name ); + tracy::MemWrite( &item->gpuAnnotationNameFat.size, name_length ); + tracy::Profiler::QueueSerialFinish(); + } + } + } + + // Create a colleciton profile for the counters + rocprofiler_profile_config_id_t profile = { .handle = 0 }; + ROCPROFILER_CALL( rocprofiler_create_profile_config( dispatch_data.dispatch_info.agent_id, collect_counters.data(), + collect_counters.size(), &profile ), + "Could not construct profile cfg" ); + + profile_cache.emplace( dispatch_data.dispatch_info.agent_id.handle, profile ); + // Return the profile to collect those counters for this dispatch + *config = profile; +} + +void tool_callback_tracing_callback( rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, + void* callback_data ) +{ + assert( callback_data != nullptr ); + ToolData* data = static_cast( callback_data ); + if( !data->init ) return; + + if( record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER ) + { + auto* sym_data = static_cast( record.payload ); + + if( record.phase == 
ROCPROFILER_CALLBACK_PHASE_LOAD ) + { + auto _lk = std::unique_lock{ data->mut }; + data->client_kernels.emplace( sym_data->kernel_id, *sym_data ); + } + else if( record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD ) + { + auto _lk = std::unique_lock{ data->mut }; + data->client_kernels.erase( sym_data->kernel_id ); + } + } + else if( record.kind == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH ) + { + auto* rdata = static_cast( record.payload ); + if( record.operation == ROCPROFILER_KERNEL_DISPATCH_ENQUEUE ) + { + if( record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER ) + { + auto _lk = std::unique_lock{ data->mut }; + data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_start = tracy::Profiler::GetTime(); + } + else if( record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT ) + { + auto _lk = std::unique_lock{ data->mut }; + data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_end = tracy::Profiler::GetTime(); + } + } + else if( record.operation == ROCPROFILER_KERNEL_DISPATCH_COMPLETE ) + { + uint64_t src_loc = kernel_src_loc( data, rdata->dispatch_info.kernel_id ); + record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc, + rdata->dispatch_info.dispatch_id ); + } + } + else if( record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY && + record.operation != ROCPROFILER_MEMORY_COPY_NONE && record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT ) + { + auto* rdata = static_cast( record.payload ); + const char* name = nullptr; + switch( record.operation ) + { + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE: + name = "DeviceToDeviceCopy"; + break; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST: + name = "DeviceToHostCopy"; + break; + case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE: + name = "HostToDeviceCopy"; + break; + case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST: + name = "HostToHostCopy"; + break; + } + size_t name_len = strlen( name ); + uint64_t src_loc = tracy::Profiler::AllocSourceLocation( 0, NULL, 0, name, name_len, NULL, 0 ); + record_interval( 
data, rdata->start_timestamp, rdata->end_timestamp, src_loc, UINT64_MAX ); + } +} + +void calibration_thread( void* ptr ) +{ + while( !TracyIsStarted ) + ; + ToolData* data = static_cast( ptr ); + data->context_id = gpu_context_allocate( data ); + const char* user_counters = GetEnvVar( "TRACY_ROCPROF_COUNTERS" ); + if( user_counters ) + { + data->counter_names.clear(); + std::stringstream ss( user_counters ); + std::string counter; + while( std::getline( ss, counter, ',' ) ) data->counter_names.insert( counter ); + } + data->init = true; + +#ifdef TRACY_ROCPROF_CALIBRATION + while( data->init ) + { + sleep( 1 ); + + timespec ts; + // HSA performs a linear interpolation of GPU time to CLOCK_BOOTTIME. However, this is + // subject to network time updates and can drift relative to tracy's clock. + clock_gettime( CLOCK_BOOTTIME, &ts ); + int64_t cpu_timestamp = Profiler::GetTime(); + int64_t gpu_timestamp = ts.tv_nsec + ts.tv_sec * 1e9L; + + if( cpu_timestamp > data->previous_cpu_time ) + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration ); + tracy::MemWrite( &item->gpuCalibration.gpuTime, gpu_timestamp ); + tracy::MemWrite( &item->gpuCalibration.cpuTime, cpu_timestamp ); + tracy::MemWrite( &item->gpuCalibration.cpuDelta, cpu_timestamp - data->previous_cpu_time ); + tracy::MemWrite( &item->gpuCalibration.context, data->context_id ); + tracy::Profiler::QueueSerialFinish(); + data->previous_cpu_time = cpu_timestamp; + } + } +#endif +} + +int tool_init( rocprofiler_client_finalize_t fini_func, void* user_data ) +{ + ToolData* data = static_cast( user_data ); + data->cal_thread = std::make_unique( calibration_thread, data ); + + ROCPROFILER_CALL( rocprofiler_create_context( &get_client_ctx() ), "context creation failed" ); + + ROCPROFILER_CALL( rocprofiler_configure_callback_dispatch_counting_service( get_client_ctx(), dispatch_callback, + user_data, record_callback, user_data ), + "Could not setup 
counting service" ); + + rocprofiler_tracing_operation_t ops[] = { ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER }; + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(), + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ops, 1, + tool_callback_tracing_callback, user_data ), + "callback tracing service failed to configure" ); + + rocprofiler_tracing_operation_t ops2[] = { ROCPROFILER_KERNEL_DISPATCH_COMPLETE, + ROCPROFILER_KERNEL_DISPATCH_ENQUEUE }; + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service( get_client_ctx(), ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, + ops2, 2, tool_callback_tracing_callback, user_data ), + "callback tracing service failed to configure" ); + + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(), + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, nullptr, + 0, tool_callback_tracing_callback, user_data ), + "callback tracing service failed to configure" ); + + ROCPROFILER_CALL( rocprofiler_start_context( get_client_ctx() ), "start context" ); + return 0; +} + +void tool_fini( void* tool_data_v ) +{ + rocprofiler_stop_context( get_client_ctx() ); + + ToolData* data = static_cast( tool_data_v ); + data->init = false; + data->cal_thread.reset(); +} +} + +extern "C" +{ + rocprofiler_tool_configure_result_t* rocprofiler_configure( uint32_t version, const char* runtime_version, + uint32_t priority, rocprofiler_client_id_t* client_id ) + { + // If not the first tool to register, indicate that the tool doesn't want to do anything + if( priority > 0 ) return nullptr; + + // (optional) Provide a name for this tool to rocprofiler + client_id->name = "Tracy"; + + // (optional) create configure data + static ToolData data = ToolData{ version, runtime_version, priority, *client_id, 0, false, 0, 0 }; + + // construct configure result + static auto cfg = rocprofiler_tool_configure_result_t{ sizeof( rocprofiler_tool_configure_result_t ), + &tool_init, &tool_fini, static_cast( 
&data ) }; + + return &cfg; + } +} diff --git a/libs/tracy/client/TracyScoped.hpp b/libs/tracy/client/TracyScoped.hpp index 7f9256d..c2f7eda 100644 --- a/libs/tracy/client/TracyScoped.hpp +++ b/libs/tracy/client/TracyScoped.hpp @@ -12,6 +12,12 @@ #include "TracyProfiler.hpp" #include "TracyCallstack.hpp" +#if (defined(__GNUC__) || defined(__clang__)) +# define TRACY_ATTRIBUTE_FORMAT_PRINTF(fmt_idx, arg_idx) \ + __attribute__((format(printf, fmt_idx, arg_idx))) +#else +# define TRACY_ATTRIBUTE_FORMAT_PRINTF(fmt_idx, arg_idx) +#endif namespace tracy { @@ -99,7 +105,7 @@ class ScopedZone TracyQueueCommit( zoneTextFatThread ); } - void TextFmt( const char* fmt, ... ) + void TextFmt( const char* fmt, ... ) TRACY_ATTRIBUTE_FORMAT_PRINTF(2, 3) { if( !m_active ) return; #ifdef TRACY_ON_DEMAND @@ -138,7 +144,7 @@ class ScopedZone TracyQueueCommit( zoneTextFatThread ); } - void NameFmt( const char* fmt, ... ) + void NameFmt( const char* fmt, ... ) TRACY_ATTRIBUTE_FORMAT_PRINTF(2, 3) { if( !m_active ) return; #ifdef TRACY_ON_DEMAND diff --git a/libs/tracy/client/TracySysTime.cpp b/libs/tracy/client/TracySysTime.cpp index b690a91..cf7dd9b 100644 --- a/libs/tracy/client/TracySysTime.cpp +++ b/libs/tracy/client/TracySysTime.cpp @@ -4,6 +4,7 @@ # if defined _WIN32 # include +# include "../common/TracyWinFamily.hpp" # elif defined __linux__ # include # include @@ -27,13 +28,24 @@ static inline uint64_t ConvertTime( const FILETIME& t ) void SysTime::ReadTimes() { - FILETIME idleTime; FILETIME kernelTime; FILETIME userTime; +# if defined TRACY_GDK + FILETIME creationTime; + FILETIME exitTime; + + GetProcessTimes( GetCurrentProcess(), &creationTime, &exitTime, &kernelTime, &userTime ); + + idle = 0; +# else + FILETIME idleTime; + GetSystemTimes( &idleTime, &kernelTime, &userTime ); idle = ConvertTime( idleTime ); +# endif + const auto kernel = ConvertTime( kernelTime ); const auto user = ConvertTime( userTime ); used = kernel + user; diff --git a/libs/tracy/client/TracySysTrace.cpp 
b/libs/tracy/client/TracySysTrace.cpp index 8e7f613..e6bb356 100644 --- a/libs/tracy/client/TracySysTrace.cpp +++ b/libs/tracy/client/TracySysTrace.cpp @@ -527,25 +527,23 @@ void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const ch const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid ); if( phnd != INVALID_HANDLE_VALUE ) { - HMODULE modules[1024]; - DWORD needed; - if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 ) + MEMORY_BASIC_INFORMATION vmeminfo; + SIZE_T infosize = VirtualQueryEx( phnd, ptr, &vmeminfo, sizeof( vmeminfo ) ); + if( infosize == sizeof( vmeminfo ) ) { - const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) ); - for( DWORD i=0; i= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage ) + char buf2[1024]; + const auto modlen = _GetModuleBaseNameA( phnd, mod, buf2, 1024 ); + if( modlen != 0 ) { - char buf2[1024]; - const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 ); - if( modlen != 0 ) - { - threadName = CopyString( buf2, modlen ); - threadSent = true; - } + threadName = CopyString( buf2, modlen ); + threadSent = true; } } } @@ -759,6 +757,64 @@ static const char* ReadFile( const char* path ) return tmp; } +static const char* ReadFile( const char* base, const char* path ) +{ + const auto blen = strlen( base ); + const auto plen = strlen( path ); + + auto tmp = (char*)tracy_malloc( blen + plen + 1 ); + memcpy( tmp, base, blen ); + memcpy( tmp + blen, path, plen ); + tmp[blen+plen] = '\0'; + + auto res = ReadFile( tmp ); + tracy_free( tmp ); + return res; +} + +static char* GetTraceFsPath() +{ + int fd = open( "/proc/mounts", O_RDONLY ); + if( fd < 0 ) return nullptr; + + constexpr size_t BufSize = 64 * 1024; + auto tmp = (char*)tracy_malloc( BufSize ); + const auto cnt = read( fd, tmp, BufSize-1 ); + close( fd ); + if( cnt < 0 ) + { + tracy_free( tmp ); + return nullptr; 
+ } + tmp[cnt] = '\0'; + + auto ptr = tmp; + while( *ptr ) + { + if( strncmp( ptr, "tracefs ", 8 ) == 0 ) + { + ptr += 8; + auto end = ptr; + while( *end && *end != ' ' ) end++; + if( !*end ) + { + tracy_free( tmp ); + return nullptr; + } + const auto len = end - ptr; + auto ret = (char*)tracy_malloc( len+1 ); + memcpy( ret, ptr, len ); + ret[len] = '\0'; + return ret; + } + while( *ptr && *ptr != '\n' ) ptr++; + if( *ptr ) ptr++; + } + + tracy_free( tmp ); + return nullptr; +} + bool SysTraceStart( int64_t& samplingPeriod ) { #ifndef CLOCK_MONOTONIC_RAW @@ -773,14 +829,20 @@ bool SysTraceStart( int64_t& samplingPeriod ) TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel ); #endif + auto traceFsPath = GetTraceFsPath(); + if( !traceFsPath ) return false; + TracyDebug( "tracefs path: %s\n", traceFsPath ); + int switchId = -1, wakingId = -1, vsyncId = -1; - const auto switchIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" ); + const auto switchIdStr = ReadFile( traceFsPath, "/events/sched/sched_switch/id" ); if( switchIdStr ) switchId = atoi( switchIdStr ); - const auto wakingIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_waking/id" ); + const auto wakingIdStr = ReadFile( traceFsPath, "/events/sched/sched_waking/id" ); if( wakingIdStr ) wakingId = atoi( wakingIdStr ); - const auto vsyncIdStr = ReadFile( "/sys/kernel/debug/tracing/events/drm/drm_vblank_event/id" ); + const auto vsyncIdStr = ReadFile( traceFsPath, "/events/drm/drm_vblank_event/id" ); if( vsyncIdStr ) vsyncId = atoi( vsyncIdStr ); + tracy_free( traceFsPath ); + TracyDebug( "sched_switch id: %i\n", switchId ); TracyDebug( "sched_waking id: %i\n", wakingId ); TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); diff --git a/libs/tracy/client/TracySysTrace.hpp b/libs/tracy/client/TracySysTrace.hpp index 8c663cd..2a28e8b 100644 --- a/libs/tracy/client/TracySysTrace.hpp +++ b/libs/tracy/client/TracySysTrace.hpp @@ -2,8 +2,8 @@ #define __TRACYSYSTRACE_HPP__ #if 
!defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __linux__ ) -# include "../common/TracyUwp.hpp" -# ifndef TRACY_UWP +# include "../common/TracyWinFamily.hpp" +# if !defined TRACY_WIN32_NO_DESKTOP # define TRACY_HAS_SYSTEM_TRACING # endif #endif diff --git a/libs/tracy/client/tracy_rpmalloc.cpp b/libs/tracy/client/tracy_rpmalloc.cpp index 315a40f..c43b8ca 100644 --- a/libs/tracy/client/tracy_rpmalloc.cpp +++ b/libs/tracy/client/tracy_rpmalloc.cpp @@ -2780,7 +2780,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_huge_pages = 1; } -#if PLATFORM_WINDOWS +#if PLATFORM_WINDOWS && !defined TRACY_GDK if (_memory_config.enable_huge_pages) { HANDLE token = 0; size_t large_page_minimum = GetLargePageMinimum(); diff --git a/libs/tracy/common/TracyProtocol.hpp b/libs/tracy/common/TracyProtocol.hpp index 40cf5e6..ff38686 100644 --- a/libs/tracy/common/TracyProtocol.hpp +++ b/libs/tracy/common/TracyProtocol.hpp @@ -9,7 +9,7 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 74 }; +enum : uint32_t { ProtocolVersion = 76 }; enum : uint16_t { BroadcastVersion = 3 }; using lz4sz_t = uint32_t; @@ -95,7 +95,6 @@ struct WelcomeMessage double timerMul; int64_t initBegin; int64_t initEnd; - uint64_t delay; uint64_t resolution; uint64_t epoch; uint64_t exectime; diff --git a/libs/tracy/common/TracyQueue.hpp b/libs/tracy/common/TracyQueue.hpp index daef3ec..765c83c 100644 --- a/libs/tracy/common/TracyQueue.hpp +++ b/libs/tracy/common/TracyQueue.hpp @@ -61,6 +61,7 @@ enum class QueueType : uint8_t ThreadWakeup, GpuTime, GpuContextName, + GpuAnnotationName, CallstackFrameSize, SymbolInformation, ExternalNameMetadata, @@ -111,6 +112,7 @@ enum class QueueType : uint8_t SecondStringData, MemNamePayload, ThreadGroupHint, + GpuZoneAnnotation, StringData, ThreadName, PlotName, @@ -331,7 +333,7 @@ struct QueuePlotDataInt : public QueuePlotDataBase int64_t val; 
}; -struct QueuePlotDataFloat : public QueuePlotDataBase +struct QueuePlotDataFloat : public QueuePlotDataBase { float val; }; @@ -406,7 +408,8 @@ enum class GpuContextType : uint8_t Direct3D11, Metal, Custom, - CUDA + CUDA, + Rocprof }; enum GpuContextFlags : uint8_t @@ -446,6 +449,15 @@ struct QueueGpuZoneEnd uint8_t context; }; +struct QueueGpuZoneAnnotation +{ + int64_t noteId; + double value; + uint32_t thread; + uint16_t queryId; + uint8_t context; +}; + struct QueueGpuTime { int64_t gpuTime; @@ -467,7 +479,7 @@ struct QueueGpuTimeSync int64_t cpuTime; uint8_t context; }; - + struct QueueGpuContextName { uint8_t context; @@ -479,6 +491,18 @@ struct QueueGpuContextNameFat : public QueueGpuContextName uint16_t size; }; +struct QueueGpuAnnotationName +{ + int64_t noteId; + uint8_t context; +}; + +struct QueueGpuAnnotationNameFat : public QueueGpuAnnotationName +{ + uint64_t ptr; + uint16_t size; +}; + struct QueueMemNamePayload { uint64_t name; @@ -756,6 +780,8 @@ struct QueueItem QueueGpuTimeSync gpuTimeSync; QueueGpuContextName gpuContextName; QueueGpuContextNameFat gpuContextNameFat; + QueueGpuAnnotationName gpuAnnotationName; + QueueGpuAnnotationNameFat gpuAnnotationNameFat; QueueMemAlloc memAlloc; QueueMemFree memFree; QueueMemDiscard memDiscard; @@ -789,6 +815,7 @@ struct QueueItem QueueSourceCodeNotAvailable sourceCodeNotAvailable; QueueFiberEnter fiberEnter; QueueFiberLeave fiberLeave; + QueueGpuZoneAnnotation zoneAnnotation; }; }; #pragma pack( pop ) @@ -849,6 +876,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), sizeof( QueueHeader ) + sizeof( QueueGpuTime ), sizeof( QueueHeader ) + sizeof( QueueGpuContextName ), + sizeof( QueueHeader ) + sizeof( QueueGpuAnnotationName ), sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), sizeof( QueueHeader ), // ExternalNameMetadata - not for wire transfer @@ -900,6 +928,7 @@ static constexpr size_t 
QueueDataSize[] = { sizeof( QueueHeader ), // second string data sizeof( QueueHeader ) + sizeof( QueueMemNamePayload ), sizeof( QueueHeader ) + sizeof( QueueThreadGroupHint ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneAnnotation ), // GPU zone annotation // keep all QueueStringTransfer below sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // string data sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // thread name diff --git a/libs/tracy/common/TracySystem.cpp b/libs/tracy/common/TracySystem.cpp index a92a345..7696ca3 100644 --- a/libs/tracy/common/TracySystem.cpp +++ b/libs/tracy/common/TracySystem.cpp @@ -10,7 +10,7 @@ # endif # include # include -# include "TracyUwp.hpp" +# include "TracyWinFamily.hpp" #else # include # include @@ -137,7 +137,7 @@ TRACY_API void SetThreadName( const char* name ) TRACY_API void SetThreadNameWithHint( const char* name, int32_t groupHint ) { #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP static auto _SetThreadDescription = &::SetThreadDescription; # else static auto _SetThreadDescription = (t_SetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "SetThreadDescription" ); @@ -246,7 +246,7 @@ TRACY_API const char* GetThreadName( uint32_t id ) #endif #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP static auto _GetThreadDescription = &::GetThreadDescription; # else static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); diff --git a/libs/tracy/common/TracyVersion.hpp b/libs/tracy/common/TracyVersion.hpp index 93b6737..1b6fc48 100644 --- a/libs/tracy/common/TracyVersion.hpp +++ b/libs/tracy/common/TracyVersion.hpp @@ -6,8 +6,8 @@ namespace tracy namespace Version { enum { Major = 0 }; -enum { Minor = 12 }; -enum { Patch = 2 }; +enum { Minor = 13 }; +enum { Patch = 0 }; } } diff --git a/libs/tracy/common/TracyWinFamily.hpp b/libs/tracy/common/TracyWinFamily.hpp new 
file mode 100644 index 0000000..b601455 --- /dev/null +++ b/libs/tracy/common/TracyWinFamily.hpp @@ -0,0 +1,16 @@ +#ifndef __TRACYWINFAMILY_HPP__ +#define __TRACYWINFAMILY_HPP__ + +#ifdef _WIN32 +# include +# if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define TRACY_WIN32_NO_DESKTOP +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_GAMES) +# define TRACY_GDK +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +# define TRACY_UWP +# endif +# endif +#endif + +#endif diff --git a/libs/tracy/libbacktrace/macho.cpp b/libs/tracy/libbacktrace/macho.cpp index b9f0845..bb2e468 100644 --- a/libs/tracy/libbacktrace/macho.cpp +++ b/libs/tracy/libbacktrace/macho.cpp @@ -309,9 +309,9 @@ static const char * const dwarf_section_names[DEBUG_MAX] = "__debug_abbrev", "__debug_ranges", "__debug_str", - "", /* DEBUG_ADDR */ + "__debug_addr", "__debug_str_offs", - "", /* DEBUG_LINE_STR */ + "__debug_line_str", "__debug_rnglists" }; diff --git a/libs/tracy/tracy/Tracy.hpp b/libs/tracy/tracy/Tracy.hpp index 605d149..31289b8 100644 --- a/libs/tracy/tracy/Tracy.hpp +++ b/libs/tracy/tracy/Tracy.hpp @@ -149,10 +149,34 @@ #define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ) #define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, TRACY_CALLSTACK, active ) -#define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) -#define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true ) -#define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true ) -#define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) +#if defined(TRACY_ALLOW_SHADOW_WARNING) + #define SuppressVarShadowWarning(Expr) Expr +#elif defined(__clang__) + #define 
SuppressVarShadowWarning(Expr) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wshadow\"") \ + Expr \ + _Pragma("clang diagnostic pop") +#elif defined(__GNUC__) + #define SuppressVarShadowWarning(Expr) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wshadow\"") \ + Expr \ + _Pragma("GCC diagnostic pop") +#elif defined(_MSC_VER) + #define SuppressVarShadowWarning(Expr) \ + _Pragma("warning(push)") \ + _Pragma("warning(disable : 4456)") \ + Expr \ + _Pragma("warning(pop)") +#else + #define SuppressVarShadowWarning(Expr) Expr +#endif + +#define ZoneScoped SuppressVarShadowWarning( ZoneNamed( ___tracy_scoped_zone, true ) ) +#define ZoneScopedN( name ) SuppressVarShadowWarning( ZoneNamedN( ___tracy_scoped_zone, name, true ) ) +#define ZoneScopedC( color ) SuppressVarShadowWarning( ZoneNamedC( ___tracy_scoped_zone, color, true ) ) +#define ZoneScopedNC( name, color ) SuppressVarShadowWarning( ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) ) #define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ) #define ZoneTextV( varname, txt, size ) varname.Text( txt, size ) @@ -182,7 +206,7 @@ #define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, TracyFile, TracyLine, 0 }; return &srcloc; }() } #define LockableBase( type ) tracy::Lockable #define SharedLockableBase( type ) tracy::SharedLockable -#define LockMark( varname ) static constexpr tracy::SourceLocationData __tracy_lock_location_##__LINE__ { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; varname.Mark( &__tracy_lock_location_##__LINE__ ) +#define LockMark( varname ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_lock_location_,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; varname.Mark( &TracyConcat(__tracy_lock_location_,TracyLine) ) #define LockableName(
varname, txt, size ) varname.CustomName( txt, size ) #define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ) diff --git a/libs/tracy/tracy/TracyC.h b/libs/tracy/tracy/TracyC.h index 1b1373e..e77c01f 100644 --- a/libs/tracy/tracy/TracyC.h +++ b/libs/tracy/tracy/TracyC.h @@ -114,6 +114,9 @@ typedef const void* TracyCLockCtx; #define TracyCIsConnected 0 #define TracyCIsStarted 0 +#define TracyCBeginSamplingProfiling() 0 +#define TracyCEndSamplingProfiling() + #ifdef TRACY_FIBERS # define TracyCFiberEnter(fiber) # define TracyCFiberLeave @@ -367,6 +370,12 @@ TRACY_API void ___tracy_custom_name_lockable_ctx( struct __tracy_lockable_contex #define TracyCIsConnected ___tracy_connected() +TRACY_API int ___tracy_begin_sampling_profiler( void ); +TRACY_API void ___tracy_end_sampling_profiler( void ); + +#define TracyCBeginSamplingProfiling() ___tracy_begin_sampling_profiler() +#define TracyCEndSamplingProfiling() ___tracy_end_sampling_profiler() + #ifdef TRACY_FIBERS TRACY_API void ___tracy_fiber_enter( const char* fiber ); TRACY_API void ___tracy_fiber_leave( void ); diff --git a/libs/tracy/tracy/TracyVulkan.hpp b/libs/tracy/tracy/TracyVulkan.hpp index 7264318..429f299 100644 --- a/libs/tracy/tracy/TracyVulkan.hpp +++ b/libs/tracy/tracy/TracyVulkan.hpp @@ -16,6 +16,7 @@ #define TracyVkZoneC(c,x,y,z) #define TracyVkZoneTransient(c,x,y,z,w) #define TracyVkCollect(c,x) +#define TracyVkCollectHost(c) #define TracyVkNamedZoneS(c,x,y,z,w,a) #define TracyVkNamedZoneCS(c,x,y,z,w,v,a) @@ -256,7 +257,9 @@ class VkCtx #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) { - VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + cmdbuf ?
+ VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ) : + VK_FUNCTION_WRAPPER( vkResetQueryPool( m_device, m_query, 0, m_queryCount ) ); m_tail = head; m_oldCnt = 0; int64_t tgpu; @@ -325,7 +328,9 @@ class VkCtx } } - VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ); + cmdbuf ? + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ) : + VK_FUNCTION_WRAPPER( vkResetQueryPool( m_device, m_query, wrappedTail, cnt ) ); m_tail += cnt; } @@ -721,6 +726,7 @@ using TracyVkCtx = tracy::VkCtx*; # define TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) tracy::VkCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), cmdbuf, active ); #endif #define TracyVkCollect( ctx, cmdbuf ) ctx->Collect( cmdbuf ); +#define TracyVkCollectHost( ctx ) ctx->Collect( VK_NULL_HANDLE ); #ifdef TRACY_HAS_CALLSTACK # define TracyVkNamedZoneS( ctx, varname, cmdbuf, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, depth, active );