diff --git a/.gitattributes b/.gitattributes
index 352d5ac595e..8cb4c160be0 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -65,6 +65,7 @@ Makefile text
 *.asax text
 *.h text
+*.hpp text
 *.cpp text
 *.cc text
 *.cu text
diff --git a/.gitignore b/.gitignore
index cb46296bb19..e1485b5b8c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -224,6 +224,7 @@ bindings/python/cntk/cntk_py.py
 bindings/python/cntk/libs/
 bindings/python/cntk/cntk_py_wrap.cpp
 bindings/python/cntk/cntk_py_wrap.h
+bindings/python/cntk/VERSION
 bindings/python/dist/
 bindings/python/doc/cntk.*.rst
 bindings/python/doc/cntk.rst
@@ -332,7 +333,8 @@ Manual/.ipynb_checkpoints
 Examples/Text/LightRNN/LightRNN/*.so

 # other
-/packages
+packages/
 /CNTK.VC.db
 /CNTK.VC.VC.opendb
 /Local
+.vs/
diff --git a/CNTK.Common.props b/CNTK.Common.props
index 3eeeb3b5d1e..7abf7018555 100644
--- a/CNTK.Common.props
+++ b/CNTK.Common.props
@@ -30,7 +30,34 @@
     false
     true
-    2.3.1
+
+
+
+    2.4
+
+
+
+    false
+
+
+    false
+    true
+
+    $(BUILD_CNTK_VERSION)
+    true
+
+
+    $(CntkVersion)
+    $(CntkVersionBanner)+
+
+
+    $(CntkVersion)
     $(CntkComponentVersion)d
diff --git a/CNTK.Cpp.props b/CNTK.Cpp.props
index 0d33a3f5117..6dadd8f3ad6 100644
--- a/CNTK.Cpp.props
+++ b/CNTK.Cpp.props
@@ -3,19 +3,10 @@
-    8.0
-    7.5
-
-
-    "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"
-
-
-
-    "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"
-
+    9.0
     %ProgramW6432%\NVIDIA Corporation\NVSMI\nvml.dll
-    c:\local\bindrop\NVSMI\nvml.dll
+    c:\local\nvsmi9\NVSMI\nvml.dll
     false
     true
@@ -65,16 +56,22 @@
     MKL
-    $(MKLML_PATH)\include
+    $(MKL_PATH)\include
     USE_MKL
     Cntk.PerformanceProfiler-$(CntkComponentVersion).lib;$(ReaderLibs)
-    MKL-ML Library
-    $(MKLML_PATH)\lib
+    MKL Library
+    $(MKL_PATH)\lib
     mklml.lib
     mklml.dll
     $(MathLibraryPath)\*.dll
-    $(OutDir)mklml.lib;$(OutDir)libiomp5md.dll;
+    false
+
+    $(MathLinkLibrary);mkldnn.lib
+    $(MathDelayLoad);mkldnn.dll
     $(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;
@@ -109,31 +106,19 @@
     libprotobufd.lib
-
-    $(CUDA_PATH_V8_0)
-    cudart64_80.dll
-    cublas64_80.dll;cusparse64_80.dll;curand64_80.dll;$(CudaRuntimeDll)
+
+    $(CUDA_PATH_V9_0)
+    cudart64_90.dll
+    cublas64_90.dll;cusparse64_90.dll;curand64_90.dll;$(CudaRuntimeDll)
     $(CNTK_CUDA_CODEGEN_DEBUG)
     compute_30,sm_30
-
-    $(CNTK_CUDA_CODEGEN_RELEASE)
-    compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61
-
-
-
-    $(CUDA_PATH_V7_5)
-    cudart64_75.dll
-    cublas64_75.dll;cusparse64_75.dll;curand64_75.dll;$(CudaRuntimeDll)
-    $(CNTK_CUDA_CODEGEN_DEBUG)
-    compute_30,sm_30
-    $(CNTK_CUDA_CODEGEN_RELEASE)
-    compute_30,sm_30;compute_35,sm_35;compute_50,sm_50
+    compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70
@@ -144,11 +129,21 @@
     $(VCTargetsPath)\BuildCustomizations
+
+    v141
+
+
+
+    $(HALIDE_PATH)
+    $(HALIDE_PATH)\include;
+    $(HALIDE_PATH)\Release;
+    halide.lib
+
+
     $(DebugBuild)
-    v140
     Unicode
     $(ReleaseBuild)
     $(DebugBuild)
@@ -156,9 +151,10 @@
-    CNTK_COMPONENT_VERSION="$(CntkComponentVersion)"
+    CNTK_VERSION="$(CntkVersion)";CNTK_VERSION_BANNER="$(CntkVersionBanner)";CNTK_COMPONENT_VERSION="$(CntkComponentVersion)"
     %(PreprocessorDefinitions);HAS_MPI=1
+    %(PreprocessorDefinitions);CUDA_NO_HALF;__CUDA_NO_HALF_OPERATORS__
diff --git a/CNTK.sln b/CNTK.sln
index 2787d019300..6d11d62dd8f 100644
--- a/CNTK.sln
+++ b/CNTK.sln
@@ -1,21 +1,8 @@

 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 14
-VisualStudioVersion = 14.0.25420.1
+# Visual Studio 15
+VisualStudioVersion = 15.0.27130.2024
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "Source\CNTK\CNTK.vcxproj", "{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}"
-	ProjectSection(ProjectDependencies) = postProject
-		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
-		{4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
-		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
-		{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{D45DF403-6781-444E-B654-A96868C5BE68}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Reader Plugins", "Reader Plugins", "{33EBFE78-A1A8-4961-8938-92A271941F94}"
 EndProject
@@ -130,14 +117,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{19EE975B-2
 		Tests\EndToEndTests\Speech\DNN\DiscriminativePreTraining\macros.txt = Tests\EndToEndTests\Speech\DNN\DiscriminativePreTraining\macros.txt
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputationNetworkLib", "Source\ComputationNetworkLib\ComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SGDLib", "Source\SGDLib\SGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
-	ProjectSection(ProjectDependencies) = postProject
-		{4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979}
-		{16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{5E666C53-2D82-49C9-9127-3FDDC321C741}"
 	ProjectSection(SolutionItems) = preProject
 		Tests\EndToEndTests\ParallelTraining\SimpleMultiGPU.cntk = Tests\EndToEndTests\ParallelTraining\SimpleMultiGPU.cntk
@@ -208,8 +187,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Kaldi2Reader", "Kaldi2Reade
 		Source\Readers\Kaldi2Reader\utterancesourcemulti.h = Source\Readers\Kaldi2Reader\utterancesourcemulti.h
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SequenceTrainingLib", "Source\SequenceTrainingLib\SequenceTrainingLib.vcxproj", "{EAD17188-072C-4726-B840-A769C36DAD1B}"
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Truncated", "Truncated", "{88F85A64-105D-4CDA-8199-B7A312FC8A27}"
 	ProjectSection(SolutionItems) = preProject
 		Tests\EndToEndTests\Speech\LSTM\Truncated\baseline.cpu.txt = Tests\EndToEndTests\Speech\LSTM\Truncated\baseline.cpu.txt
@@ -272,17 +249,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DiscriminativePreTraining",
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "UnitTests", "UnitTests", "{6F19321A-65E7-4829-B00C-3886CD6C6EDE}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathTests", "Tests\UnitTests\MathTests\MathTests.vcxproj", "{4701E678-5E6F-470D-B348-9CD1A2C095D1}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ActionsLib", "Source\ActionsLib\ActionsLib.vcxproj", "{EB2BE26F-6BD4-4274-971F-86D080779DD1}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceTraining", "SequenceTraining", "{BB8B9FC5-C4B3-477F-80E2-665DC8E431BD}"
 	ProjectSection(SolutionItems) = preProject
 		Tests\EndToEndTests\Speech\DNN\SequenceTraining\add_layer.mel = Tests\EndToEndTests\Speech\DNN\SequenceTraining\add_layer.mel
@@ -347,94 +313,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Image", "Image", "{9BDFA4BE
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Speech", "Speech", "{3CE841C0-02E5-46DB-B401-6F8784880173}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ReaderTests", "Tests\UnitTests\ReaderTests\ReaderTests.vcxproj", "{A4FC3467-4787-43E8-BBC0-D79AE56B468D}"
-	ProjectSection(ProjectDependencies) = postProject
-		{9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {9BD0A711-0BBD-45B6-B81C-053F03C26CFB}
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
-		{7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
-		{7FE16CBE-B717-45C9-97FB-FA3191039568} = {7FE16CBE-B717-45C9-97FB-FA3191039568}
-		{7B7A51ED-AA8E-4660-A805-D50235A02120} = {7B7A51ED-AA8E-4660-A805-D50235A02120}
-		{E6646FFE-3588-4276-8A15-8D65C22711C1} = {E6646FFE-3588-4276-8A15-8D65C22711C1}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalDll", "Source\EvalDll\EvalDll.vcxproj", "{482999D1-B7E2-466E-9F8D-2119F93EAFD9}"
-	ProjectSection(ProjectDependencies) = postProject
-		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageWriterDll", "Source\ImageWriterDll\ImageWriterDll.vcxproj", "{2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}"
-	ProjectSection(ProjectDependencies) = postProject
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Math", "Source\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
-	ProjectSection(ProjectDependencies) = postProject
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathCUDA", "Source\Math\MathCUDA.vcxproj", "{B3DD765E-694E-4494-BAD7-37BBF2942517}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibSVMBinaryReader", "Source\Readers\LibSVMBinaryReader\LibSVMBinaryReader.vcxproj", "{D667AF32-028A-4A5D-BE19-F46776F0F6B2}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryReader", "Source\Readers\BinaryReader\BinaryReader.vcxproj", "{1D5787D4-52E4-45DB-951B-82F220EE0C6A}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DSSMReader", "Source\Readers\DSSMReader\DSSMReader.vcxproj", "{014DA766-B37B-4581-BC26-963EA5507931}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKMLFReader", "Source\Readers\HTKMLFReader\HTKMLFReader.vcxproj", "{33D2FD22-DEF2-4507-A58A-368F641AEBE5}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LMSequenceReader", "Source\Readers\LMSequenceReader\LMSequenceReader.vcxproj", "{9A2F2441-5972-4EA8-9215-4119FCE0FB68}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LUSequenceReader", "Source\Readers\LUSequenceReader\LUSequenceReader.vcxproj", "{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SparsePCReader", "Source\Readers\SparsePCReader\SparsePCReader.vcxproj", "{CE429AA2-3778-4619-8FD1-49BA3B81197B}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UCIFastReader", "Source\Readers\UCIFastReader\UCIFastReader.vcxproj", "{E6646FFE-3588-4276-8A15-8D65C22711C1}" - ProjectSection(ProjectDependencies) = postProject - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathPerformanceTests", "Tests\UnitTests\MathPerformanceTests\MathPerformanceTests.vcxproj", "{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}" - ProjectSection(ProjectDependencies) = 
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "EndToEndTests", "EndToEndTests", "{6E565B48-1923-49CE-9787-9BBB9D96F4C5}"
 	ProjectSection(SolutionItems) = preProject
 		Tests\EndToEndTests\run-test-common = Tests\EndToEndTests\run-test-common
@@ -563,20 +441,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tools", "Tools", "{83BFF5BF
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Extensibility", "Extensibility", "{60F87E25-BC87-4782-8E20-1621AAEBB113}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalWrapper", "Source\Extensibility\EvalWrapper\EvalWrapper.vcxproj", "{EF766CAE-9CB1-494C-9153-0030631A6340}"
-	ProjectSection(ProjectDependencies) = postProject
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Examples", "Examples", "{BD46CE02-3740-4526-80F6-CC7973B953E5}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Speech", "Speech", "{FB7AF7B9-6BEA-459F-94D9-94D53916D2B6}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ReaderLib", "Source\Readers\ReaderLib\ReaderLib.vcxproj", "{F0A9637C-20DA-42F0-83D4-23B4704DE602}"
-	ProjectSection(ProjectDependencies) = postProject
-		{4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "AN4", "AN4", "{AC7BA8D3-B4C8-42A4-8507-B359BB6D49E8}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "FeedForward", "FeedForward", "{A17AC914-C539-4B47-A80F-9BD25C64E2A0}"
@@ -603,32 +471,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{9F1F9C7C-2
 		Tests\EndToEndTests\Examples\Speech\AN4\LSTM\testcases.yml = Tests\EndToEndTests\Examples\Speech\AN4\LSTM\testcases.yml
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKBinaryReader", "Source\Readers\CNTKBinaryReader\CNTKBinaryReader.vcxproj", "{7FE16CBE-B717-45C9-97FB-FA3191039568}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKTextFormatReader", "Source\Readers\CNTKTextFormatReader\CNTKTextFormatReader.vcxproj", "{91973E60-A7BE-4C86-8FDB-59C88A0B3715}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKDeserializers", "Source\Readers\HTKDeserializers\HTKDeserializers.vcxproj", "{7B7A51ED-AA8E-4660-A805-D50235A02120}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageReader", "Source\Readers\ImageReader\ImageReader.vcxproj", "{9BD0A711-0BBD-45B6-B81C-053F03C26CFB}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ModelExport", "ModelExport", "{08A05A9A-4E45-42D5-83FA-719E99C04A30}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Model0", "Model0", "{715C0E2D-6FF6-4B26-9E49-1C68920CFAF6}"
@@ -662,17 +504,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "AlexNet", "AlexNet", "{2B10
 		Tests\EndToEndTests\Image\AlexNet\val_map.txt = Tests\EndToEndTests\Image\AlexNet\val_map.txt
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NetworkTests", "Tests\UnitTests\NetworkTests\NetworkTests.vcxproj", "{CDA96AA3-3252-4978-A0BF-2ACD670823CB}"
-	ProjectSection(ProjectDependencies) = postProject
-		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
-		{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Text", "Text", "{8656B71D-E24C-4AC2-8BE4-C07B415A3E15}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceClassification", "SequenceClassification", "{E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}"
@@ -716,15 +547,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{181664AC-4C9
 		Tests\EndToEndTests\Text\SLU\testcases.yml = Tests\EndToEndTests\Text\SLU\testcases.yml
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Common", "Source\Common\Common.vcxproj", "{86883653-8A61-4038-81A0-2379FAE4200A}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CompositeDataReader", "Source\Readers\CompositeDataReader\CompositeDataReader.vcxproj", "{7B7A563D-AA8E-4660-A805-D50235A02120}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SparseDSSM", "SparseDSSM", "{1FB54750-B668-4AC3-966F-ED504020AC06}"
 	ProjectSection(SolutionItems) = preProject
 		Tests\EndToEndTests\Text\SparseDSSM\baseline.cpu.txt = Tests\EndToEndTests\Text\SparseDSSM\baseline.cpu.txt
@@ -838,40 +660,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript"
 		Examples\SequenceToSequence\CMUDict\BrainScript\G2P.cntk = Examples\SequenceToSequence\CMUDict\BrainScript\G2P.cntk
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalTests", "Tests\UnitTests\EvalTests\EvalTests.vcxproj", "{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}"
-	ProjectSection(ProjectDependencies) = postProject
-		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CommandEval", "Tests\UnitTests\CommandEval\CommandEval.vcxproj", "{731312A8-6DA3-4841-AFCD-57520BA1BF8E}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKv2LibraryDll", "Source\CNTKv2LibraryDll\CNTKv2LibraryDll.vcxproj", "{E5606ECE-48CA-4464-BB12-09D81D02B9EF}"
-	ProjectSection(ProjectDependencies) = postProject
-		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
-		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
-		{2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D} = {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryTests", "Tests\UnitTests\V2LibraryTests\V2LibraryTests.vcxproj", "{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}"
-	ProjectSection(ProjectDependencies) = postProject
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
-		{7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120}
-		{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
-		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {E6F26F9A-FF64-4F0A-B749-CD309EE357EE}
-		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
-		{7B7A51ED-AA8E-4660-A805-D50235A02120} = {7B7A51ED-AA8E-4660-A805-D50235A02120}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Scripts", "Scripts", "{68263A2F-1D5F-4C46-B5AF-2304B80FC3D4}"
 	ProjectSection(SolutionItems) = preProject
 		Scripts\pytest.ini = Scripts\pytest.ini
@@ -879,19 +667,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Scripts", "Scripts", "{6826
 		Scripts\uci2ctf.py = Scripts\uci2ctf.py
 	EndProjectSection
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ManagedEvalTests", "Tests\UnitTests\ManagedEvalTests\ManagedEvalTests.csproj", "{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}"
-	ProjectSection(ProjectDependencies) = postProject
-		{EF766CAE-9CB1-494C-9153-0030631A6340} = {EF766CAE-9CB1-494C-9153-0030631A6340}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}"
-	ProjectSection(ProjectDependencies) = postProject
-		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
-		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
-	EndProjectSection
-EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TIMIT", "TIMIT", "{B586AA4C-0BB9-4629-9EDA-25FF2618AC9F}" EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TrainSimpleNetwork", "TrainSimpleNetwork", "{C2102C39-BF5F-4B12-9C41-849D1ED35EE8}" @@ -940,16 +715,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Truncated-Kaldi", "Truncate EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "EvalClientTests", "EvalClientTests", "{05E45AF7-C069-4057-BC16-0A532D068CE4}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalClientTest\CPPEvalClientTest.vcxproj", "{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}" - ProjectSection(ProjectDependencies) = postProject - {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9} - EndProjectSection -EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClientTest", "Tests\EndToEndTests\EvalClientTests\CSEvalClientTest\CSEvalClientTest.csproj", "{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}" - ProjectSection(ProjectDependencies) = postProject - {EF766CAE-9CB1-494C-9153-0030631A6340} = {EF766CAE-9CB1-494C-9153-0030631A6340} - EndProjectSection -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "IRMetric", "IRMetric", "{E844AB9A-A48F-4A99-9625-F528C5C46D83}" ProjectSection(SolutionItems) = preProject Tests\EndToEndTests\Text\IRMetric\baseline.linux.txt = Tests\EndToEndTests\Text\IRMetric\baseline.linux.txt @@ -959,31 +724,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "IRMetric", "IRMetric", "{E8 Tests\EndToEndTests\Text\IRMetric\testcases.yml = Tests\EndToEndTests\Text\IRMetric\testcases.yml EndProjectSection EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PythonBindings", "bindings\python\PythonBindings.vcxproj", "{CD721536-CFD3-413E-A3D7-FB0FAF989635}" - ProjectSection(ProjectDependencies) = postProject - {9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {9BD0A711-0BBD-45B6-B81C-053F03C26CFB} - {4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979} - {7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120} - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} - {20DEE94F-2802-40B1-B88B-22755A03AA48} = {20DEE94F-2802-40B1-B88B-22755A03AA48} - {4CF94A50-0D17-432A-8B5A-8458E91C44A6} = {4CF94A50-0D17-432A-8B5A-8458E91C44A6} - {91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715} - {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} - {1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "Source\Multiverso\src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MultiversoTests", "Source\Multiverso\Test\unittests\MultiversoTests.vcxproj", "{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}" - ProjectSection(ProjectDependencies) = postProject - {16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}" - ProjectSection(ProjectDependencies) = postProject - {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = 
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tutorials", "Tutorials", "{305456F0-D9DE-4452-87BE-1C9F3C34C14F}"
 	ProjectSection(SolutionItems) = preProject
 		Tutorials\CNTK_101_LogisticRegression.ipynb = Tutorials\CNTK_101_LogisticRegression.ipynb
@@ -1429,16 +1169,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "RNN", "RNN", "{6730F9BE-92A
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CSharp", "CSharp", "{1526F027-B007-472D-82E2-5A91340F3B62}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCSBinding", "bindings\csharp\Swig\CNTKLibraryCSBinding.vcxproj", "{277EBD9D-2504-49FA-AC72-59D5515130C3}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
-	EndProjectSection
-EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryManagedDll", "bindings\csharp\CNTKLibraryManagedDll\CNTKLibraryManagedDll.csproj", "{50EF9EE6-5018-453E-A063-F77044EF1A97}"
-	ProjectSection(ProjectDependencies) = postProject
-		{277EBD9D-2504-49FA-AC72-59D5515130C3} = {277EBD9D-2504-49FA-AC72-59D5515130C3}
-	EndProjectSection
-EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Video", "Video", "{2A95B23C-D91E-4DF9-B8F0-5E997608AB65}"
 	ProjectSection(ProjectDependencies) = postProject
 		{277EBD9D-2504-49FA-AC72-59D5515130C3} = {277EBD9D-2504-49FA-AC72-59D5515130C3}
@@ -1454,14 +1184,361 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "UCF11", "UCF11", "{5EDBCD1A
 		Examples\Video\DataSets\UCF11\ucf11_utils.py = Examples\Video\DataSets\UCF11\ucf11_utils.py
 	EndProjectSection
 EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GettingStarted", "GettingStarted", "{39C3C8CA-9A8A-4733-ADBB-3E19D0F52528}"
-	ProjectSection(SolutionItems) = preProject
-		Examples\Video\GettingStarted\README.md = Examples\Video\GettingStarted\README.md
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GettingStarted", "GettingStarted", "{39C3C8CA-9A8A-4733-ADBB-3E19D0F52528}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Video\GettingStarted\README.md = Examples\Video\GettingStarted\README.md
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Python", "Python", "{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Video\GettingStarted\Python\Conv3D_UCF11.py = Examples\Video\GettingStarted\Python\Conv3D_UCF11.py
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "V2LibraryTests", "V2LibraryTests", "{43ED3FD0-824C-4201-BD96-B824DF959ADC}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GoogLeNet", "GoogLeNet", "{789B4AB8-40F1-4A37-823A-BC20D80C8BF1}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Classification\GoogLeNet\README.md = Examples\Image\Classification\GoogLeNet\README.md
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BN-Inception", "BN-Inception", "{CE223840-1DEE-4849-B530-F06BEE05BAA8}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Classification\GoogLeNet\BN-Inception\README.md = Examples\Image\Classification\GoogLeNet\BN-Inception\README.md
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "InceptionV3", "InceptionV3", "{824766FA-759A-4466-9C39-13200D2D3159}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Classification\GoogLeNet\InceptionV3\README.md = Examples\Image\Classification\GoogLeNet\InceptionV3\README.md
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript", "{BD07C9F3-B10C-4C21-82BC-4F249B65DDFE}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionBlocks.bs = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionBlocks.bs
+		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.bs = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.bs
+		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.cntk = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.cntk
+		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\README.md = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\README.md
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript", "{5CC403B9-2405-4FFB-A73B-DAE0DC986C76}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.bs
+		Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.cntk = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.cntk
+		Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Extensibility", "Extensibility", "{3BF56127-6F0F-41CF-BFCE-31165A0A5E73}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CPP", "CPP", "{7A27E076-296E-41A8-BA76-164071251372}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "FCN", "FCN", "{58E3A257-91BE-4DC7-8991-70BFABE0A671}"
+	ProjectSection(SolutionItems) = preProject
+		Tests\EndToEndTests\Image\FCN\baseline.txt = Tests\EndToEndTests\Image\FCN\baseline.txt
+		Tests\EndToEndTests\Image\FCN\fcn.cntk = Tests\EndToEndTests\Image\FCN\fcn.cntk
+		Tests\EndToEndTests\Image\FCN\fcn8_to_fcn4.mel = Tests\EndToEndTests\Image\FCN\fcn8_to_fcn4.mel
+		Tests\EndToEndTests\Image\FCN\prepare_for_test.mel = Tests\EndToEndTests\Image\FCN\prepare_for_test.mel
+		Tests\EndToEndTests\Image\FCN\run-test = Tests\EndToEndTests\Image\FCN\run-test
+		Tests\EndToEndTests\Image\FCN\shared.bs = Tests\EndToEndTests\Image\FCN\shared.bs
+		Tests\EndToEndTests\Image\FCN\testcases.yml = Tests\EndToEndTests\Image\FCN\testcases.yml
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SaveBestModelPerCriterion", "SaveBestModelPerCriterion", "{C1189678-4FFA-4258-971F-3262B44FCA99}"
+	ProjectSection(SolutionItems) = preProject
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.cpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.cpu.txt
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.gpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.gpu.txt
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.cpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.cpu.txt
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.gpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.gpu.txt
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\cntkcv.cntk = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\cntkcv.cntk
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\run-test = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\run-test
+		Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\testcases.yml = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\testcases.yml
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Java", "Java", "{F37067BD-8BB1-4F93-AEF4-F37434613AE4}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "selectivesearch", "selectivesearch", "{BEF04803-47B4-4322-B9D7-E10A8468E79F}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Detection\FastRCNN\selectivesearch\__init__.py = Examples\Image\Detection\FastRCNN\selectivesearch\__init__.py
+		Examples\Image\Detection\FastRCNN\selectivesearch\README.md = Examples\Image\Detection\FastRCNN\selectivesearch\README.md
+		Examples\Image\Detection\FastRCNN\selectivesearch\selectivesearch.py = Examples\Image\Detection\FastRCNN\selectivesearch\selectivesearch.py
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "fastRCNN", "fastRCNN", "{C28E4FD7-F9A9-4473-8E5D-D209AF36A1E7}"
+	ProjectSection(SolutionItems) = preProject
+		Examples\Image\Detection\FastRCNN\fastRCNN\__init__.py = Examples\Image\Detection\FastRCNN\fastRCNN\__init__.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\imdb.py = Examples\Image\Detection\FastRCNN\fastRCNN\imdb.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\nms.py = Examples\Image\Detection\FastRCNN\fastRCNN\nms.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\pascal_voc.py = Examples\Image\Detection\FastRCNN\fastRCNN\pascal_voc.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\test.py = Examples\Image\Detection\FastRCNN\fastRCNN\test.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\timer.py = Examples\Image\Detection\FastRCNN\fastRCNN\timer.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\train_svms.py = Examples\Image\Detection\FastRCNN\fastRCNN\train_svms.py
+		Examples\Image\Detection\FastRCNN\fastRCNN\voc_eval.py = Examples\Image\Detection\FastRCNN\fastRCNN\voc_eval.py
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CNTKv2CSharp", "CNTKv2CSharp", "{B3B46744-DBB5-42C2-BAD7-9151D9486045}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ProposalLayer", "ProposalLayer", "{3631994A-59E6-4CD6-99A4-6D332F8DABE2}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "Source\CNTK\CNTK.vcxproj", "{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}"
+	ProjectSection(ProjectDependencies) = postProject
+		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
+		{4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+		{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
+		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
+		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
+		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
+		{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputationNetworkLib", "Source\ComputationNetworkLib\ComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SGDLib", "Source\SGDLib\SGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
+	ProjectSection(ProjectDependencies) = postProject
+		{4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979}
+		{16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SequenceTrainingLib", "Source\SequenceTrainingLib\SequenceTrainingLib.vcxproj", "{EAD17188-072C-4726-B840-A769C36DAD1B}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathTests", "Tests\UnitTests\MathTests\MathTests.vcxproj", "{4701E678-5E6F-470D-B348-9CD1A2C095D1}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ActionsLib", "Source\ActionsLib\ActionsLib.vcxproj", "{EB2BE26F-6BD4-4274-971F-86D080779DD1}"
+	ProjectSection(ProjectDependencies) = postProject
+		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ReaderTests", "Tests\UnitTests\ReaderTests\ReaderTests.vcxproj", "{A4FC3467-4787-43E8-BBC0-D79AE56B468D}"
+	ProjectSection(ProjectDependencies) = postProject
+		{9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {9BD0A711-0BBD-45B6-B81C-053F03C26CFB}
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
+		{7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+		{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
+		{7FE16CBE-B717-45C9-97FB-FA3191039568} = {7FE16CBE-B717-45C9-97FB-FA3191039568}
+		{7B7A51ED-AA8E-4660-A805-D50235A02120} = {7B7A51ED-AA8E-4660-A805-D50235A02120}
+		{E6646FFE-3588-4276-8A15-8D65C22711C1} = {E6646FFE-3588-4276-8A15-8D65C22711C1}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalDll", "Source\EvalDll\EvalDll.vcxproj", "{482999D1-B7E2-466E-9F8D-2119F93EAFD9}"
+	ProjectSection(ProjectDependencies) = postProject
+		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
+		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
+		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Math", "Source\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
+	ProjectSection(ProjectDependencies) = postProject
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
+	EndProjectSection
+EndProject
= "MathCUDA", "Source\Math\MathCUDA.vcxproj", "{B3DD765E-694E-4494-BAD7-37BBF2942517}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibSVMBinaryReader", "Source\Readers\LibSVMBinaryReader\LibSVMBinaryReader.vcxproj", "{D667AF32-028A-4A5D-BE19-F46776F0F6B2}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryReader", "Source\Readers\BinaryReader\BinaryReader.vcxproj", "{1D5787D4-52E4-45DB-951B-82F220EE0C6A}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DSSMReader", "Source\Readers\DSSMReader\DSSMReader.vcxproj", "{014DA766-B37B-4581-BC26-963EA5507931}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKMLFReader", "Source\Readers\HTKMLFReader\HTKMLFReader.vcxproj", "{33D2FD22-DEF2-4507-A58A-368F641AEBE5}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LMSequenceReader", "Source\Readers\LMSequenceReader\LMSequenceReader.vcxproj", "{9A2F2441-5972-4EA8-9215-4119FCE0FB68}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LUSequenceReader", "Source\Readers\LUSequenceReader\LUSequenceReader.vcxproj", "{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SparsePCReader", "Source\Readers\SparsePCReader\SparsePCReader.vcxproj", "{CE429AA2-3778-4619-8FD1-49BA3B81197B}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UCIFastReader", "Source\Readers\UCIFastReader\UCIFastReader.vcxproj", "{E6646FFE-3588-4276-8A15-8D65C22711C1}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathPerformanceTests", 
"Tests\UnitTests\MathPerformanceTests\MathPerformanceTests.vcxproj", "{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalWrapper", "Source\Extensibility\EvalWrapper\EvalWrapper.vcxproj", "{EF766CAE-9CB1-494C-9153-0030631A6340}" + ProjectSection(ProjectDependencies) = postProject + {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ReaderLib", "Source\Readers\ReaderLib\ReaderLib.vcxproj", "{F0A9637C-20DA-42F0-83D4-23B4704DE602}" + ProjectSection(ProjectDependencies) = postProject + {4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKBinaryReader", "Source\Readers\CNTKBinaryReader\CNTKBinaryReader.vcxproj", "{7FE16CBE-B717-45C9-97FB-FA3191039568}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKTextFormatReader", "Source\Readers\CNTKTextFormatReader\CNTKTextFormatReader.vcxproj", "{91973E60-A7BE-4C86-8FDB-59C88A0B3715}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKDeserializers", "Source\Readers\HTKDeserializers\HTKDeserializers.vcxproj", "{7B7A51ED-AA8E-4660-A805-D50235A02120}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageReader", "Source\Readers\ImageReader\ImageReader.vcxproj", "{9BD0A711-0BBD-45B6-B81C-053F03C26CFB}" + ProjectSection(ProjectDependencies) = postProject + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NetworkTests", "Tests\UnitTests\NetworkTests\NetworkTests.vcxproj", "{CDA96AA3-3252-4978-A0BF-2ACD670823CB}" + ProjectSection(ProjectDependencies) = postProject + {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + {91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715} + {EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1} + {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602} + {EAD17188-072C-4726-B840-A769C36DAD1B} = 
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Common", "Source\Common\Common.vcxproj", "{86883653-8A61-4038-81A0-2379FAE4200A}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CompositeDataReader", "Source\Readers\CompositeDataReader\CompositeDataReader.vcxproj", "{7B7A563D-AA8E-4660-A805-D50235A02120}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalTests", "Tests\UnitTests\EvalTests\EvalTests.vcxproj", "{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}"
+	ProjectSection(ProjectDependencies) = postProject
+		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CommandEval", "Tests\UnitTests\CommandEval\CommandEval.vcxproj", "{731312A8-6DA3-4841-AFCD-57520BA1BF8E}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKv2LibraryDll", "Source\CNTKv2LibraryDll\CNTKv2LibraryDll.vcxproj", "{E5606ECE-48CA-4464-BB12-09D81D02B9EF}"
+	ProjectSection(ProjectDependencies) = postProject
+		{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
+		{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
+		{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryTests", "Tests\UnitTests\V2LibraryTests\V2LibraryTests.vcxproj", "{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}"
+	ProjectSection(ProjectDependencies) = postProject
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
+		{7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120}
+		{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
+		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {E6F26F9A-FF64-4F0A-B749-CD309EE357EE}
+		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
+		{7B7A51ED-AA8E-4660-A805-D50235A02120} = {7B7A51ED-AA8E-4660-A805-D50235A02120}
+	EndProjectSection
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ManagedEvalTests", "Tests\UnitTests\ManagedEvalTests\ManagedEvalTests.csproj", "{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}"
+	ProjectSection(ProjectDependencies) = postProject
+		{EF766CAE-9CB1-494C-9153-0030631A6340} = {EF766CAE-9CB1-494C-9153-0030631A6340}
+	EndProjectSection
+EndProject
"Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}" + ProjectSection(ProjectDependencies) = postProject + {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + {EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalClientTest\CPPEvalClientTest.vcxproj", "{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}" + ProjectSection(ProjectDependencies) = postProject + {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9} + EndProjectSection +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClientTest", "Tests\EndToEndTests\EvalClientTests\CSEvalClientTest\CSEvalClientTest.csproj", "{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}" + ProjectSection(ProjectDependencies) = postProject + {EF766CAE-9CB1-494C-9153-0030631A6340} = {EF766CAE-9CB1-494C-9153-0030631A6340} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PythonBindings", "bindings\python\PythonBindings.vcxproj", "{CD721536-CFD3-413E-A3D7-FB0FAF989635}" + ProjectSection(ProjectDependencies) = postProject + {9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {9BD0A711-0BBD-45B6-B81C-053F03C26CFB} + {4B442D34-641A-4B37-9A4B-D18DBE28A979} = {4B442D34-641A-4B37-9A4B-D18DBE28A979} + {7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120} + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} + {20DEE94F-2802-40B1-B88B-22755A03AA48} = {20DEE94F-2802-40B1-B88B-22755A03AA48} + {4CF94A50-0D17-432A-8B5A-8458E91C44A6} = {4CF94A50-0D17-432A-8B5A-8458E91C44A6} + {91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715} + {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} + {1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "Source\Multiverso\src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MultiversoTests", "Source\Multiverso\Test\unittests\MultiversoTests.vcxproj", "{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}" + ProjectSection(ProjectDependencies) = postProject + {16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}" + ProjectSection(ProjectDependencies) = postProject + {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCSBinding", "bindings\csharp\Swig\CNTKLibraryCSBinding.vcxproj", "{277EBD9D-2504-49FA-AC72-59D5515130C3}" + ProjectSection(ProjectDependencies) = postProject + {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Python", "Python", "{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B}" - 
-	ProjectSection(SolutionItems) = preProject
-		Examples\Video\GettingStarted\Python\Conv3D_UCF11.py = Examples\Video\GettingStarted\Python\Conv3D_UCF11.py
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryManagedDll", "bindings\csharp\CNTKLibraryManagedDll\CNTKLibraryManagedDll.csproj", "{50EF9EE6-5018-453E-A063-F77044EF1A97}"
+	ProjectSection(ProjectDependencies) = postProject
+		{277EBD9D-2504-49FA-AC72-59D5515130C3} = {277EBD9D-2504-49FA-AC72-59D5515130C3}
 	EndProjectSection
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PerformanceProfilerDll", "Source\PerformanceProfilerDll\PerformanceProfilerDll.vcxproj", "{4B442D34-641A-4B37-9A4B-D18DBE28A979}"
@@ -1474,38 +1551,6 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalExamplesTe
 		{50EF9EE6-5018-453E-A063-F77044EF1A97} = {50EF9EE6-5018-453E-A063-F77044EF1A97}
 	EndProjectSection
 EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "V2LibraryTests", "V2LibraryTests", "{43ED3FD0-824C-4201-BD96-B824DF959ADC}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GoogLeNet", "GoogLeNet", "{789B4AB8-40F1-4A37-823A-BC20D80C8BF1}"
-	ProjectSection(SolutionItems) = preProject
-		Examples\Image\Classification\GoogLeNet\README.md = Examples\Image\Classification\GoogLeNet\README.md
-	EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BN-Inception", "BN-Inception", "{CE223840-1DEE-4849-B530-F06BEE05BAA8}"
-	ProjectSection(SolutionItems) = preProject
-		Examples\Image\Classification\GoogLeNet\BN-Inception\README.md = Examples\Image\Classification\GoogLeNet\BN-Inception\README.md
-	EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "InceptionV3", "InceptionV3", "{824766FA-759A-4466-9C39-13200D2D3159}"
-	ProjectSection(SolutionItems) = preProject
-		Examples\Image\Classification\GoogLeNet\InceptionV3\README.md = Examples\Image\Classification\GoogLeNet\InceptionV3\README.md
-	EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript", "{BD07C9F3-B10C-4C21-82BC-4F249B65DDFE}"
-	ProjectSection(SolutionItems) = preProject
-		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionBlocks.bs = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionBlocks.bs
-		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.bs = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.bs
-		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.cntk = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.cntk
-		Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\README.md = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\README.md
-	EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript", "{5CC403B9-2405-4FFB-A73B-DAE0DC986C76}"
-	ProjectSection(SolutionItems) = preProject
-		Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.bs
-		Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.cntk = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.cntk
-		Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs
-	EndProjectSection
-EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}" ProjectSection(ProjectDependencies) = postProject {91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715} @@ -1519,10 +1564,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryEndToEndTests", "T {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Extensibility", "Extensibility", "{3BF56127-6F0F-41CF-BFCE-31165A0A5E73}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CPP", "CPP", "{7A27E076-296E-41A8-BA76-164071251372}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPExtensibilityExamples", "Examples\Extensibility\CPP\CPPExtensibilityExamples.vcxproj", "{40A8CC31-8C08-4156-AE08-E8C0FADC3509}" ProjectSection(ProjectDependencies) = postProject {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} @@ -1530,64 +1571,24 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPExtensibilityExamples", EndProject Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonExamples", "Examples\PythonExamples.pyproj", "{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "FCN", "FCN", "{58E3A257-91BE-4DC7-8991-70BFABE0A671}" - ProjectSection(SolutionItems) = preProject - Tests\EndToEndTests\Image\FCN\baseline.txt = Tests\EndToEndTests\Image\FCN\baseline.txt - Tests\EndToEndTests\Image\FCN\fcn.cntk = Tests\EndToEndTests\Image\FCN\fcn.cntk - Tests\EndToEndTests\Image\FCN\fcn8_to_fcn4.mel = Tests\EndToEndTests\Image\FCN\fcn8_to_fcn4.mel - Tests\EndToEndTests\Image\FCN\prepare_for_test.mel = Tests\EndToEndTests\Image\FCN\prepare_for_test.mel - Tests\EndToEndTests\Image\FCN\run-test = Tests\EndToEndTests\Image\FCN\run-test - Tests\EndToEndTests\Image\FCN\shared.bs = Tests\EndToEndTests\Image\FCN\shared.bs - Tests\EndToEndTests\Image\FCN\testcases.yml = Tests\EndToEndTests\Image\FCN\testcases.yml - EndProjectSection -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPExtensibilityExamplesLibrary", "Examples\Extensibility\CPPLib\CPPExtensibilityExamplesLibrary.vcxproj", "{4CF94A50-0D17-432A-8B5A-8458E91C44A6}" ProjectSection(ProjectDependencies) = postProject {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SaveBestModelPerCriterion", "SaveBestModelPerCriterion", "{C1189678-4FFA-4258-971F-3262B44FCA99}" - ProjectSection(SolutionItems) = preProject - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.cpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.cpu.txt - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.gpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.gpu.txt - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.cpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.cpu.txt - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.gpu.txt = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\baseline.windows.gpu.txt - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\cntkcv.cntk = 
Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\cntkcv.cntk - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\run-test = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\run-test - Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\testcases.yml = Tests\EndToEndTests\Speech\DNN\SaveBestModelPerCriterion\testcases.yml - EndProjectSection -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Java", "Java", "{F37067BD-8BB1-4F93-AEF4-F37434613AE4}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryJavaBinding", "bindings\java\Swig\CNTKLibraryJavaBinding.vcxproj", "{5D1972FA-F0A4-4035-8E63-8BAEF0230097}" ProjectSection(ProjectDependencies) = postProject {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BinaryConvolution", "BinaryConvolution", "{65649688-3377-4FA9-8CD0-BDC3AC72E0AD}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryConvolutionLib", "Examples\Extensibility\BinaryConvolution\BinaryConvolutionLib\BinaryConvolutionLib.vcxproj", "{20DEE94F-2802-40B1-B88B-22755A03AA48}" ProjectSection(ProjectDependencies) = postProject {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "selectivesearch", "selectivesearch", "{BEF04803-47B4-4322-B9D7-E10A8468E79F}" - ProjectSection(SolutionItems) = preProject - Examples\Image\Detection\FastRCNN\selectivesearch\__init__.py = Examples\Image\Detection\FastRCNN\selectivesearch\__init__.py - Examples\Image\Detection\FastRCNN\selectivesearch\README.md = Examples\Image\Detection\FastRCNN\selectivesearch\README.md - Examples\Image\Detection\FastRCNN\selectivesearch\selectivesearch.py = Examples\Image\Detection\FastRCNN\selectivesearch\selectivesearch.py - EndProjectSection -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "fastRCNN", "fastRCNN", "{C28E4FD7-F9A9-4473-8E5D-D209AF36A1E7}" ProjectSection(SolutionItems) = preProject - Examples\Image\Detection\FastRCNN\fastRCNN\__init__.py = Examples\Image\Detection\FastRCNN\fastRCNN\__init__.py - Examples\Image\Detection\FastRCNN\fastRCNN\imdb.py = Examples\Image\Detection\FastRCNN\fastRCNN\imdb.py - Examples\Image\Detection\FastRCNN\fastRCNN\nms.py = Examples\Image\Detection\FastRCNN\fastRCNN\nms.py - Examples\Image\Detection\FastRCNN\fastRCNN\pascal_voc.py = Examples\Image\Detection\FastRCNN\fastRCNN\pascal_voc.py - Examples\Image\Detection\FastRCNN\fastRCNN\test.py = Examples\Image\Detection\FastRCNN\fastRCNN\test.py - Examples\Image\Detection\FastRCNN\fastRCNN\timer.py = Examples\Image\Detection\FastRCNN\fastRCNN\timer.py - Examples\Image\Detection\FastRCNN\fastRCNN\train_svms.py = Examples\Image\Detection\FastRCNN\fastRCNN\train_svms.py - Examples\Image\Detection\FastRCNN\fastRCNN\voc_eval.py = Examples\Image\Detection\FastRCNN\fastRCNN\voc_eval.py + Examples\Extensibility\BinaryConvolution\binary_convnet.py = Examples\Extensibility\BinaryConvolution\binary_convnet.py + Examples\Extensibility\BinaryConvolution\custom_convolution_ops.py = Examples\Extensibility\BinaryConvolution\custom_convolution_ops.py + Examples\Extensibility\BinaryConvolution\README.md = Examples\Extensibility\BinaryConvolution\README.md EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPUWPEvalExamplesTests", 
"Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPUWPEvalExamplesTests\CNTKLibraryCPPUWPEvalExamplesTests.vcxproj", "{D5CB8825-0D1F-4940-9906-9BD87614B24E}" @@ -1600,7 +1601,7 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ImageRecognitionAppCS", "Te {C5E944BA-A7C4-482F-BE01-077A7DFC159C} = {C5E944BA-A7C4-482F-BE01-077A7DFC159C} EndProjectSection EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageRecognitionLib", "Tests\EndToEndTests\EvalClientTests\UWPImageRecognitionTest\ImageRecognizerLib\ImageRecognizerLib.vcxproj", "{C5E944BA-A7C4-482F-BE01-077A7DFC159C}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageRecognizerLib", "Tests\EndToEndTests\EvalClientTests\UWPImageRecognitionTest\ImageRecognizerLib\ImageRecognizerLib.vcxproj", "{C5E944BA-A7C4-482F-BE01-077A7DFC159C}" ProjectSection(ProjectDependencies) = postProject {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection @@ -1610,8 +1611,6 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "V2LibraryCSTests", "Tests\U {50EF9EE6-5018-453E-A063-F77044EF1A97} = {50EF9EE6-5018-453E-A063-F77044EF1A97} EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CNTKv2CSharp", "CNTKv2CSharp", "{B3B46744-DBB5-42C2-BAD7-9151D9486045}" -EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSTrainingTest", "Tests\EndToEndTests\CNTKv2CSharp\CNTKLibraryCSTrainingTest\CNTKLibraryCSTrainingTest.csproj", "{0DF2109B-BB85-4718-82DE-1C0536D4F2C3}" ProjectSection(ProjectDependencies) = postProject {7B7A563D-AA8E-4660-A805-D50235A02120} = {7B7A563D-AA8E-4660-A805-D50235A02120} @@ -1619,13 +1618,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSTrainingTest", {50EF9EE6-5018-453E-A063-F77044EF1A97} = {50EF9EE6-5018-453E-A063-F77044EF1A97} EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ProposalLayer", "ProposalLayer", "{3631994A-59E6-4CD6-99A4-6D332F8DABE2}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ProposalLayerLib", "Examples\Extensibility\ProposalLayer\ProposalLayerLib\ProposalLayerLib.vcxproj", "{91EA9F28-B9B6-4FC7-A47D-9838F5915700}" ProjectSection(ProjectDependencies) = postProject {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF} EndProjectSection EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageWriterDll", "Source\ImageWriterDll\ImageWriterDll.vcxproj", "{2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}" + ProjectSection(ProjectDependencies) = postProject + {86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryConvolutionLib", "Source\Extensibility\BinaryConvolutionLib\BinaryConvolutionLib.vcxproj", "{20DEE94F-2802-40B1-B88B-22755A03AA48}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug_CpuOnly|x64 = Debug_CpuOnly|x64 @@ -1739,18 +1743,6 @@ Global {482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release_UWP|x64.ActiveCfg = Release_CpuOnly|x64 {482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release|x64.ActiveCfg = Release|x64 {482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release|x64.Build.0 = Release|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_UWP|x64.ActiveCfg = Debug_CpuOnly|x64 - 
{2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug|x64.ActiveCfg = Debug|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug|x64.Build.0 = Debug|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_UWP|x64.ActiveCfg = Release_NoOpt|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release|x64.ActiveCfg = Release|x64 - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release|x64.Build.0 = Release|x64 {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64 {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64 {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug_UWP|x64.ActiveCfg = Debug_UWP|x64 @@ -2255,18 +2247,6 @@ Global {5D1972FA-F0A4-4035-8E63-8BAEF0230097}.Release_UWP|x64.ActiveCfg = Release_CpuOnly|x64 {5D1972FA-F0A4-4035-8E63-8BAEF0230097}.Release|x64.ActiveCfg = Release|x64 {5D1972FA-F0A4-4035-8E63-8BAEF0230097}.Release|x64.Build.0 = Release|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_UWP|x64.ActiveCfg = Debug_CpuOnly|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug|x64.ActiveCfg = Debug|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug|x64.Build.0 = Debug|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_UWP|x64.ActiveCfg = Release_CpuOnly|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release|x64.ActiveCfg = Release|x64 - {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release|x64.Build.0 = Release|x64 {D5CB8825-0D1F-4940-9906-9BD87614B24E}.Debug_CpuOnly|x64.ActiveCfg = Debug_UWP|x64 {D5CB8825-0D1F-4940-9906-9BD87614B24E}.Debug_UWP|x64.ActiveCfg = Debug_UWP|x64 {D5CB8825-0D1F-4940-9906-9BD87614B24E}.Debug_UWP|x64.Build.0 = Debug_UWP|x64 @@ -2334,12 +2314,39 @@ Global {91EA9F28-B9B6-4FC7-A47D-9838F5915700}.Release_UWP|x64.ActiveCfg = Release_CpuOnly|x64 {91EA9F28-B9B6-4FC7-A47D-9838F5915700}.Release|x64.ActiveCfg = Release|x64 {91EA9F28-B9B6-4FC7-A47D-9838F5915700}.Release|x64.Build.0 = Release|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_UWP|x64.ActiveCfg = Debug_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug_UWP|x64.Build.0 = Debug_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug|x64.ActiveCfg = Debug|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Debug|x64.Build.0 = Debug|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64 + 
{2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_UWP|x64.ActiveCfg = Release_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release_UWP|x64.Build.0 = Release_CpuOnly|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release|x64.ActiveCfg = Release|x64 + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D}.Release|x64.Build.0 = Release|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_UWP|x64.ActiveCfg = Debug_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug_UWP|x64.Build.0 = Debug_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug|x64.ActiveCfg = Debug|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Debug|x64.Build.0 = Debug|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_UWP|x64.ActiveCfg = Release_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release_UWP|x64.Build.0 = Release_CpuOnly|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release|x64.ActiveCfg = Release|x64 + {20DEE94F-2802-40B1-B88B-22755A03AA48}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(NestedProjects) = preSolution - {E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {3ED0465D-23E7-4855-9694-F788717B6533} = {83BFF5BF-D054-4B3E-9769-B00BA707F116} {EA67F51F-1FE8-462D-9F3E-01161685AD59} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} {DE1A06BA-EC5C-4E0D-BCA8-3EA555310C58} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0} @@ -2350,15 +2357,12 @@ Global {4BBF2950-3DBD-469A-AD57-6CACBEBAF541} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {5F733BBA-FE83-4668-8F83-8B0E78A36619} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {19EE975B-232D-49F0-94C7-6F1C6424FB53} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} - {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {5E666C53-2D82-49C9-9127-3FDDC321C741} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {6D1353D6-F196-466F-B886-F16D48759B20} = {5E666C53-2D82-49C9-9127-3FDDC321C741} {B6725C9F-A6D2-4269-9B74-7888A90F7884} = {5E666C53-2D82-49C9-9127-3FDDC321C741} {B27DD434-EECD-4EE0-A03B-1150EB87258E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884} {A4884465-CFBB-4A64-A9DE-690E1A63EF7E} = {B6725C9F-A6D2-4269-9B74-7888A90F7884} {C70E1572-20FF-496C-A0A9-10AA6755A07C} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {EAD17188-072C-4726-B840-A769C36DAD1B} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {88F85A64-105D-4CDA-8199-B7A312FC8A27} = {19EE975B-232D-49F0-94C7-6F1C6424FB53} {8241108A-7824-4FF2-BECA-7521A9D89DCF} = {19EE975B-232D-49F0-94C7-6F1C6424FB53} {6994C86D-A672-4254-824A-51F4DFEB807F} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} @@ -2366,8 +2370,6 @@ Global {40F93928-5CA2-433A-A48C-C8E9A35D7079} = {6994C86D-A672-4254-824A-51F4DFEB807F} {39B9BB97-D0E8-439A-8A1B-8DB8E7CF73C3} = {6994C86D-A672-4254-824A-51F4DFEB807F} {6F19321A-65E7-4829-B00C-3886CD6C6EDE} = 
{D45DF403-6781-444E-B654-A96868C5BE68} - {4701E678-5E6F-470D-B348-9CD1A2C095D1} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {EB2BE26F-6BD4-4274-971F-86D080779DD1} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} {BB8B9FC5-C4B3-477F-80E2-665DC8E431BD} = {6994C86D-A672-4254-824A-51F4DFEB807F} {8071EF60-30F7-4A77-81AA-ADCA0E18B1E3} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {76F9323D-34A1-43A5-A594-C4798931FF21} = {8071EF60-30F7-4A77-81AA-ADCA0E18B1E3} @@ -2376,20 +2378,6 @@ Global {81AE014F-DD63-47C7-B6E2-DB1D2833DCD1} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {9BDFA4BE-790E-408F-915B-5979BB5078C6} = {47755F2E-D674-4175-9E38-8EA053455072} {3CE841C0-02E5-46DB-B401-6F8784880173} = {47755F2E-D674-4175-9E38-8EA053455072} - {A4FC3467-4787-43E8-BBC0-D79AE56B468D} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} {6E565B48-1923-49CE-9787-9BBB9D96F4C5} = {D45DF403-6781-444E-B654-A96868C5BE68} {3BF59CCE-D245-420A-9F17-73CE61E284C2} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {811924DE-2F12-4EA0-BE58-E57BEF3B74D1} = {3BF59CCE-D245-420A-9F17-73CE61E284C2} @@ -2405,29 +2393,20 @@ Global {1C7D222F-E17B-444F-A18C-6205DEEF27BA} = {FB65FA58-C47B-4A49-9566-40FD5D75FC59} {5ED4F5DC-E016-4E10-BACD-6A760A0CDE89} = {FB65FA58-C47B-4A49-9566-40FD5D75FC59} {35CFD8E3-7206-4243-AB5C-AAF610109A5C} = {FB65FA58-C47B-4A49-9566-40FD5D75FC59} - {EF766CAE-9CB1-494C-9153-0030631A6340} = {60F87E25-BC87-4782-8E20-1621AAEBB113} {BD46CE02-3740-4526-80F6-CC7973B953E5} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {FB7AF7B9-6BEA-459F-94D9-94D53916D2B6} = {BD46CE02-3740-4526-80F6-CC7973B953E5} - {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {33EBFE78-A1A8-4961-8938-92A271941F94} {AC7BA8D3-B4C8-42A4-8507-B359BB6D49E8} = {FB7AF7B9-6BEA-459F-94D9-94D53916D2B6} {A17AC914-C539-4B47-A80F-9BD25C64E2A0} = {AC7BA8D3-B4C8-42A4-8507-B359BB6D49E8} {9F1F9C7C-2CC3-410C-ACDC-988B12D6AC14} = {AC7BA8D3-B4C8-42A4-8507-B359BB6D49E8} - {7FE16CBE-B717-45C9-97FB-FA3191039568} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {7B7A51ED-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94} - {9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {33EBFE78-A1A8-4961-8938-92A271941F94} {08A05A9A-4E45-42D5-83FA-719E99C04A30} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {715C0E2D-6FF6-4B26-9E49-1C68920CFAF6} = {08A05A9A-4E45-42D5-83FA-719E99C04A30} {48C2A9DE-FB2C-4724-9ADC-744216D79BCF} = {08A05A9A-4E45-42D5-83FA-719E99C04A30} {2B1046A1-0140-43B7-B3DC-CF7DEEE1009E} = 
{8071EF60-30F7-4A77-81AA-ADCA0E18B1E3} - {CDA96AA3-3252-4978-A0BF-2ACD670823CB} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} {8656B71D-E24C-4AC2-8BE4-C07B415A3E15} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8} {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15} {EC780385-7580-4D15-914B-1D878A295CBC} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1} {D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1} {181664AC-4C95-4798-A923-09B879215B33} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15} - {86883653-8A61-4038-81A0-2379FAE4200A} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {7B7A563D-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94} {1FB54750-B668-4AC3-966F-ED504020AC06} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15} {3E9BD61F-1F0A-4966-BE17-803AEFD1DFA4} = {6994C86D-A672-4254-824A-51F4DFEB807F} {5560DDD4-1E6E-4F41-B9BD-F52A19DF0B31} = {6994C86D-A672-4254-824A-51F4DFEB807F} @@ -2445,13 +2424,7 @@ Global {1141DC61-E014-4DEC-9157-F6B1FC055C7A} = {772A0DB3-4710-4281-8AA9-A9F1F7C543D3} {EC7298E3-AAA9-4672-941F-0B342C494CB3} = {A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26} {ECED747C-86D7-4009-B2A9-0525FE5DF4EB} = {EC7298E3-AAA9-4672-941F-0B342C494CB3} - {82125DA1-1CD7-45B5-9281-E6AE7C287CB7} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {731312A8-6DA3-4841-AFCD-57520BA1BF8E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} {68263A2F-1D5F-4C46-B5AF-2304B80FC3D4} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} - {CC8DDDCB-D53A-4B30-8596-AEF1C493DB31} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {9F999212-AFC5-4EAC-AA78-F7247D46C456} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} {B586AA4C-0BB9-4629-9EDA-25FF2618AC9F} = {FB7AF7B9-6BEA-459F-94D9-94D53916D2B6} {C2102C39-BF5F-4B12-9C41-849D1ED35EE8} = {B586AA4C-0BB9-4629-9EDA-25FF2618AC9F} {4DDFD21D-E82C-4321-B380-8A9382E18107} = {19EE975B-232D-49F0-94C7-6F1C6424FB53} @@ -2459,13 +2432,7 @@ Global {4F534076-39A5-40E2-BF87-64EC464C52B2} = {19EE975B-232D-49F0-94C7-6F1C6424FB53} {31263D7F-F590-475E-B4F0-7DFA4E4FF4B8} = {19EE975B-232D-49F0-94C7-6F1C6424FB53} {05E45AF7-C069-4057-BC16-0A532D068CE4} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} - {CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {05E45AF7-C069-4057-BC16-0A532D068CE4} - {1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {05E45AF7-C069-4057-BC16-0A532D068CE4} {E844AB9A-A48F-4A99-9625-F528C5C46D83} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15} - {CD721536-CFD3-413E-A3D7-FB0FAF989635} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {16F14058-B116-49D9-8BA0-209F3AFFE849} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {EC7157E9-A51F-4702-A5FD-8DAF88C7029F} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4} {48E51BEE-C69E-4739-8ABF-D481040E1FB7} = {305456F0-D9DE-4452-87BE-1C9F3C34C14F} {A22E7B97-B4D2-43EA-AD53-307FA767A38D} = {305456F0-D9DE-4452-87BE-1C9F3C34C14F} {2DD4DF97-4379-4D5F-9C1D-7AAC59E47796} = {305456F0-D9DE-4452-87BE-1C9F3C34C14F} @@ -2523,43 +2490,86 @@ Global {784A839C-762F-4A85-9EF1-A1E00546AD6C} = {D30B34AF-3618-4C55-900E-8F60A9F39E66} {6730F9BE-92AA-45F7-9F98-CD13E725CCA9} = {784A839C-762F-4A85-9EF1-A1E00546AD6C} {1526F027-B007-472D-82E2-5A91340F3B62} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {277EBD9D-2504-49FA-AC72-59D5515130C3} = {1526F027-B007-472D-82E2-5A91340F3B62} - {50EF9EE6-5018-453E-A063-F77044EF1A97} = {1526F027-B007-472D-82E2-5A91340F3B62} 
{2A95B23C-D91E-4DF9-B8F0-5E997608AB65} = {47755F2E-D674-4175-9E38-8EA053455072} {FB604F98-008F-45CD-B06E-42C30E121F13} = {2A95B23C-D91E-4DF9-B8F0-5E997608AB65} {5EDBCD1A-4F07-4618-84C9-FC6905A438B4} = {FB604F98-008F-45CD-B06E-42C30E121F13} {39C3C8CA-9A8A-4733-ADBB-3E19D0F52528} = {2A95B23C-D91E-4DF9-B8F0-5E997608AB65} {CB4566F1-6C8F-4270-83EE-F6AED84EBB2B} = {39C3C8CA-9A8A-4733-ADBB-3E19D0F52528} - {4B442D34-641A-4B37-9A4B-D18DBE28A979} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {3500A847-E024-4E7D-92DD-CC587C17460B} = {05E45AF7-C069-4057-BC16-0A532D068CE4} {43ED3FD0-824C-4201-BD96-B824DF959ADC} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {789B4AB8-40F1-4A37-823A-BC20D80C8BF1} = {151202CF-C2E4-47A6-A31C-CE039D698519} {CE223840-1DEE-4849-B530-F06BEE05BAA8} = {789B4AB8-40F1-4A37-823A-BC20D80C8BF1} {824766FA-759A-4466-9C39-13200D2D3159} = {789B4AB8-40F1-4A37-823A-BC20D80C8BF1} {BD07C9F3-B10C-4C21-82BC-4F249B65DDFE} = {824766FA-759A-4466-9C39-13200D2D3159} {5CC403B9-2405-4FFB-A73B-DAE0DC986C76} = {CE223840-1DEE-4849-B530-F06BEE05BAA8} - {D771A06D-CC25-4582-B5CD-D2A4782BB005} = {05E45AF7-C069-4057-BC16-0A532D068CE4} - {743FC7AA-3884-4C96-983A-A33FD6C56227} = {43ED3FD0-824C-4201-BD96-B824DF959ADC} {3BF56127-6F0F-41CF-BFCE-31165A0A5E73} = {47755F2E-D674-4175-9E38-8EA053455072} {7A27E076-296E-41A8-BA76-164071251372} = {3BF56127-6F0F-41CF-BFCE-31165A0A5E73} - {40A8CC31-8C08-4156-AE08-E8C0FADC3509} = {7A27E076-296E-41A8-BA76-164071251372} - {292FF4EE-D9DD-4BA7-85F7-6A22148D1E01} = {47755F2E-D674-4175-9E38-8EA053455072} {58E3A257-91BE-4DC7-8991-70BFABE0A671} = {8071EF60-30F7-4A77-81AA-ADCA0E18B1E3} - {4CF94A50-0D17-432A-8B5A-8458E91C44A6} = {7A27E076-296E-41A8-BA76-164071251372} {C1189678-4FFA-4258-971F-3262B44FCA99} = {6994C86D-A672-4254-824A-51F4DFEB807F} {F37067BD-8BB1-4F93-AEF4-F37434613AE4} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} - {5D1972FA-F0A4-4035-8E63-8BAEF0230097} = {F37067BD-8BB1-4F93-AEF4-F37434613AE4} - {65649688-3377-4FA9-8CD0-BDC3AC72E0AD} = {3BF56127-6F0F-41CF-BFCE-31165A0A5E73} - {20DEE94F-2802-40B1-B88B-22755A03AA48} = {65649688-3377-4FA9-8CD0-BDC3AC72E0AD} {BEF04803-47B4-4322-B9D7-E10A8468E79F} = {4EAFF1B2-2D70-4486-B95E-684E39A50609} {C28E4FD7-F9A9-4473-8E5D-D209AF36A1E7} = {4EAFF1B2-2D70-4486-B95E-684E39A50609} + {B3B46744-DBB5-42C2-BAD7-9151D9486045} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} + {3631994A-59E6-4CD6-99A4-6D332F8DABE2} = {3BF56127-6F0F-41CF-BFCE-31165A0A5E73} + {E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {EAD17188-072C-4726-B840-A769C36DAD1B} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {4701E678-5E6F-470D-B348-9CD1A2C095D1} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {EB2BE26F-6BD4-4274-971F-86D080779DD1} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {A4FC3467-4787-43E8-BBC0-D79AE56B468D} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {33D2FD22-DEF2-4507-A58A-368F641AEBE5} = 
{33EBFE78-A1A8-4961-8938-92A271941F94} + {9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {CE429AA2-3778-4619-8FD1-49BA3B81197B} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {EF766CAE-9CB1-494C-9153-0030631A6340} = {60F87E25-BC87-4782-8E20-1621AAEBB113} + {F0A9637C-20DA-42F0-83D4-23B4704DE602} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {7FE16CBE-B717-45C9-97FB-FA3191039568} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {7B7A51ED-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {CDA96AA3-3252-4978-A0BF-2ACD670823CB} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {86883653-8A61-4038-81A0-2379FAE4200A} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {7B7A563D-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94} + {82125DA1-1CD7-45B5-9281-E6AE7C287CB7} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {731312A8-6DA3-4841-AFCD-57520BA1BF8E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {CC8DDDCB-D53A-4B30-8596-AEF1C493DB31} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {9F999212-AFC5-4EAC-AA78-F7247D46C456} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {05E45AF7-C069-4057-BC16-0A532D068CE4} + {1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {05E45AF7-C069-4057-BC16-0A532D068CE4} + {CD721536-CFD3-413E-A3D7-FB0FAF989635} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {16F14058-B116-49D9-8BA0-209F3AFFE849} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {EC7157E9-A51F-4702-A5FD-8DAF88C7029F} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} + {5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4} + {277EBD9D-2504-49FA-AC72-59D5515130C3} = {1526F027-B007-472D-82E2-5A91340F3B62} + {50EF9EE6-5018-453E-A063-F77044EF1A97} = {1526F027-B007-472D-82E2-5A91340F3B62} + {4B442D34-641A-4B37-9A4B-D18DBE28A979} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + {3500A847-E024-4E7D-92DD-CC587C17460B} = {05E45AF7-C069-4057-BC16-0A532D068CE4} + {D771A06D-CC25-4582-B5CD-D2A4782BB005} = {05E45AF7-C069-4057-BC16-0A532D068CE4} + {743FC7AA-3884-4C96-983A-A33FD6C56227} = {43ED3FD0-824C-4201-BD96-B824DF959ADC} + {40A8CC31-8C08-4156-AE08-E8C0FADC3509} = {7A27E076-296E-41A8-BA76-164071251372} + {292FF4EE-D9DD-4BA7-85F7-6A22148D1E01} = {47755F2E-D674-4175-9E38-8EA053455072} + {4CF94A50-0D17-432A-8B5A-8458E91C44A6} = {7A27E076-296E-41A8-BA76-164071251372} + {5D1972FA-F0A4-4035-8E63-8BAEF0230097} = {F37067BD-8BB1-4F93-AEF4-F37434613AE4} + {65649688-3377-4FA9-8CD0-BDC3AC72E0AD} = {3BF56127-6F0F-41CF-BFCE-31165A0A5E73} {D5CB8825-0D1F-4940-9906-9BD87614B24E} = {05E45AF7-C069-4057-BC16-0A532D068CE4} {EA6DC625-7AD7-44A8-BDE9-4620D01B3AA5} = {05E45AF7-C069-4057-BC16-0A532D068CE4} {C5E944BA-A7C4-482F-BE01-077A7DFC159C} = {05E45AF7-C069-4057-BC16-0A532D068CE4} {B6DED59B-B52A-4D44-9B61-26FF0382764A} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE} - {B3B46744-DBB5-42C2-BAD7-9151D9486045} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5} {0DF2109B-BB85-4718-82DE-1C0536D4F2C3} = {B3B46744-DBB5-42C2-BAD7-9151D9486045} 
- {3631994A-59E6-4CD6-99A4-6D332F8DABE2} = {3BF56127-6F0F-41CF-BFCE-31165A0A5E73} {91EA9F28-B9B6-4FC7-A47D-9838F5915700} = {3631994A-59E6-4CD6-99A4-6D332F8DABE2} + {2ECE5AEB-F471-4A1D-9BAD-963D5C8A8A1D} = {DD043083-71A4-409A-AA91-F9C548DCF7EC} + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9FF5B559-FC1B-4694-963F-355905287887} EndGlobalSection EndGlobal diff --git a/Documentation/current_iteration.md b/Documentation/current_iteration.md index 0ad3e8e2c93..a719d107612 100644 --- a/Documentation/current_iteration.md +++ b/Documentation/current_iteration.md @@ -1,104 +1,24 @@ - -# CNTK v2.3 Release Notes +# CNTK Current Iteration -## Highlights of this Release -- Better ONNX support. -- Switching to NCCL2 for better performance in distributed training. -- Improved C# API. -- OpenCV is not required to install CNTK, it is only required for Tensorboard Image feature and image reader. -- Various performance improvement. -- Add Network Optimization API. -- Faster Adadelta for sparse. +## Change profiler details output format to be chrome://tracing -## API -### C# -- Improved C# API with performance gains in training and evaluation. -- During training and evaluation, data batch can be created from single managed buffer with offset. This eases the burden to prepare data in C# code. -- Internally, data marshalling is done more efficiently than Release 2.2. Use of chatty FloatVector has been avoided during training and evaluation. -### C++ -- Exported “PreorderTraverse” C++ API: use to search the graph based on the provided criteria. -### Python and C++ -- Add custom attributes to primitive function, which would be serialized/deserialized when model save/load. -- Some usage: -```python - func = C.plus(a, b) - func.custom_attributes = {'test':'abc', 'dict':{'a':1, 'b':2}, 'list':[1,2,3]} - func.custom_attributes['test2'] = 'def' +## Enable per-node timing. Working example [here](../Examples/Image/Classification/MLP/Python/SimpleMNIST.py) +- per-node timing creates items in profiler details when profiler is enabled. +- usage in Python: ``` -### Enabled data unit in frequency specification (Python) -Now we can specify data unit in sample, minibatch and sweep in training session progress frequency, CrossValidationConfig, and Checkpoint Config. 
For example, -```python - C.training_session( - trainer=t, - mb_source=mbs, - mb_size=C.minibatch_size_schedule(4), - model_inputs_to_streams=input_map, - max_samples=60, - progress_frequency=(5, C.train.DataUnit.minibatch), - checkpoint_config = C.CheckpointConfig(frequency=(1, C.train.DataUnit.sweep), preserve_all=True, - filename=str(tmpdir / "checkpoint_save_all")), - cv_config = C.CrossValidationConfig(mbs1, frequency=(100, C.train.DataUnit.sample), minibatch_size=32), - ).train(device) +import cntk as C +C.debugging.debug.set_node_timing(True) +C.debugging.start_profiler() # optional +C.debugging.enable_profiler() # optional +# <trainer|evaluator|function> executions +<trainer|evaluator|function>.print_node_timing() +C.debugging.stop_profiler() ``` -For details, see: -- [training_session]( https://cntk.ai/pythondocs/cntk.train.training_session.html?highlight=training%20session#module-cntk.train.training_session) -- [CrossValidationConfig](https://cntk.ai/pythondocs/cntk.train.training_session.html?highlight=crossvalidationconfig#cntk.train.training_session.CrossValidationConfig) -- [CheckPointConfig](https://cntk.ai/pythondocs/cntk.train.training_session.html?highlight=checkpointconfig#cntk.train.training_session.CheckpointConfig) -If no data unit is specified, the default data unit is in samples. +## CPU inference performance improvements using MKL +- Accelerates some common tensor ops in Intel CPU inference for float32, especially for fully connected networks +- Can be turned on/off by cntk.cntk_py.enable_cpueval_optimization()/cntk.cntk_py.disable_cpueval_optimization() -### Netopt Module – Network Optimizations for faster Inferences -- In recent years, the DNN Research community has proposed many techniques to make inference faster and more compact. Proposed techniques include factoring matrix-vector-product and convolution operations, binarization/quantization, sparsification and the use of frequency-domain representations. -- The goal of cntk.contrib.netopt module is to provide users of CNTK easy-to-use interfaces to speed up or compress their networks using such optimizations, and writers of optimizations a framework within which to export them to CNTK users. -- The initial release of netopt supports factoring of Dense CNTK layers and the 1-bit binarization of Convolutional layers. -#### Netopt API -- Details on how to use the netopt module is available in [Manual_How_to_use_network_optimizations.ipynb](https://github.com/Microsoft/CNTK/tree/release/2.3/Manual/Manual_How_to_use_network_optimizations.ipynb) - -## Operators -### Group convolution -- We added support for group convolution on the GPU, exposed by C++ and Python API. -### Free static axes (FreeDimension) support for more operators -- We have added free static axes support for additional operators such as pooling (MaxPool, AveragePool), global pooling, unpooling, and reshape. With this increased support, it should be possible to run most common convolutional pipelines (CNNs) with free static axes. -### Backcompat -- Support loading v1 model with DiagTimes node. - -## Performance -### Convolution with free static axes support -- We have improved the training performance for models that use convolution operation with free static axes support. For certain models, we see training speed up of more than x5. -### Validation Performance -- Improve validation performance and remove a lot of unneeded validation check. -### CPU Convolution -- Move CPU Convolution to use MKL-ML, which leads to ~4x speedup in AlexNet training.
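As a concrete companion to the per-node timing notes above, here is a minimal, self-contained sketch of the workflow (a sketch only: the toy network, batch shape, and iteration count are illustrative and not part of the notes; per the notes, `print_node_timing()` is called on whichever trainer, evaluator, or function was executed):

```python
import numpy as np
import cntk as C

C.debugging.debug.set_node_timing(True)  # turn on per-node timing
C.debugging.start_profiler()             # optional: also collect profiler details
C.debugging.enable_profiler()            # optional

# a toy function, executed a few times so the timing counters accumulate
x = C.input_variable(2)
y = C.reduce_sum(C.layers.Dense(4, activation=C.relu)(x))
for _ in range(5):
    y.eval({x: np.random.rand(8, 2).astype(np.float32)})

y.print_node_timing()        # average forward/backward time per node
C.debugging.stop_profiler()
```

The MKL CPU-inference fast path noted above is likewise a process-wide switch: `cntk.cntk_py.enable_cpueval_optimization()` turns it on, `cntk.cntk_py.disable_cpueval_optimization()` turns it off.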
-### Moving to NCCL2 -- NCCL2 would be enabled by default in official CNTK releases for Linux GPU build, which reduced aggregation cost in distributed training. For Python users, there’s no impact as NCCL binary is included in the Linux Python wheels. For BrainScript users on Linux, they need to install [NCCL library]( https://github.com/NVIDIA/nccl) as part of CNTK environment setup, similar to CUDA and CUDNN. CPU builds and Windows builds are not affected since NCCL is available for Linux only. -### Adadelta -- Faster adadelta updates when gradients are sparse. The running time for the update is now proportional to the number of _non-zero_ elements in the gradient. We observed a speedup of 5x on a single GPU for a feed forward model with a high dimensional sparse input (about 2 million features). Memory requirements increased modestly, requiring 4 additional bytes per sparse input feature (about 8 MB for the aforementioned network). - -## ONNX -- Improved ONNX support in CNTK. -- Update ONNX to the latest ONNX from https://github.com/onnx/onnx -- Covers most vision models such as Resnet, Inception, and VGG (only model saved in V2 CNTK format). -- Fixed several bugs. - -## Dependencies -### Removed OpenCV dependency from CNTK core. -- CNTK 2.2 requires you to install OpenCV to use CNTK but it is optional for CNTK 2.3 -- You need to install OpenCV only if you are planning to use ImageReader or TensorBoard’s Image feature. -### Upgraded ImageIO to 2.2.0 -- [Details](https://github.com/Microsoft/CNTK/pull/2385) -### MKL -- Switched from CNTKCustomMKL to Intel MKLML. MKLML is released with [Intel MKL-DNN](https://github.com/01org/mkl-dnn/releases) as a trimmed version of Intel MKL for MKL-DNN. To set it up: - -#### On Linux: - sudo mkdir /usr/local/mklml - sudo wget https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz - sudo tar -xzf mklml_lnx_2018.0.1.20171007.tgz -C /usr/local/mklml - -#### On Windows: - Create a directory on your machine to hold MKLML, e.g. mkdir c:\local\mklml - Download the file [mklml_win_2018.0.1.20171007.zip](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_win_2018.0.1.20171007.zip). - Unzip it into your MKLML path, creating a versioned sub directory within. - Set the environment variable `MKLML_PATH` to the versioned sub directory, e.g. setx MKLML_PATH c:\local\mklml\mklml_win_2018.0.1.20171007 - -## Warning -### Support for Python 3.4 will be removed from CNTK releases later than v2.3. +## Bug fixes +- Fixed convergence issue in Tutorial 201B +- Fixed pooling/unpooling to support free dimension \ No newline at end of file diff --git a/Examples/Evaluation/CNTKAzureTutorial01/CNTKAzureTutorial01/CNTKAzureTutorial01.csproj b/Examples/Evaluation/CNTKAzureTutorial01/CNTKAzureTutorial01/CNTKAzureTutorial01.csproj index 5fbcf1cd6fd..fd78a979a6b 100644 --- a/Examples/Evaluation/CNTKAzureTutorial01/CNTKAzureTutorial01/CNTKAzureTutorial01.csproj +++ b/Examples/Evaluation/CNTKAzureTutorial01/CNTKAzureTutorial01/CNTKAzureTutorial01.csproj @@ -23,8 +23,8 @@ - - ..\packages\CNTK.CPUOnly.2.3.1\lib\net45\x64\Cntk.Core.Managed-2.3.1.dll + + ..\packages\CNTK.CPUOnly.2.4.0\lib\net45\x64\Cntk.Core.Managed-2.4.dll True @@ -148,12 +148,16 @@ - + + + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - + + + %s%s\n", node->FormatOperationPrototype("").c_str(), dumpGradient ? 
" Grad" : ""); node->WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/false, std::vector(), - ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, - "%13.10f"/*valueFormatString*/, dumpGradient, concise); + ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, + "%13.10f"/*valueFormatString*/, dumpGradient, concise); return true; } +// helper for logging. Returns false if it was not able to dump +static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient) +{ + let nodef = dynamic_pointer_cast>(nodep); + if (nodef) return TypedDumpNode(nodef, dumpGradient); + let noded = dynamic_pointer_cast>(nodep); + if (noded) return TypedDumpNode(noded, dumpGradient); + let nodeh = dynamic_pointer_cast>(nodep); + if (nodeh) return TypedDumpNode(nodeh, dumpGradient); + return false; +} + // ----------------------------------------------------------------------- // SEQTraversalFlowControlNode methods -- implements SEQ traversal (loop unrolling) // @@ -270,7 +284,9 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient) { for (auto& node : m_nestedNodes) { + node->BeginTiming(false /*backward*/); node->ForwardProp(t); + node->EndTiming(false /*backward*/); node->BumpEvalTimeStamp(); } } @@ -280,7 +296,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient) { if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode()) { - DumpNode(node, /*dumpGradient=*/false) || DumpNode(node, false); + DumpNode(node, /*dumpGradient=*/false); } } } @@ -310,7 +326,9 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient) for (auto nodeIter2 = recurrentNodes.rbegin(); nodeIter2 != recurrentNodes.rend(); ++nodeIter2) { auto& node2 = *nodeIter2; + node2->BeginTiming(true /*backward*/); node2->Backprop(t, true /*childrenInThisLoop*/, false /*childrenInOuterLoop*/); + node2->EndTiming(true /*backward*/); // The above flags tell Backprop() to skip back-propagation from inside a node into // a node that is outside the loop, which is done later in EndBackprop() in PAR mode. 
} @@ -321,7 +339,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient) { if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient()) { - DumpNode(node, /*dumpGradient=*/true) || DumpNode(node, true); + DumpNode(node, /*dumpGradient=*/true); } } } diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj index c2f5b7b3e2d..40fc853e030 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -52,7 +52,7 @@ - $(SolutionDir)Source\SequenceTrainingLib;$(BOOST_INCLUDE_PATH);$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\CNTKv2LibraryDll;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude) + $(SolutionDir)Source\SequenceTrainingLib;$(BOOST_INCLUDE_PATH);$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\CNTKv2LibraryDll;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude);$(SolutionDir)Source\PerformanceProfilerDll; WIN32;_CONSOLE;%(PreprocessorDefinitions) diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index d1057dda6ac..7d6ee20234c 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -10,13 +10,21 @@ #include "ComputationNetworkBuilder.h" // TODO: We should only pull in NewComputationNodeFromConfig(). Nodes should not know about network at large. #include "TensorShape.h" +#ifndef CNTK_UWP +#include "PerformanceProfiler.h" +#ifdef _WIN32 +#define PERFORMANCE_PROFILER_LIB_NAME "Cntk.PerformanceProfiler-"##CNTK_COMPONENT_VERSION##".lib" +#pragma comment(lib, PERFORMANCE_PROFILER_LIB_NAME) +#endif +#endif + #ifndef let #define let const auto #endif namespace Microsoft { namespace MSR { namespace CNTK { -using namespace std; + using namespace std; // ----------------------------------------------------------------------- // subroutines for evaluation @@ -85,18 +93,18 @@ void ComputationNode::Backprop(const FrameRange& fr, bool childrenInTh for (size_t i = 0; i < m_inputs.size(); i++) { - ComputationNodePtr child = Input(i); - if (child->m_needsGradient && + ComputationNodeBasePtr child = m_inputs[i]; + if (child->NeedsGradient() && ((childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop()) || (childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()) )) { // fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str()); - if (!m_needsGradient) + if (!NeedsGradient()) LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str()); #if DUMPOUTPUT fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str()); #endif - child->LazyZeroGradient(this); // set gradient to 0 if this is the first time + SMART_NODE_INVOKE(ComputationNode, child, LazyZeroGradient, this); // set gradient to 0 if this is the first time // If we propagate from a loop to a node that is outside the loop, we are not efficient. // This case is handled by SEQTraversalFlowControlNode::Backprop(). 
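The `SMART_NODE_INVOKE` call above (like the `DumpNode` helper earlier in this file) exists because a child is now held as a type-erased `ComputationNodeBasePtr` that may wrap a float, double, or half node, e.g. on either side of a cast. A self-contained sketch of that dispatch pattern, using illustrative names rather than CNTK's actual classes:

```cpp
#include <cstdio>
#include <memory>

struct NodeBase { virtual ~NodeBase() = default; };

using half = unsigned short; // stand-in for CNTK's 16-bit half type (illustrative)

template <class ElemType>
struct TypedNode : NodeBase
{
    void LazyZeroGradient() { std::printf("zeroing gradient, elem size %zu\n", sizeof(ElemType)); }
};

// roughly what SMART_NODE_INVOKE(TypedNode, node, LazyZeroGradient) expands to:
// try each supported element type in turn, invoke on the first that matches
void SmartInvokeLazyZero(const std::shared_ptr<NodeBase>& node)
{
    if (auto f = std::dynamic_pointer_cast<TypedNode<float>>(node))       f->LazyZeroGradient();
    else if (auto d = std::dynamic_pointer_cast<TypedNode<double>>(node)) d->LazyZeroGradient();
    else if (auto h = std::dynamic_pointer_cast<TypedNode<half>>(node))   h->LazyZeroGradient();
    else std::printf("unknown node type\n");
}

int main()
{
    std::shared_ptr<NodeBase> n = std::make_shared<TypedNode<double>>();
    SmartInvokeLazyZero(n); // prints: zeroing gradient, elem size 8
}
```

The cost is one `dynamic_pointer_cast` chain per call, which suits the per-minibatch control path seen above (gradient setup and checks) better than inner compute loops.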
@@ -108,7 +116,7 @@ void ComputationNode::Backprop(const FrameRange& fr, bool childrenInTh } // before backprop, verify gradient optimization info - Input(i)->VerifyGradientOptimization(this); + SMART_NODE_INVOKE(ComputationNode, child, VerifyGradientOptimization, this); // fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str()); BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child @@ -753,11 +761,14 @@ template { for (size_t i = 0; i < m_inputs.size(); i++) { - ComputationNodePtr child = Input(i); - if (child->m_needsGradient) + ComputationNodeBasePtr child = m_inputs[i]; + if (child->NeedsGradient()) { - child->MaskMissingGradientColumnsToZero(FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0 - if (child->Gradient().HasNan("EndBackprop")) + SMART_NODE_INVOKE(ComputationNode, child, MaskMissingGradientColumnsToZero, FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0 + + bool hasNan = false; + SMART_NODE_INVOKE_WITH_RET(ComputationNode, child, Gradient().HasNan, hasNan, "EndBackprop"); + if (hasNan) { LogicError("%ls %ls operation unexpectedly produced NaN gradients on its input %ls.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str()); } @@ -766,6 +777,67 @@ template } } +template +/*virtual*/ void ComputationNode::BeginTiming(bool backward) +{ + if (!Globals::ShouldEnableNodeTiming()) return; + + int phase = (backward ? (int)TimingPhase_Backward : (int)TimingPhase_Forward); + auto& timing = m_timing[phase]; + timing.beginTime = std::chrono::system_clock::now(); + timing.count++; +#ifndef CNTK_UWP + timing.profilerId = ProfilerTimeBegin(); +#endif +} + +template +/*virtual*/ void ComputationNode::EndTiming(bool backward) +{ + if (!Globals::ShouldEnableNodeTiming()) return; + + int phase = (backward ? (int)TimingPhase_Backward : (int)TimingPhase_Forward); + auto& timing = m_timing[phase]; + timing.duration += (std::chrono::system_clock::now() - timing.beginTime); + +#ifndef CNTK_UWP + // the order must match enum + static const char* postfixes[TimingPhase_Total] = + { + "Forward", + "Backward", + }; + + if (timing.profilerName.length() != m_nodeName.length() + strlen(postfixes[phase])) + { + static char name[256]; + sprintf_s(name, _countof(name), "%S%s", m_nodeName.c_str(), postfixes[phase]); + timing.profilerName = name; + } + ProfilerTimeEnd(timing.profilerId, timing.profilerName.c_str()); +#endif +} + +template +void ComputationNode::PrintForwardBackwardTime() +{ + if (GetInputs().size() == 0) return; + + auto& forwardCount = m_timing[TimingPhase_Forward].count; + auto forwardDuration = m_timing[TimingPhase_Forward].duration.count(); + auto& backwardCount = m_timing[TimingPhase_Backward].count; + auto backwardDuration = m_timing[TimingPhase_Backward].duration.count(); + fprintf(stderr, "%-30S forward avg %07fs, backward avg %07fs (fwd# %d|bwd# %d)\n", + m_nodeName.c_str(), + forwardCount == 0 ? 0 : forwardDuration / forwardCount, + backwardCount == 0 ? 
0 : backwardDuration / backwardCount, + forwardCount, + backwardCount); + + for (auto& timing : m_timing) + timing.Reset(); +} + template <class ElemType> /*virtual*/ void ComputationNode<ElemType>::DumpNodeInfo(const bool /*printValues*/, const bool printMetadata, File& fstream) const { @@ -1004,7 +1076,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, double absSumLocal = 0; for (size_t j = 0; j < jend; j++) // loop over elements { - absSumLocal += abs(seqData[i * istride + j * jstride]); + absSumLocal += (double)abs(seqData[i * istride + j * jstride]); } absSum += absSumLocal; } @@ -1136,6 +1208,7 @@ atomic_ullong TimeStamp::s_timeStampCounter = ATOMIC_VAR_INIT(0); template <> map<size_t, map<size_t, shared_ptr<Matrix<float>>>> ComputationNode<float>::s_constOnes{}; template <> map<size_t, map<size_t, shared_ptr<Matrix<double>>>> ComputationNode<double>::s_constOnes{}; +template <> map<size_t, map<size_t, shared_ptr<Matrix<half>>>> ComputationNode<half>::s_constOnes{}; // ----------------------------------------------------------------------- // instantiate the core class templates @@ -1143,6 +1216,7 @@ template <> map<size_t, map<size_t, shared_ptr<Matrix<double>>>> ComputationNode<double> template class ComputationNode<float>; template class ComputationNode<double>; +template class ComputationNode<half>; }}} diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 44aef1ca58e..45aa444ba26 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -59,7 +59,9 @@ #define CNTK_MODEL_VERSION_27 27 // Slice: support stride_multiplier, and to_batch / unpack_batch axis ops; // Reduction: Add reduction over multiple axes #define CNTK_MODEL_VERSION_28 28 // Padding op -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_28 +#define CNTK_MODEL_VERSION_29 29 // Expose StopGradient in BS +#define CNTK_MODEL_VERSION_30 30 // LatticeWithSequenceSoftmax node +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_30 // helper mode for debugging // If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations. @@ -100,6 +102,9 @@ struct /*interface*/ IComputationNode virtual void ForwardProp(const FrameRange&) = 0; // forward prop for one minibatch virtual void EndForwardProp() = 0; // called after last iteration step of ForwardProp() + virtual void BeginTiming(bool backward) = 0; // called before Forward/Backward for node timing + virtual void EndTiming(bool backward) = 0; // called after Forward/Backward for node timing + virtual void PostForwardAndBackProp() {} // Optional: Post forward and backward prop for one minibatch, this will be called in a second // looping on the graph, after the backward pass finishes. Or after forward pass in inference // mode. @@ -785,6 +790,9 @@ protected: public: // ...the following should be protected, but nodes inquire ab #endif } + virtual void /*IComputationNode::*/ BeginTiming(bool) override {} + virtual void /*IComputationNode::*/ EndTiming(bool) override {} + // check whether a node is out of date w.r.t. its children, for lazy evaluation // If this returns true, node must be evaluated to update m_value. // This is virtual because it is overridden by traversal nodes, which would check all their nodes' inputs.
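A condensed, compilable model of the timing bookkeeping these declarations introduce (a sketch with simplified names; the profiler hookup and the `ElemType` template plumbing are omitted): each node keeps, per phase, a start time, an accumulated duration, and a call count, and `PrintForwardBackwardTime` reports the per-phase averages.

```cpp
#include <chrono>
#include <cstdio>

enum TimingPhase { TimingPhase_Forward = 0, TimingPhase_Backward, TimingPhase_Total };

struct NodeTiming
{
    std::chrono::system_clock::time_point beginTime;
    std::chrono::duration<double> duration{0};
    int count = 0;
};

struct TimedNode
{
    NodeTiming m_timing[TimingPhase_Total];

    void BeginTiming(bool backward)
    {
        auto& t = m_timing[backward ? TimingPhase_Backward : TimingPhase_Forward];
        t.beginTime = std::chrono::system_clock::now();
        t.count++;
    }

    void EndTiming(bool backward)
    {
        auto& t = m_timing[backward ? TimingPhase_Backward : TimingPhase_Forward];
        t.duration += std::chrono::system_clock::now() - t.beginTime;
    }

    void PrintForwardBackwardTime() const
    {
        for (int p = 0; p < TimingPhase_Total; p++)
        {
            const auto& t = m_timing[p];
            std::printf("%s avg %fs (count %d)\n",
                        p == TimingPhase_Forward ? "forward" : "backward",
                        t.count ? t.duration.count() / t.count : 0.0, t.count);
        }
    }
};

int main()
{
    TimedNode n;
    n.BeginTiming(false /*backward*/);
    n.EndTiming(false /*backward*/);
    n.PrintForwardBackwardTime();
}
```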
@@ -1373,7 +1381,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot m_inputs.resize(inputs.size()); for (size_t i = 0; i < m_inputs.size(); i++) if (inputs[i]) - m_inputs[i] = DownCast(inputs[i]); // (DownCast() checks the type; the assignment then downcasts it again) + m_inputs[i] = inputs[i]; // remove DownCast check here to allow CastNode to pass else m_inputs[i] = nullptr; // during network creation, nullptrs are possible @@ -1385,6 +1393,8 @@ } } + void PrintForwardBackwardTime(); + protected: // AttachInputs() from config @@ -1428,6 +1438,19 @@ return DownCast(m_inputs[inputIndex]); } + template <class ElemType2> + inline shared_ptr<ComputationNode<ElemType2>> TypedInput(const size_t inputIndex) const + { + if (inputIndex >= m_inputs.size()) + LogicError("Inputs: inputIndex %d is out of range for %ls %ls operation.", (int)inputIndex, NodeName().c_str(), OperationName().c_str()); + + shared_ptr<ComputationNode<ElemType2>> node = dynamic_pointer_cast<ComputationNode<ElemType2>>(m_inputs[inputIndex]); + if (!node) + InvalidArgument("a TypedInput of mismatching precision was passed"); + + return node; + } + // Fast downcast without runtime type check of dynamic_pointer_cast. // Meant to be used in Forward and BackPropTo, assuming that Validate() has already used Input() which validated the correct types. inline ComputationNode<ElemType>& InputRef(const size_t inputIndex) const @@ -1761,6 +1784,10 @@ virtual void /*IComputationNode::*/ EndBackprop() override; + virtual void /*IComputationNode::*/ BeginTiming(bool) override; + + virtual void /*IComputationNode::*/ EndTiming(bool) override; + // this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation // TODO: move to -Base (or -Network?) void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override; @@ -1927,24 +1954,36 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot // if the matrix's size will scale with minibatch size, set mbScale = true // if workspace flag is true, the memory request will be treated specially.
We assume workspace memory will share their own pointers // this is currently a workaround for workspace memory for convolutions - void RequestMatrixFromPool(shared_ptr>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false) + template + void TypedRequestMatrixFromPool(shared_ptr>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false) { if (matrixPtr == nullptr) { if (aliasing) - matrixPool.RequestAliasedAllocate(m_deviceId, this, &matrixPtr, matrixSize, mbScale); + matrixPool.RequestAliasedAllocate(m_deviceId, this, &matrixPtr, matrixSize, mbScale); else - matrixPool.RequestAllocate(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace); + matrixPool.RequestAllocate(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace); } } - void ReleaseMatrixToPool(shared_ptr>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false) + template + void TypedReleaseMatrixToPool(shared_ptr>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false) { assert(matrixPtr != nullptr); if (aliasing) - matrixPool.RequestAliasedRelease(this); + matrixPool.RequestAliasedRelease(this); else - matrixPool.RequestRelease(&matrixPtr); + matrixPool.RequestRelease(&matrixPtr); + } + + void RequestMatrixFromPool(shared_ptr>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize = 0, bool mbScale = false, bool isWorkSpace = false, bool aliasing = false) + { + TypedRequestMatrixFromPool(matrixPtr, matrixPool, matrixSize, mbScale, isWorkSpace, aliasing); + } + + void ReleaseMatrixToPool(shared_ptr>& matrixPtr, MatrixPool& matrixPool, bool aliasing = false) + { + TypedReleaseMatrixToPool(matrixPtr, matrixPool, aliasing); } public: @@ -2112,7 +2151,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot s_constOnes[rows].find(cols) == s_constOnes[rows].end()) // not found { shared_ptr> matrix = make_shared>(rows, cols, (DEVICEID_TYPE) deviceId); - matrix->SetValue(1); + matrix->SetValue((ElemType)1); s_constOnes[rows][cols] = matrix; } @@ -2133,6 +2172,28 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot static std::map>>> s_constOnes; MatrixType m_preferredGradientMatrixType = UNDETERMINED; + + enum TimingPhase + { + TimingPhase_Forward = 0, + TimingPhase_Backward, + TimingPhase_Total + }; + + struct Timing + { + std::chrono::system_clock::time_point beginTime; + int count = 0; + std::chrono::duration duration = std::chrono::duration(0); + long long profilerId; + std::string profilerName; + + void Reset() + { + duration = std::chrono::duration(0); + count = 0; + } + } m_timing[TimingPhase_Total]; }; // convenience wrapper for ComputationNode::New() @@ -2514,4 +2575,26 @@ class BinaryElementWiseNode : public ComputationNode, public NumInputs #pragma endregion base computation class +#define SMART_NODE_INVOKE(nodeClass, node, func, ...) \ + do { \ + if (dynamic_pointer_cast>(node)) \ + dynamic_pointer_cast>(node)->func(__VA_ARGS__); \ + else if (dynamic_pointer_cast>(node)) \ + dynamic_pointer_cast>(node)->func(__VA_ARGS__); \ + else if (dynamic_pointer_cast>(node)) \ + dynamic_pointer_cast>(node)->func(__VA_ARGS__); \ + else \ + LogicError("Unknown nodeClass type"); \ + } while(0) + +#define SMART_NODE_INVOKE_WITH_RET(nodeClass, node, func, ret, ...) 
+    do {                                                                           \
+        if (dynamic_pointer_cast<nodeClass<float>>(node))                          \
+            ret = dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
+        else if (dynamic_pointer_cast<nodeClass<double>>(node))                    \
+            ret = dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__);\
+        else if (dynamic_pointer_cast<nodeClass<half>>(node))                      \
+            ret = dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__);  \
+        else LogicError("Unknown nodeClass type");                                 \
+    } while(0)
 }}}
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index 6c75a3082c9..32747da4165 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -244,6 +244,13 @@ class ConvolutionNodeBase : public ComputationNode
         return result;
     }

+    TensorShape ComputeOutputShape(const TensorShape& inputShape, const TensorShape& dilate, bool ceilOutDim, bool isFinalValidationPass)
+    {
+        return ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
+                                                    m_sharing, m_autoPad, m_lowerPad, m_upperPad, dilate, ceilOutDim,
+                                                    Base::NeedsDynamicValidation(), isFinalValidationPass);
+    }
+
 protected:
     TensorShape m_kernelShape;
     TensorShape m_mapCount;
@@ -280,7 +287,7 @@ protected: \
     using Base::m_transpose;              \
     using Base::m_outputShape;            \
     using Base::m_ceilOutDim;             \
-    using Base::m_poolIncludePad;         \
+    using Base::m_poolIncludePad;         \
     using Base::m_imageLayout;            \
     using Base::m_maxTempMemSizeInSamples; \
     using Base::m_tempMatrixForward;      \
@@ -493,8 +500,7 @@ class ConvolutionNode : public ConvolutionNodeBase, public NumInputs<2
                 Input(0)->NodeName().c_str(), (int)mapCount, (int)weightCols);
         }

-        outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
-                                                           m_sharing, m_autoPad, m_lowerPad, m_upperPad);
+        outputShape = this->ComputeOutputShape(inputShape, TensorShape(1), /*ceilOutDim*/false, isFinalValidationPass);
         // ConvolveGeometry always uses CHW.
         SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
     }
@@ -505,9 +511,7 @@ class ConvolutionNode : public ConvolutionNodeBase, public NumInputs<2
         InferReductionDims(inputShape, inputShape);
         if (!m_transpose)
         {
-            outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
-                                                               m_sharing, m_autoPad, m_lowerPad, m_upperPad, m_dilation, false,
-                                                               this->NeedsDynamicValidation(), isFinalValidationPass);
+            outputShape = this->ComputeOutputShape(inputShape, m_dilation, /*ceilOutDim*/false, isFinalValidationPass);
             if (m_outputShape.GetRank() > 0 && m_outputShape != TensorShape(0)) // the user has explicitly set m_outputShape; check that it is the same as outputShape
             {
@@ -529,15 +533,13 @@ class ConvolutionNode : public ConvolutionNodeBase, public NumInputs<2
             // and node output (outDims) is convolution input. ConvolveGeometry does not care about deconvolutions (it does not have to).
             outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
                                                               m_sharing, m_autoPad, m_lowerPad, m_upperPad, TensorShape(1), false,
-                                                              this->NeedsDynamicValidation(), isFinalValidationPass);
+                                                              Base::NeedsDynamicValidation(), isFinalValidationPass);
         }
         else
         {
             // in case the user specifies the output shape, we make sure the input shape can be the result of
             // convolution from the specified output shape
-            auto inferredShape = ConvolveGeometry::ComputeOutputShape(m_outputShape, m_kernelShape, m_mapCount, m_stride,
-                                                                      m_sharing, m_autoPad, m_lowerPad, m_upperPad, TensorShape(1), false,
-                                                                      this->NeedsDynamicValidation(), isFinalValidationPass);
+            auto inferredShape = this->ComputeOutputShape(m_outputShape, TensorShape(1), false, isFinalValidationPass);
             if (inputShape != inferredShape)
                 InvalidArgument("%ls %ls the shape of the convolution transpose operand %ls is different from "
                                 "the result of convolving the specified output argument using "
@@ -954,8 +956,7 @@ class PoolingNode : public ConvolutionNodeBase, public NumInputs<1>, p
         // infer reduction dimensions if not given
         InferReductionDims(inputShape, TensorShape());

-        auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
-                                                            m_sharing, m_autoPad, m_lowerPad, m_upperPad, TensorShape(1), m_ceilOutDim);
+        auto outDims = this->ComputeOutputShape(inputShape, TensorShape(1), m_ceilOutDim, isFinalValidationPass);
         SetDims(outDims, HasMBLayout());
         if (isFinalValidationPass)
         {
@@ -1078,8 +1079,7 @@ class MaxUnpoolingNode : public ConvolutionNodeBase, public NumInputs<
         // Same as in case of deconvolution, node input (inputShape) is really the output of the max pooling
         // and node output (outDims) is pooling input.
         auto outputShape = GetInputSampleLayout(1);
-        auto inferredShape = ConvolveGeometry::ComputeOutputShape(outputShape, m_kernelShape, m_mapCount, m_stride,
-                                                                  m_sharing, m_autoPad, m_lowerPad, m_upperPad);
+        auto inferredShape = this->ComputeOutputShape(outputShape, TensorShape(1), false, isFinalValidationPass);
         if (inputShape != inferredShape)
             InvalidArgument("%ls %ls the shape of the unpooling operand %ls is different from "
                             "the result of pooling the poolingInput argument using "
diff --git a/Source/ComputationNetworkLib/EvaluationNodes.h b/Source/ComputationNetworkLib/EvaluationNodes.h
index 27031941090..8431eb95491 100644
--- a/Source/ComputationNetworkLib/EvaluationNodes.h
+++ b/Source/ComputationNetworkLib/EvaluationNodes.h
@@ -873,6 +873,7 @@ class OneHotNode : public ComputationNodeNonLooping, public NumInputs<
 template class OneHotNode<float>;
 template class OneHotNode<double>;
+template class OneHotNode<half>;

 #ifdef COMING_SOON
diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.cpp b/Source/ComputationNetworkLib/InputAndParamNodes.cpp
index a04e5cc182e..821fa7b8ffc 100644
--- a/Source/ComputationNetworkLib/InputAndParamNodes.cpp
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.cpp
@@ -104,7 +104,7 @@ LearnableParameter::LearnableParameter(const ScriptableObjects::IConfi
         static unsigned long randomSeed = 1;
         int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
         m_randomSeed = forcedRandomSeed < 0 ?
randomSeed++ : (unsigned long)forcedRandomSeed; - m_initValueScale = configp->Get(L"initValueScale"); + m_initValueScale = (ElemType)(float)configp->Get(L"initValueScale"); m_initFilterRank = configp->Get(L"initFilterRank"); m_initOutputRank = configp->Get(L"initOutputRank"); m_initOnCPUOnly = configp->Get(L"initOnCPUOnly"); @@ -112,12 +112,12 @@ LearnableParameter::LearnableParameter(const ScriptableObjects::IConfi else if (initString == L"zero") { m_initString = L"fromValue"; - m_initValue = 0; + m_initValue = (ElemType)0; } else if (initString == L"fromValue") // from 'initValue' { m_initString = initString; - m_initValue = initValue; + m_initValue = (ElemType)(float)initValue; } else if (initString == L"bilinear") { @@ -138,7 +138,7 @@ LearnableParameter::LearnableParameter(const ScriptableObjects::IConfi else if (initString == L"fixedValue") // deprecated. Use initValue=... instead { m_initString = L"fromValue"; - m_initValue = (ElemType)configp->Get(L"value"); + m_initValue = (ElemType)(float)configp->Get(L"value"); } else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead { @@ -549,7 +549,7 @@ void LearnableParameter::LazyInitParameters() if (m_initString == L"fromValue") { if (GetEnvironmentPtr() && Environment().traceLevel > 0) // note: this will not log before node is part of network - fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue); + fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), (float)m_initValue); Value().SetValue(m_initValue); } else if (ParseRandomizationType(m_initString).second != 0) @@ -651,5 +651,6 @@ template template class LearnableParameter; template class LearnableParameter; +template class LearnableParameter; }}} diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 0ae96a60c39..84642fe7227 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -108,7 +108,7 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> { fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f(%f*%f), onCPU=%s.\n)", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), - (int)randomSeed, (int)fanOut, (int)fanIn, range, range/initValueScale, initValueScale, initOnCPUOnly ? "true" : "false"); + (int)randomSeed, (int)fanOut, (int)fanIn, (float)range, (float)(range/initValueScale), (float)(initValueScale), initOnCPUOnly ? 
"true" : "false"); } } diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.cpp b/Source/ComputationNetworkLib/LinearAlgebraNodes.cpp index bfe76624a84..6db28c4a74c 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.cpp +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.cpp @@ -41,6 +41,9 @@ template void Microsoft::MSR::CNTK::UpdateRunningAverage(ComputationNode< template void Microsoft::MSR::CNTK::UpdateRunningAverage(ComputationNode& newInput, TensorView& runningAverage, size_t& runningCount); +template void Microsoft::MSR::CNTK::UpdateRunningAverage(ComputationNode& newInput, + TensorView& runningAverage, + size_t& runningCount); template EpochAccumulatorNode::EpochAccumulatorNode(DEVICEID_TYPE deviceId, const wstring& name) @@ -127,4 +130,5 @@ void EpochAccumulatorNode::Reset() } template class EpochAccumulatorNode; -template class EpochAccumulatorNode; \ No newline at end of file +template class EpochAccumulatorNode; +template class EpochAccumulatorNode; \ No newline at end of file diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index 7d464b95aff..14b4133a6d1 100755 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -91,6 +91,7 @@ class PlusNode : public BinaryElementWiseNode template class PlusNode; template class PlusNode; +template class PlusNode; // ----------------------------------------------------------------------- // LogPlusNode (summand1, summand2) @@ -165,6 +166,7 @@ class LogPlusNode : public BinaryElementWiseNode template class LogPlusNode; template class LogPlusNode; +template class LogPlusNode; // ----------------------------------------------------------------------- @@ -225,6 +227,7 @@ class PowNode : public BinaryElementWiseNode template class PowNode; template class PowNode; +template class PowNode; // ----------------------------------------------------------------------- @@ -287,6 +290,7 @@ class MinusNode : public BinaryElementWiseNode template class MinusNode; template class MinusNode; +template class MinusNode; // ----------------------------------------------------------------------- // ElementTimesNode (factor1, factor2) @@ -360,6 +364,7 @@ class ElementTimesNode : public BinaryElementWiseNode template class ElementTimesNode; template class ElementTimesNode; +template class ElementTimesNode; // ----------------------------------------------------------------------- // TimesNodeBase (A, B, outputRank=1) @@ -439,6 +444,19 @@ class TimesNodeBase : public ComputationNode, public NumInputs<2> return TensorView(data, tensorShape); } + static std::pair CalcOutputMatrixSize(const size_t leftRank , const size_t rightRank, const TensorShape& outShape) + { + size_t outRank = outShape.GetRank(); + size_t m = 1; + size_t n = 1; + size_t firstReducedDim = leftRank - (leftRank + rightRank - outRank) / 2; + for (size_t i = 0; i < firstReducedDim; i++) + m *= outShape.GetDim(i); + for (size_t i = firstReducedDim; i < outRank; i++) + n *= outShape.GetDim(i); + return std::make_pair(m, n); + } + private: // Check if TimesNodeBase could be simplified to ElementTimes to avoid unroll when: // 1. 
input0: is rank-1 and transposed, or is rank-2 with Dim(0)==1 @@ -593,7 +611,7 @@ class TimesNodeBase : public ComputationNode, public NumInputs<2> Matrix inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // k x s* inputValueSlice.Reshape(k * maxNumTimeSteps, 1); // (k * s*) x 1 Matrix gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1 - Matrix::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? 0 : beta, inputGradientSlice); + Matrix::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice); } if (unpacked[inputIndex]) @@ -611,7 +629,7 @@ class TimesNodeBase : public ComputationNode, public NumInputs<2> Matrix inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // (m * k) x s* inputValueSlice.Reshape(m, k * maxNumTimeSteps); // m x (k * s*) Matrix gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1 - Matrix::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? 0 : beta, inputGradientSlice); + Matrix::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice); } if (unpacked[inputIndex]) @@ -663,6 +681,16 @@ class TimesNodeBase : public ComputationNode, public NumInputs<2> return; } + if (fr.IsBatchMatmul(inputMBLayout) && fr.IsBatchMatmul(InputRef(1).GetMBLayout()) && !hasSparse) + { + auto mn = CalcOutputMatrixSize(InputRef(0).GetSampleLayout().GetRank(), InputRef(1).GetSampleLayout().GetRank(), GetSampleLayout()); + Matrix value = ValueFor(fr); + Matrix input0 = InputRef(0).ValueFor(fr); + Matrix input1 = InputRef(1).ValueFor(fr); + Matrix::BatchMatMul(ElemType(0.0), input0, m_transpose, mn.first, input1, false, mn.second, value, true); + return; + } + // recursively call ourselves for each individual time and sequence // note this is not performant, warn user about the slow path being used @@ -726,6 +754,45 @@ class TimesNodeBase : public ComputationNode, public NumInputs<2> return; } + ElemType beta = Input(inputIndex)->IsGradientInitializedBy(this) ? 
(ElemType)0.0 : (ElemType)1.0;
+        if (inputIndex == 0)
+        {
+            if (fr.IsBatchMatmul(InputRef(0).GetMBLayout()) && fr.IsBatchMatmul(InputRef(1).GetMBLayout()) && !hasSparse)
+            {
+                Matrix<ElemType> outputGradient = GradientFor(fr);
+                Matrix<ElemType> input1 = InputRef(1).ValueFor(fr);
+                Matrix<ElemType> input0Gradient = InputRef(0).GradientFor(fr);
+                if (!m_transpose)
+                {
+                    auto mn = CalcOutputMatrixSize(GetSampleLayout().GetRank(), InputRef(1).GetSampleLayout().GetRank(), InputRef(0).GetSampleLayout());
+                    Matrix<ElemType>::BatchMatMul(beta, outputGradient, false, mn.first, input1, true, mn.second, input0Gradient, true);
+                }
+                else
+                {
+                    auto mn = CalcOutputMatrixSize(InputRef(1).GetSampleLayout().GetRank(), GetSampleLayout().GetRank(), InputRef(0).GetSampleLayout());
+                    Matrix<ElemType>::BatchMatMul(beta, input1, false, mn.first, outputGradient, true, mn.second, input0Gradient, true);
+                }
+                return;
+            }
+        }
+        else if (inputIndex == 1)
+        {
+            if (fr.IsBatchMatmul(InputRef(0).GetMBLayout()) && fr.IsBatchMatmul(InputRef(1).GetMBLayout()) && !hasSparse)
+            {
+                auto mn = CalcOutputMatrixSize(InputRef(0).GetSampleLayout().GetRank(), GetSampleLayout().GetRank(), InputRef(1).GetSampleLayout());
+                Matrix<ElemType> input0 = InputRef(0).ValueFor(fr);
+                Matrix<ElemType> input1Gradient = InputRef(1).GradientFor(fr);
+                Matrix<ElemType> outputGradient = GradientFor(fr);
+
+                Matrix<ElemType>::BatchMatMul(beta, input0, !m_transpose, mn.first, outputGradient, false, mn.second, input1Gradient, true);
+                return;
+            }
+        }
+
+        // note this is not performant, warn user about the slow path being used
+        if (Base::HasEnvironmentPtr() && Base::Environment().traceLevel > 0)
+            std::call_once(m_unrollWarningOnceFlag, [this] { fprintf(stderr, "WARNING: %ls %ls operation: being unrolled in backprop, execution may be slow\n", NodeName().c_str(), OperationName().c_str()); });
+
         auto timeRange     = fr.GetTimeRange();
         auto sequenceRange = fr.GetSequenceRange();
         // when unrolling, the parent's overwriting of the gradient should be ignored
@@ -1051,6 +1118,7 @@ class TimesNode : public TimesNodeBase

 template class TimesNode<float>;
 template class TimesNode<double>;
+template class TimesNode<half>;

 // -----------------------------------------------------------------------
 // TransposeTimesNode (A', B)
@@ -1080,6 +1148,7 @@ class TransposeTimesNode : public TimesNodeBase

 template class TransposeTimesNode<float>;
 template class TransposeTimesNode<double>;
+template class TransposeTimesNode<half>;

 // Fixed-point matrix product. This scales inputs to 16-bit signed integers with symmetric quantizers, performs
 // integer multiplication using SSE/AVX2, and transforms the results back.
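Editor's note: the BatchMatMul fast path above leans entirely on CalcOutputMatrixSize to fold tensors into 2-D matrices, which is easy to misread in diff form. Below is a minimal, self-contained C++ sketch of that folding rule; a plain std::vector stands in for CNTK's TensorShape, and FoldOutputShape is a hypothetical name used only for this illustration.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Fold a rank-R output shape into an m x n matrix for a batched GEMM.
// The number of contracted axes is (leftRank + rightRank - outRank) / 2,
// so the first (leftRank - contracted) output axes come from the left
// operand and form m; the remaining axes come from the right and form n.
static std::pair<std::size_t, std::size_t> FoldOutputShape(
    std::size_t leftRank, std::size_t rightRank, const std::vector<std::size_t>& outShape)
{
    std::size_t outRank = outShape.size();
    std::size_t firstReducedDim = leftRank - (leftRank + rightRank - outRank) / 2;
    std::size_t m = 1, n = 1;
    for (std::size_t i = 0; i < firstReducedDim; i++)
        m *= outShape[i];
    for (std::size_t i = firstReducedDim; i < outRank; i++)
        n *= outShape[i];
    return std::make_pair(m, n);
}

int main()
{
    // [4 x 3] * [3 x 5] -> [4 x 5]: one contracted axis, so m = 4, n = 5.
    auto mn = FoldOutputShape(2, 2, {4, 5});
    assert(mn.first == 4 && mn.second == 5);
    // [2 x 3 x 4] * [4 x 5] -> [2 x 3 x 5]: m = 2*3 = 6, n = 5.
    mn = FoldOutputShape(3, 2, {2, 3, 5});
    assert(mn.first == 6 && mn.second == 5);
    return 0;
}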
@@ -1170,6 +1239,7 @@ class QuantizedTimesNode : public TimesNodeBase

 template class QuantizedTimesNode<float>;
 template class QuantizedTimesNode<double>;
+template class QuantizedTimesNode<half>;

 // -----------------------------------------------------------------------
 // SumElementsNode (input)
@@ -1405,6 +1475,7 @@ class TransposeDimensionsNode : public ComputationNode /*ComputationNode*/;
 template class TransposeDimensionsNode<double>;
+template class TransposeDimensionsNode<half>;

 // -----------------------------------------------------------------------
 // CosDistanceNode (left, right)
@@ -1525,6 +1596,7 @@ class CosDistanceNode : public ComputationNode, public NumInputs<2>

 template class CosDistanceNode<float>;
 template class CosDistanceNode<double>;
+template class CosDistanceNode<half>;

 // -----------------------------------------------------------------------
 // KhatriRaoProductNode (left, right)
@@ -1851,6 +1923,7 @@ class CosDistanceWithNegativeSamplesNode : public ComputationNode, pub

 template class CosDistanceWithNegativeSamplesNode<float>;
 template class CosDistanceWithNegativeSamplesNode<double>;
+template class CosDistanceWithNegativeSamplesNode<half>;

 template <class ElemType>
 void UpdateRunningAverage(ComputationNode<ElemType>& newInput, TensorView<ElemType>& runningAverage,
@@ -1930,4 +2003,48 @@ class EpochAccumulatorNode : public ComputationNodeNonLooping, public
     size_t m_numSamples;
 };

+// -----------------------------------------------------------------------
+// CastNode converts data types from InputType to ElemType
+// -----------------------------------------------------------------------
+template <class ElemType, class InputType>
+class CastNode : public UnaryElementWiseNode<ElemType>
+{
+    typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
+    static const std::wstring TypeName() { return L"Cast"; }
+
+public:
+    CastNode(DEVICEID_TYPE deviceId, const wstring& name)
+        : Base(deviceId, name)
+    {
+    }
+
+    virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
+    {
+        auto result = ValueFor(fr);
+        auto input = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).ValueFor(fr);
+        result.CastAssignValuesOf(input);
+    }
+
+    virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
+    {
+        auto grad = GradientFor(fr);
+        auto inputGrad = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).GradientFor(fr);
+        inputGrad.CastAssignValuesOf(grad);
+    }
+
+    virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
+    {
+        ValidateUnaryMap(isFinalValidationPass);
+    }
+
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
+};
+
+template class CastNode<float, double>;
+template class CastNode<float, half>;
+template class CastNode<double, float>;
+template class CastNode<double, half>;
+template class CastNode<half, float>;
+template class CastNode<half, double>;
 }}}
diff --git a/Source/ComputationNetworkLib/MatrixPool.h b/Source/ComputationNetworkLib/MatrixPool.h
index 5c4ead2e925..cdf7c51d1ca 100644
--- a/Source/ComputationNetworkLib/MatrixPool.h
+++ b/Source/ComputationNetworkLib/MatrixPool.h
@@ -72,6 +72,7 @@ class MatrixPool
 protected:
     vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
     vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
+    vector<MemRequestInfo<half>> m_memRequestInfoHalfVec;
     set<DEVICEID_TYPE> m_deviceIDSet;
     int m_stepCounter;
@@ -151,6 +152,7 @@ class MatrixPool
         // MatrixPool is not templated, so we call both float and double versions here
         OptimizedMemoryAllocationFunc<float>();
         OptimizedMemoryAllocationFunc<double>();
+        OptimizedMemoryAllocationFunc<half>();
         return;
     }
diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h
b/Source/ComputationNetworkLib/NonlinearityNodes.h index e26b587b2a9..0f7edea527e 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -446,6 +446,7 @@ class HardmaxNode : public SoftmaxNodeBase /*ComputationNode*/ template class HardmaxNode; template class HardmaxNode; +template class HardmaxNode; @@ -517,7 +518,8 @@ class TopKNode : public ComputationNode, public MultiOutputNodeGetSampleLayout().GetDimPadded(0); auto tmp = new ElemType[numCols]; - std::generate(tmp, tmp + numCols, [i = ElemType(0), dim]() mutable { auto ret = i; i += dim; return ret; }); + ElemType i = ElemType(0); + std::generate(tmp, tmp + numCols, [&i, dim]() mutable { auto ret = i; i += dim; return ret; }); m_steps->SetValue(1, numCols, this->m_deviceId, tmp); delete[] tmp; m_sortedIndices->ScaleAndAdd(ElemType(1), *m_steps, *m_sortedIndices); @@ -768,7 +770,8 @@ public: \ }; \ \ template class ClassName; \ -template class ClassName; +template class ClassName; \ +template class ClassName; DefineComparisonNode(LessNode, -1, 0) DefineComparisonNode(EqualNode, 0, 0) diff --git a/Source/ComputationNetworkLib/RNNNodes.cpp b/Source/ComputationNetworkLib/RNNNodes.cpp index dac0b4640ef..0ffbf8a4d85 100644 --- a/Source/ComputationNetworkLib/RNNNodes.cpp +++ b/Source/ComputationNetworkLib/RNNNodes.cpp @@ -341,5 +341,6 @@ void OptimizedRNNStackNode::UnpackSequencesFromCuDNN(const Matrix; template class OptimizedRNNStackNode; +template class OptimizedRNNStackNode; }}} diff --git a/Source/ComputationNetworkLib/RecurrentNodes.cpp b/Source/ComputationNetworkLib/RecurrentNodes.cpp index 6d3527f60d2..375fac4ce67 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.cpp +++ b/Source/ComputationNetworkLib/RecurrentNodes.cpp @@ -341,6 +341,15 @@ template // truncated BPTT carry-over size_t T_delayedActivation = m_delayedActivationMBLayout ? m_delayedActivationMBLayout->GetNumTimeSteps() : 0; // (note: should never happen in full-sequence mode) auto tensorShape = GetTensorShape(rank); + + // Make sure we properly have the sequence length from the previous minibatch. + if (T_delayedActivation > 0 && HasMBLayout()) + { + auto dims = tensorShape.GetDims(); + dims[dims.size() - 1] = T_delayedActivation; + tensorShape = TensorShape(dims); + } + auto slice = TensorSliceWithMBLayoutFor(tensorShape.GetDims(), FrameRange(m_delayedActivationMBLayout, t_delayed/*<0*/ + T_delayedActivation), m_delayedActivationMBLayout); tensorShape.NarrowTo(slice); src = TensorView(m_delayedValue, tensorShape); @@ -578,9 +587,11 @@ template // instantiate the classes that derive from the above template class PastValueNode; template class PastValueNode; +template class PastValueNode; template class FutureValueNode; template class FutureValueNode; +template class FutureValueNode; // ----------------------------------------------------------------------- // DelayedValueNodeState -- helper class for exporting/importing state from/to DelayedValueNodes. 
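Editor's note on the TopKNode change in NonlinearityNodes.h above: the init-capture lambda was rewritten with an explicit counter variable, presumably for compiler compatibility. The following self-contained sketch shows what the fill produces, with plain float standing in for ElemType; variable names here are illustrative only.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    using ElemType = float;          // stand-in for the node's element type
    const std::size_t numCols = 4;   // columns in the minibatch
    const ElemType dim = 10;         // rows per column
    std::vector<ElemType> steps(numCols);
    // Column c receives c*dim, so adding 'steps' to per-column sort indices
    // turns them into flat column-major offsets into the matrix.
    ElemType i = ElemType(0);
    std::generate(steps.begin(), steps.end(), [&i, dim] { auto ret = i; i += dim; return ret; });
    assert((steps == std::vector<ElemType>{0, 10, 20, 30}));
    return 0;
}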
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index ad6f2accc3f..396f3492567 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -373,6 +373,7 @@ template template class ReduceElementsNode; template class ReduceElementsNode; +template class ReduceElementsNode; // ----------------------------------------------------------------------- // Where(bitVector) -- extract indices of non-0 values in a sequence @@ -490,6 +491,7 @@ template template class WhereNode; template class WhereNode; +template class WhereNode; // ----------------------------------------------------------------------- // PackedIndexNode(targetObject, indexSequence) -- map sequence @@ -551,6 +553,7 @@ template template class PackedIndexNode; template class PackedIndexNode; +template class PackedIndexNode; // ----------------------------------------------------------------------- // GatherPackedNode(packedIndex, sourceData) -- gather operation @@ -623,6 +626,7 @@ template template class GatherPackedNode; template class GatherPackedNode; +template class GatherPackedNode; // ----------------------------------------------------------------------- // ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation @@ -686,6 +690,7 @@ template template class ScatterPackedNode; template class ScatterPackedNode; +template class ScatterPackedNode; // ----------------------------------------------------------------------- // CropNode -- crop operation, crops first input according to shape of second @@ -1026,5 +1031,6 @@ bool CropNode::SupportsTransformOnInput(size_t inputIndex) template class CropNode; template class CropNode; +template class CropNode; }}} diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 857524377ff..c4ffc388cb1 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -878,8 +878,8 @@ class PaddingNode : public ComputationNode, public NumInputs<1> public: - PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector head, std::vector foot, PaddingType mode = PaddingType::CONSTANTPAD, ElemType constantValue = 0) - : Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value(constantValue) + PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector head, std::vector foot, PaddingType mode = PaddingType::CONSTANTPAD, double constantValue = 0) + : Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType)constantValue) { } diff --git a/Source/ComputationNetworkLib/SequenceReshapeNodes.h b/Source/ComputationNetworkLib/SequenceReshapeNodes.h index 04e49a676f4..1b531c0b5f4 100644 --- a/Source/ComputationNetworkLib/SequenceReshapeNodes.h +++ b/Source/ComputationNetworkLib/SequenceReshapeNodes.h @@ -312,8 +312,8 @@ class UnpackSequenceNode : public ComputationNodeNonLooping, public Mu public: DeclareConstructorFromConfig(UnpackSequenceNode); - UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, ElemType paddingValue = 0, bool suppressMaskOutput = false) - : Base(deviceId, name), MultiOutputNode(suppressMaskOutput ? 1 : 2), m_paddingValue(paddingValue), m_suppressMaskOutput(suppressMaskOutput) + UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, double paddingValue = 0, bool suppressMaskOutput = false) + : Base(deviceId, name), MultiOutputNode(suppressMaskOutput ? 
1 : 2), m_paddingValue((ElemType)paddingValue), m_suppressMaskOutput(suppressMaskOutput) {} virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override diff --git a/Source/ComputationNetworkLib/SpecialPurposeNodes.cpp b/Source/ComputationNetworkLib/SpecialPurposeNodes.cpp index fbb030a80cb..dc9084f8fa4 100644 --- a/Source/ComputationNetworkLib/SpecialPurposeNodes.cpp +++ b/Source/ComputationNetworkLib/SpecialPurposeNodes.cpp @@ -144,5 +144,6 @@ template template class TraceNode; template class TraceNode; +template class TraceNode; }}} diff --git a/Source/ComputationNetworkLib/SpecialPurposeNodes.h b/Source/ComputationNetworkLib/SpecialPurposeNodes.h index 2d140b1bc7a..9411d29f1f1 100755 --- a/Source/ComputationNetworkLib/SpecialPurposeNodes.h +++ b/Source/ComputationNetworkLib/SpecialPurposeNodes.h @@ -8,6 +8,8 @@ #include "ComputationNode.h" #include "gammacalculation.h" #include "NonlinearityNodes.h" +#include "latticearchive.h" +#include "ProgressTracing.h" #include #include @@ -15,6 +17,8 @@ #include #include #include +#include +#include namespace Microsoft { namespace MSR { namespace CNTK { @@ -454,7 +458,7 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ public: DeclareConstructorFromConfigWithNumInputs(SequenceWithSoftmaxNode); SequenceWithSoftmaxNode(DEVICEID_TYPE deviceId, const wstring& name) - : Base(deviceId, name), m_gammaCalcInitialized(false) + : Base(deviceId, name), m_gammaCalcInitialized(false), m_invalidMinibatch(false) { } @@ -469,11 +473,18 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ } else if (inputIndex == 1) { - FrameRange fr(Input(0)->GetMBLayout()); - BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(), - Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold); - MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr); - + if (m_invalidMinibatch) + { + Input(inputIndex)->Gradient().SetValue(0.0f); + Value().SetValue(1.0f); + } + else + { + FrameRange fr(Input(0)->GetMBLayout()); + BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(), + Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold); + MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr); + } #ifdef _DEBUG Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout())); #endif @@ -542,6 +553,7 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ m_gammaCalculator.init(m_hmm, m_deviceId); m_gammaCalcInitialized = true; } + // softmax m_logSoftmaxOfRight->AssignLogSoftmaxOf(Input(1)->Value() /*prediction*/, true); m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight); @@ -574,8 +586,7 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ if (!(Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && // match size Input(1)->GetSampleMatrixNumRows() == Input(2)->GetSampleMatrixNumRows() && Input(0)->HasMBLayout() && - Input(0)->GetMBLayout() == Input(1)->GetMBLayout() && - Input(0)->GetMBLayout() == Input(2)->GetMBLayout())) + Input(0)->GetMBLayout() == Input(1)->GetMBLayout())) { LogicError("The Matrix dimension in the SequenceWithSoftmaxNode operation does not match."); } @@ -653,13 +664,14 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ shared_ptr> m_logSoftmaxOfRight; shared_ptr> m_softmaxOfRight; shared_ptr> m_gammaFromLattice; + bool m_invalidMinibatch; // 
set when the current minibatch could not be processed (e.g., its lattices failed to parse)
     double m_frameDropThreshold;
     double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside?
     double m_seqGammarAMF;
     double m_seqGammarLMF;
     double m_seqGammarWP;
     double m_seqGammarbMMIFactor;
-    double m_seqGammarUsesMBR;
+    bool m_seqGammarUsesMBR;
     bool m_doReferenceAlignment;
     std::vector<std::shared_ptr<msra::dbn::latticepair>> m_lattices;
     msra::asr::simplesenonehmm m_hmm;
@@ -676,6 +688,270 @@ class SequenceWithSoftmaxNode : public ComputationNodeNonLooping, publ
 template class SequenceWithSoftmaxNode<float>;
 template class SequenceWithSoftmaxNode<double>;

+// -----------------------------------------------------------------------
+// LatticeSequenceWithSoftmaxNode (label, prediction, loglikelihood, lattice)
+// Similar to SequenceWithSoftmaxNode, but uses the new deserializer.
+//
+// -----------------------------------------------------------------------
+
+template <class ElemType>
+class LatticeSequenceWithSoftmaxNode : public SequenceWithSoftmaxNode<ElemType>, public NumInputs<4>
+{
+    typedef ComputationNodeNonLooping<ElemType> Base;
+    UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName()
+    {
+        return L"LatticeSequenceWithSoftmax";
+    }
+
+public:
+    LatticeSequenceWithSoftmaxNode(DEVICEID_TYPE deviceId, const std::wstring& name, const std::wstring& symListPath, const std::wstring& phonePath, const std::wstring& stateListPath, const std::wstring& transProbPath, const std::wstring& latticeConfigPath,
+                                   float hSmoothingWeight, float frameDropThresh, bool doReferenceAlign, bool seqGammarUsesMBR, float seqGammarAMF, float seqGammarLMF, float seqGammarBMMIFactor, float seqGammarWordPen)
+        : SequenceWithSoftmaxNode<ElemType>(deviceId, name), m_symListPath(symListPath), m_phonePath(phonePath), m_stateListPath(stateListPath), m_transProbPath(transProbPath), m_latticeConfigPath(latticeConfigPath)
+    {
+        if (sizeof(ElemType) != sizeof(float))
+            LogicError("LatticeSequenceWithSoftmaxNode currently only supports floats.\n"); // due to the binary reader restrictions
+
+        if (symListPath.size() == 0 || phonePath.size() == 0 || stateListPath.size() == 0 || transProbPath.size() == 0)
+            LogicError("Ensure that the symListPath, phonePath, stateListPath and transProbPath parameters are specified.\n");
+
+        if (doReferenceAlign)
+            LogicError("SE training with alignment is currently not supported.\n");
+
+        LoadConfigsFromFile();
+
+        InitSEParams(m_symListPath, m_phonePath, m_stateListPath, m_transProbPath);
+        this->m_fsSmoothingWeight = hSmoothingWeight;
+        this->m_frameDropThreshold = frameDropThresh;
+        this->m_doReferenceAlignment = doReferenceAlign;
+        this->m_seqGammarUsesMBR = seqGammarUsesMBR;
+        this->m_seqGammarAMF = seqGammarAMF;
+        this->m_seqGammarLMF = seqGammarLMF;
+        this->m_seqGammarbMMIFactor = seqGammarBMMIFactor;
+        this->m_seqGammarWP = seqGammarWordPen;
+
+        this->SetGammarCalculationParam(seqGammarAMF, seqGammarLMF, seqGammarWordPen, seqGammarBMMIFactor, seqGammarUsesMBR);
+    }
+
+    LatticeSequenceWithSoftmaxNode(DEVICEID_TYPE deviceId, const std::wstring& name)
+        : SequenceWithSoftmaxNode<ElemType>(deviceId, name)
+    {
+    }
+
+    LatticeSequenceWithSoftmaxNode(const ScriptableObjects::IConfigRecordPtr configp)
+        : LatticeSequenceWithSoftmaxNode(configp->Get(L"deviceId"), L"", configp->Get(L"symListPath"), configp->Get(L"phonePath"), configp->Get(L"stateListPath"), configp->Get(L"transProbPath"), configp->Get(L"latticeConfigPath"),
+                                         configp->Get(L"hSmoothingWeight"), configp->Get(L"frameDropThresh"), configp->Get(L"doReferenceAlign"), configp->Get(L"seqGammarUsesMBR"), configp->Get(L"seqGammarAMF"),
configp->Get(L"seqGammarLMF"), configp->Get(L"seqGammarBMMIFactor"), configp->Get(L"seqGammarWordPen") + ) + { + AttachInputsFromConfig(configp, 4); + } + + // compute gradients to input observations, the weights to the observations, and the class log posterior probabilities + virtual void BackpropToNonLooping(size_t inputIndex) override + { + SequenceWithSoftmaxNode::BackpropToNonLooping(inputIndex); + } + + // -sum(left_i * log(softmax_i(right))) + virtual void ForwardPropNonLooping() + { + this->m_lattices.clear(); + this->m_uids.clear(); + this->m_boundaries.clear(); + this->m_extraUttMap.clear(); + this->m_invalidMinibatch = false; + + if (InputRef(3).ValuePtrRef()->GetDeviceId() != CPUDEVICE) + LogicError("Due to their size, lattices should be allocated on CPU memory"); + + const char* bufferStart = reinterpret_cast(InputRef(3).ValuePtrRef()->Data()); + + let& labelMBLayout = InputRef(0).GetMBLayout(); + const auto& labelSequences = labelMBLayout->GetAllSequences(); + + let& latticeMBLayout = InputRef(3).GetMBLayout(); + size_t latticeMBNumTimeSteps = latticeMBLayout->GetNumTimeSteps(); + + InputRef(0).ValuePtrRef()->VectorMax(*m_maxIndexes, *m_maxValues, true); + vector labelSequencesMap; + for (size_t i = 0; i < labelSequences.size(); i++) + { + if (labelSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + labelSequencesMap.push_back(labelSequences[i].seqId); + auto& currentLabelSeq = labelSequences[i]; + + // Fill up labels + auto columnIndices = labelMBLayout->GetColumnIndices(currentLabelSeq); + + for (size_t ci = 0; ci < columnIndices.size(); ci++) + { + size_t refId = (int)(*m_maxIndexes)(0, columnIndices[ci]); + this->m_uids.push_back(refId); + } + this->m_extraUttMap.push_back(labelSequences[i].s); + } + + this->m_lattices.resize(labelSequencesMap.size()); + try { +#pragma omp parallel for + for (long i = 0; i < labelSequences.size(); i++) + { + if (labelSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + + auto& currentLabelSeq = labelSequences[i]; + + // Fill up lattice + auto& currentLatticeSeq = latticeMBLayout->FindSequence(currentLabelSeq.seqId); + std::shared_ptr latticePair(new msra::dbn::latticepair); + const char* buffer = bufferStart + latticeMBNumTimeSteps * sizeof(float) * currentLatticeSeq.s + currentLatticeSeq.tBegin; + latticePair->second.ReadFromBuffer(buffer, m_idmap, m_idmap.back()); + assert((currentLabelSeq.tEnd - currentLabelSeq.tBegin) == latticePair->second.info.numframes); + // The size of the vector is small -- the number of sequences in the minibatch. + // Iteration likely will be faster than the overhead with unordered_map + for (size_t pos = 0; pos < labelSequencesMap.size();pos++) + { + if (labelSequencesMap[pos] == labelSequences[i].seqId) + { + this->m_lattices[pos] = latticePair; + break; + } + } + } + } + catch (...) + { + fprintf(stderr, "WARNING: Failed to parse lattice. 
Skipping minibatch...\n"); + this->m_invalidMinibatch = true; + } + + if (!this->m_invalidMinibatch) + { + this->m_boundaries.resize(this->m_uids.size()); + std::fill(this->m_boundaries.begin(), this->m_boundaries.end(), 0); + SequenceWithSoftmaxNode::ForwardPropNonLooping(); + } + } + + virtual void Save(File& fstream) const override + { + Base::Save(fstream); + fstream << m_symListPath; + fstream << m_phonePath; + fstream << m_stateListPath; + fstream << m_transProbPath; + fstream << m_latticeConfigPath; + fstream << this->m_frameDropThreshold; + fstream << this->m_fsSmoothingWeight; + fstream << this->m_seqGammarAMF; + fstream << this->m_seqGammarLMF; + fstream << this->m_seqGammarWP; + fstream << this->m_seqGammarbMMIFactor; + fstream << this->m_seqGammarUsesMBR; + fstream << this->m_doReferenceAlignment; + } + + virtual void Load(File& fstream, size_t modelVersion) override + { + Base::Load(fstream, modelVersion); + fstream >> m_symListPath; + fstream >> m_phonePath; + fstream >> m_stateListPath; + fstream >> m_transProbPath; + fstream >> m_latticeConfigPath; + fstream >> this->m_frameDropThreshold; + fstream >> this->m_fsSmoothingWeight; + fstream >> this->m_seqGammarAMF; + fstream >> this->m_seqGammarLMF; + fstream >> this->m_seqGammarWP; + fstream >> this->m_seqGammarbMMIFactor; + fstream >> this->m_seqGammarUsesMBR; + fstream >> this->m_doReferenceAlignment; + LoadConfigsFromFile(); + InitSEParams(m_symListPath, m_phonePath, m_stateListPath, m_transProbPath); + this->SetGammarCalculationParam(this->m_seqGammarAMF, this->m_seqGammarLMF, this->m_seqGammarWP, this->m_seqGammarbMMIFactor, this->m_seqGammarUsesMBR); + } + + void LoadConfigsFromFile() + { + // Workaround for loading a trained model from a different location + std::wstring_convert, wchar_t> converter; + std::string latticeConfigPathStr = converter.to_bytes(m_latticeConfigPath); + wifstream file(latticeConfigPathStr.c_str()); + if (file.good()) + { + wstring str; + getline(file, str); + m_symListPath = str; + getline(file, str); + m_phonePath = str; + getline(file, str); + m_stateListPath = str; + getline(file, str); + m_transProbPath = str; + } + } + + virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override + { + SequenceWithSoftmaxNode::Validate(isFinalValidationPass); + + if (isFinalValidationPass) + { + // Make sure lattices are pre allocated on CPU, due to their size. 
+            Input(3)->ValuePtrRef()->TransferToDeviceIfNotThere(CPUDEVICE, true /*moving completely*/, true /*preserving no data*/);
+        }
+    }
+
+    virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
+    {
+        SequenceWithSoftmaxNode<ElemType>::CopyTo(nodeP, newName, flags);
+
+        if (flags & CopyNodeFlags::copyNodeValue)
+        {
+            auto node = dynamic_pointer_cast<LatticeSequenceWithSoftmaxNode<ElemType>>(nodeP);
+
+            if (node)
+            {
+                node->m_idmap = m_idmap;
+                node->m_symListPath = m_symListPath;
+                node->m_phonePath = m_phonePath;
+                node->m_stateListPath = m_stateListPath;
+                node->m_transProbPath = m_transProbPath;
+            }
+        }
+    }
+
+    // request matrices needed to do node function value evaluation
+    virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
+    {
+        SequenceWithSoftmaxNode<ElemType>::RequestMatricesBeforeForwardProp(matrixPool);
+        RequestMatrixFromPool(m_maxIndexes, matrixPool);
+        RequestMatrixFromPool(m_maxValues, matrixPool);
+    }
+
+private:
+    msra::lattices::archive::symbolidmapping m_idmap;
+    std::wstring m_symListPath;
+    std::wstring m_phonePath;
+    std::wstring m_stateListPath;
+    std::wstring m_transProbPath;
+    std::wstring m_latticeConfigPath;
+    shared_ptr<Matrix<ElemType>> m_maxIndexes, m_maxValues;
+
+    void InitSEParams(const std::wstring& symListPath, const std::wstring& phonePath, const std::wstring& stateListPath, const std::wstring& transProbPath)
+    {
+        LOGPRINTF(stderr, "Reading files\n %ls \n %ls \n %ls \n %ls \n", symListPath.c_str(), phonePath.c_str(), stateListPath.c_str(), transProbPath.c_str());
+        this->m_hmm.loadfromfile(phonePath, stateListPath, transProbPath);
+        auto symmap = this->m_hmm.getsymmap();
+        msra::lattices::archive::GetSymList(m_idmap, symListPath, symmap);
+    }
+};
+
+template class LatticeSequenceWithSoftmaxNode<float>;
+template class LatticeSequenceWithSoftmaxNode<double>;
+
 // -----------------------------------------------------------------------
 // DummyCriterionNode (objectiveValues, userSuppliedGradient, prediction)
 // TODO: Rename to CustomCriterionNode?
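Editor's note: LoadConfigsFromFile above adds a small indirection so a trained model can be relocated; the four lattice-related paths are read from a separate four-line text file rather than being baked into the model. Below is a minimal sketch of that file contract, under the assumption of one path per line in the order shown; ReadLatticeConfig and LatticePaths are hypothetical names for this illustration only.

#include <fstream>
#include <string>

struct LatticePaths // hypothetical container for the four paths the node needs
{
    std::wstring symList, phone, stateList, transProb;
};

// Read the four-line config file: symbol list, phone, state list, and
// transition probabilities -- one path per line, in that order.
static bool ReadLatticeConfig(const std::string& configPath, LatticePaths& out)
{
    std::wifstream file(configPath);
    if (!file.good())
        return false; // keep whatever paths were stored in the model
    std::getline(file, out.symList);
    std::getline(file, out.phone);
    std::getline(file, out.stateList);
    std::getline(file, out.transProb);
    return true;
}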
diff --git a/Source/ComputationNetworkLib/TrainingNodes.cpp b/Source/ComputationNetworkLib/TrainingNodes.cpp
index e27f406094a..ccb68e1f04a 100644
--- a/Source/ComputationNetworkLib/TrainingNodes.cpp
+++ b/Source/ComputationNetworkLib/TrainingNodes.cpp
@@ -47,6 +47,7 @@ template
 template class RandomDistributionNode<float>;
 template class RandomDistributionNode<double>;
+template class RandomDistributionNode<half>;

 template <class ElemType>
 void RandomSampleNodeBase<ElemType>::Validate(bool isFinalValidationPass)
@@ -108,9 +109,9 @@ void RandomSampleNodeBase::UpdateWeightsPrefixSum()
     {
         ElemType currentWeight = samplingWeights.GetValue(iClass, 0);
         if (currentWeight < 0)
-            InvalidArgument("Sampling weights contain negative number %f.", currentWeight);
+            InvalidArgument("Sampling weights contain negative number %f.", (float)currentWeight);

-        runningWeightsSum += currentWeight;
+        runningWeightsSum += (double)currentWeight;
         m_samplingWeightsPrefixSum.push_back(runningWeightsSum);
     }
 }
@@ -231,6 +232,7 @@ bool RandomSampleNode::IsOutOfDateWrtInputs() const

 template class RandomSampleNode<float>;
 template class RandomSampleNode<double>;
+template class RandomSampleNode<half>;

 template <class ElemType>
 double RandomSampleInclusionFrequencyNode<ElemType>::EstimateNumberOfTries()
@@ -303,6 +305,7 @@ void RandomSampleInclusionFrequencyNode::Validate(bool isFinalValidati

 template class RandomSampleInclusionFrequencyNode<float>;
 template class RandomSampleInclusionFrequencyNode<double>;
+template class RandomSampleInclusionFrequencyNode<half>;

 template <class ElemType>
 void DropoutNode<ElemType>::Save(File& fstream) const
@@ -336,8 +339,10 @@ void BatchNormalizationNode::AttachInputs(const std::vector

 template class DropoutNode<float>;
 template class DropoutNode<double>;
+template class DropoutNode<half>;
 template class BatchNormalizationNode<float>;
 template class BatchNormalizationNode<double>;
+template class BatchNormalizationNode<half>;

 }}}
diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h
index efa780657a2..523f2b51d50 100644
--- a/Source/ComputationNetworkLib/TrainingNodes.h
+++ b/Source/ComputationNetworkLib/TrainingNodes.h
@@ -525,7 +525,7 @@ class LambdaRankNode : public ComputationNodeNonLooping /*ComputationNode*/, publi
     typedef ComputationNodeNonLooping<ElemType> Base;
     UsingComputationNodeMembersBoilerplate;
     static const std::wstring TypeName() { return L"BatchNormalization"; }

+    typedef typename std::conditional<std::is_same<ElemType, half>::value, float, ElemType>::type StatType;
+
     // inputs
     // TODO: Change all of these throughout the codebase to 'class enum'. Also change all places where we still use integer constants.
static const size_t DATA = 0; @@ -2412,7 +2414,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi m_one(1, 1, deviceId), m_convertRunningVariancePending(false) { - m_one.SetValue((ElemType)1); // (constant value used for GPU-side update of runCount) + m_one.SetValue((StatType)1); // (constant value used for GPU-side update of runCount) } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : BatchNormalizationNode(configp->Get(L"deviceId"), L"", configp->Get(L"spatial"), @@ -2564,14 +2566,14 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi void ResetRunCount() { if (HasTiedRunCount()) - Input(RUN_COUNT)->Value().SetValue(0); + this->template TypedInput(RUN_COUNT)->Value().SetValue(0); m_runCountUntied = 0; } void AggregateRunCount(size_t countToAdd) { if (HasTiedRunCount()) { - Input(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(ElemType)countToAdd, m_one); // this += countToAdd * (1) + this->template TypedInput(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(StatType)countToAdd, m_one); // this += countToAdd * (1) if (countToAdd != 0) m_runCountUntied = SIZE_MAX; // we only need this for 0 checks, this value says we only know it's not 0 } @@ -2581,7 +2583,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi size_t RunCount() const // const version of above; keep identical { if (HasTiedRunCount()) - m_runCountUntied = (size_t)Input(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over + m_runCountUntied = (size_t)this->template TypedInput(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over return m_runCountUntied; } bool IsRunCount0() const { return m_runCountUntied == 0 && RunCount() == 0; } // tied count >= untied one, so we can ask the untied one first to avoid GPU sync @@ -2664,10 +2666,10 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi FrameRange fr(Input(DATA)->GetMBLayout()); Matrix sliceInputValue = Input(DATA)->MaskedValueFor(fr); - const Matrix& scale = Input(SCALE)->Value(); - const Matrix& bias = Input(BIAS)->Value(); - Matrix& runMean = Input(RUN_MEAN)->Value(); - Matrix& runVariance = Input(RUN_VAR)->Value(); + const Matrix& scale = this->template TypedInput(SCALE)->Value(); + const Matrix& bias = this->template TypedInput(BIAS)->Value(); + Matrix& runMean = this->template TypedInput(RUN_MEAN)->Value(); + Matrix& runVariance = this->template TypedInput(RUN_VAR)->Value(); Matrix sliceOutputValue = ValueFor(fr); assert(scale.GetNumRows() == bias.GetNumRows()); @@ -2718,8 +2720,8 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi { auto sliceOutputGrad = MaskedGradientFor(fr); auto sliceInputValue = Input(DATA)->ValueFor(fr); - const Matrix& scale = Input(SCALE)->Value(); - const Matrix& bias = Input(BIAS)->Value(); + const Matrix& scale = this->template TypedInput(SCALE)->Value(); + const Matrix& bias = this->template TypedInput(BIAS)->Value(); // If inputIndex is not DATA and we get here, then it means that DATA receives no gradient. 
// However, the underlying engine does not foresee this case, and thus always needs a place @@ -2752,19 +2754,19 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi { assert(m_gradientValid); - if (Input(SCALE)->IsGradientInitializedBy(this)) - Input(SCALE)->Gradient().AssignValuesOf(*m_dScale); + if (this->template TypedInput(SCALE)->IsGradientInitializedBy(this)) + this->template TypedInput(SCALE)->Gradient().AssignValuesOf(*m_dScale); else - Input(SCALE)->Gradient() += *m_dScale; + this->template TypedInput(SCALE)->Gradient() += *m_dScale; } else if (inputIndex == BIAS) // derivative with respect to the bias, precomputed during input derivative computation { assert(m_gradientValid); - if (Input(BIAS)->IsGradientInitializedBy(this)) - Input(BIAS)->Gradient().AssignValuesOf(*m_dBias); + if (this->template TypedInput(BIAS)->IsGradientInitializedBy(this)) + this->template TypedInput(BIAS)->Gradient().AssignValuesOf(*m_dBias); else - Input(BIAS)->Gradient() += *m_dBias; + this->template TypedInput(BIAS)->Gradient() += *m_dBias; } // No derivatives with respect to running mean and variance. } @@ -2797,7 +2799,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // running statistics inputs must be learnable parameters, since we update them directly here for (size_t i = RUN_MEAN; i < GetNumInputs(); i++) //if (!Input(i)->Is>()) // somehow this does not compile on gcc (works on VS) - if (!dynamic_cast*>(Input(i).get())) + if (!dynamic_cast*>(this->template TypedInput(i).get())) InvalidArgument("%ls: Inputs [%d..%d] must be learnable parameters.", NodeDescription().c_str(), (int)RUN_MEAN, (int)GetNumInputs()); // infer dimensions of learnable parameters @@ -2807,7 +2809,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi #if 1 // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed. for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance { - auto paramLayout = Input(i)->GetSampleLayout(); + auto paramLayout = this->template TypedInput(i)->GetSampleLayout(); if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1] { size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements(); @@ -2841,10 +2843,11 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // check inputs for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance { - if (Input(i)->HasMBLayout()) + auto inputPtr = this->template TypedInput(i); + if (inputPtr->HasMBLayout()) InvalidArgument("%ls: Input[%d] has a dynamic axis. BatchNormalization parameters cannot have that.", NodeDescription().c_str(), (int)i); - auto paramLayout = Input(i)->GetSampleLayout(); - if (paramLayout != Input(SCALE)->GetSampleLayout()) + auto paramLayout = inputPtr->GetSampleLayout(); + if (paramLayout != this->template TypedInput(SCALE)->GetSampleLayout()) InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i); #if 0 // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference). 
if (paramLayout.GetRank() > inputLayout.GetRank()) @@ -2857,9 +2860,9 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi if (HasTiedRunCount()) // 0-th order statistics (count) (optional for backcompat with old code which didn't correctly share it) { // This must always be a [1] tensor. No inference allowed. - size_t i = RUN_COUNT; - if (Input(i)->HasMBLayout() || (Input(i)->GetSampleLayout().GetRank() > 1) || (Input(i)->GetSampleLayout().GetNumElements() != 1)) - InvalidArgument("%ls: Input[%d] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str(), (int)i); + auto inputPtr = this->template TypedInput(RUN_COUNT); + if (inputPtr->HasMBLayout() || (inputPtr->GetSampleLayout().GetRank() > 1) || (inputPtr->GetSampleLayout().GetNumElements() != 1)) + InvalidArgument("%ls: Input[RUN_COUNT] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str()); RunCount(); // cache the shared value into the local cache, for 0 checks } if (m_spatial && m_imageLayoutKind != CHW) @@ -2902,7 +2905,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi if (m_bnEng == nullptr) { auto shape = GetSampleLayout(); - m_bnEng = BatchNormEngine::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind, + m_bnEng = BatchNormEngine::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind, m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn); } @@ -2916,26 +2919,26 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override { Base::RequestMatricesBeforeForwardProp(matrixPool); - RequestMatrixFromPool(m_savedMean, matrixPool); - RequestMatrixFromPool(m_savedInvStdDev, matrixPool); + this->template TypedRequestMatrixFromPool(m_savedMean, matrixPool); + this->template TypedRequestMatrixFromPool(m_savedInvStdDev, matrixPool); } void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override { Base::RequestMatricesBeforeBackprop(matrixPool); RequestMatrixFromPool(m_dDataDummy, matrixPool); - RequestMatrixFromPool(m_dScale, matrixPool); - RequestMatrixFromPool(m_dBias, matrixPool); + this->template TypedRequestMatrixFromPool(m_dScale, matrixPool); + this->template TypedRequestMatrixFromPool(m_dBias, matrixPool); } void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override { Base::ReleaseMatricesAfterBackprop(matrixPool); - ReleaseMatrixToPool(m_savedMean, matrixPool); - ReleaseMatrixToPool(m_savedInvStdDev, matrixPool); + this->template TypedReleaseMatrixToPool(m_savedMean, matrixPool); + this->template TypedReleaseMatrixToPool(m_savedInvStdDev, matrixPool); ReleaseMatrixToPool(m_dDataDummy, matrixPool); - ReleaseMatrixToPool(m_dScale, matrixPool); - ReleaseMatrixToPool(m_dBias, matrixPool); + this->template TypedReleaseMatrixToPool(m_dScale, matrixPool); + this->template TypedReleaseMatrixToPool(m_dBias, matrixPool); } void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant, @@ -2970,8 +2973,8 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // Turn off the L1 and L2 regularization void DisableRegInBatchNormalization() { - let scaleNode = dynamic_pointer_cast>(Input(SCALE)); - let biasNode = dynamic_pointer_cast>(Input(BIAS)); + let scaleNode = dynamic_pointer_cast>(this->template TypedInput(SCALE)); + let biasNode = dynamic_pointer_cast>(this->template TypedInput(BIAS)); scaleNode->SetRegMultiplier(0.f); biasNode->SetRegMultiplier(0.f); } @@ -3046,20 
+3049,20 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // This value is not updated unless needed, so it may be out of date during most operation. // It will be updated at start (Validate()) and saving models, and any time the true value is needed. mutable size_t m_runCountUntied; // cached running sample count (mutable since it is a cache) - Matrix m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count) + Matrix m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count) // Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation. - shared_ptr> m_savedMean; - shared_ptr> m_savedInvStdDev; + shared_ptr> m_savedMean; + shared_ptr> m_savedInvStdDev; // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls. // Not used for blendFactor=1 in CNTK engine. shared_ptr> m_dDataDummy; - shared_ptr> m_dScale; - shared_ptr> m_dBias; + shared_ptr> m_dScale; + shared_ptr> m_dBias; bool m_gradientValid = false; - std::unique_ptr> m_bnEng; + std::unique_ptr> m_bnEng; bool m_convertRunningVariancePending; }; diff --git a/Source/EvalDll/EvalDll.vcxproj b/Source/EvalDll/EvalDll.vcxproj index 545fbed2a16..2bb49689599 100644 --- a/Source/EvalDll/EvalDll.vcxproj +++ b/Source/EvalDll/EvalDll.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -33,13 +33,11 @@ DynamicLibrary true - v140 Unicode DynamicLibrary false - v140 true Unicode diff --git a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.cpp b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.cpp similarity index 100% rename from Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.cpp rename to Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.cpp diff --git a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj similarity index 81% rename from Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj rename to Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj index a4ee9c19290..8ec6d4cf148 100644 --- a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj +++ b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -22,11 +22,12 @@ x64 - - + + + {20dee94f-2802-40b1-b88b-22755a03aa48} @@ -39,13 +40,11 @@ DynamicLibrary true - v140 Unicode DynamicLibrary false - v140 true Unicode @@ -57,18 +56,18 @@ true - Cntk.BinaryConvolutionExample-$(CntkComponentVersion) + Cntk.BinaryConvolution-$(CntkComponentVersion) false - Cntk.BinaryConvolutionExample-$(CntkComponentVersion) + Cntk.BinaryConvolution-$(CntkComponentVersion) - $(SolutionDir)Source\CNTKv2LibraryDll\API + $(SolutionDir)Source\CNTKv2LibraryDll\API;$(HalideInclude) - $(OutDir);$(ProjectDir)\halide;$(SolutionDir)$(Platform)\$(Configuration) + $(OutDir);$(HalideLibPath);$(SolutionDir)$(Platform)\$(Configuration) @@ -84,7 +83,7 @@ Console true - Cntk.Core-$(CntkComponentVersion).lib;halide_convolve_nofeatures.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + 
Cntk.Core-$(CntkComponentVersion).lib;$(HalideLib);kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -99,13 +98,15 @@ Speed /d2Zi+ %(AdditionalOptions) false + + Console true true true - Cntk.Core-$(CntkComponentVersion).lib;halide_convolve_nofeatures.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + Cntk.Core-$(CntkComponentVersion).lib;$(HalideLib);kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -118,4 +119,4 @@ - \ No newline at end of file + diff --git a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj.filters b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj.filters similarity index 90% rename from Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj.filters rename to Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj.filters index 1180532188d..a494fafcfdb 100644 --- a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj.filters +++ b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolutionLib.vcxproj.filters @@ -23,5 +23,8 @@ Header Files + + Header Files + - + \ No newline at end of file diff --git a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolveOp.h b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolveOp.h similarity index 53% rename from Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolveOp.h rename to Source/Extensibility/BinaryConvolutionLib/BinaryConvolveOp.h index 70f1abc4b0c..b6e2a94fd8c 100644 --- a/Examples/Extensibility/BinaryConvolution/BinaryConvolutionLib/BinaryConvolveOp.h +++ b/Source/Extensibility/BinaryConvolutionLib/BinaryConvolveOp.h @@ -5,11 +5,85 @@ // This file contains an implementation of single bit binarization using an optimized halide function call +#include "halide_binary_convolve.h" #include "CNTKLibrary.h" -#include "convolve_wrapper.h" using namespace CNTK; +int convolutional_out_size(int x, int size, int stride, bool pad) +{ + if (!pad) x -= size; + else x -= 1; + return x/stride + 1; +} + +void binarize_array(const float *input, int size, int64_t *binary) +{ + for (int i = 0; i < size; ++i) { + int index = i; + int block = index/64; + int bit = index%64; + float input_val = input[index]; + if (input_val > 0) { + binary[block] |= ((uint64_t) 1 << bit); + } else { + binary[block] &= ~((uint64_t) 1 << bit); + } + } +} + +float pad_mask_check_pixel(int height, int width, int channels, + int row, int col, int channel, int pad) +{ + row -= pad; + col -= pad; + + if (row < 0 || col < 0 || + row >= height || col >= width) return false; + return true; +} + +void get_pad_mask(int channels, int height, int width, + int ksize, int stride, int pad, int64_t* pad_mask) +{ + int c,h,w; + int height_col = (height - ksize) / stride + 1; + int width_col = (width - ksize) / stride + 1; + int filter_size = ksize*ksize*channels; + int bit; + int block; + // pad just indicates that you want your windows to fit in nicely, add however many 0s as is needed (ksize/2) to make that happen, + // means pad should either be 1 or 0 in cfg file + if (pad){ + height_col = 1 + (height-1) / stride; + 
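// i.e. ceil(height/stride): e.g. height = 5, stride = 2 gives
// height_col = 1 + (5-1)/2 = 3; the same formula is applied to width
// below, and the effective border padding becomes ksize/2 (one pixel
// for a 3x3 kernel), matching the "fit in nicely" comment above.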
width_col = 1 + (width-1) / stride; + pad = ksize/2; + } + int output_size = height_col * width_col; + for (c = 0; c < output_size; ++c) { + int block_start = c * ((filter_size - 1)/64 + 1); + int w_offset = (c*stride) % width_col; + int h_offset = ((c*stride) / width_col) % height_col; + for (h = 0; h < channels; ++h) { + for (w = 0; w < (ksize*ksize); ++w) { + int im_row = h_offset + (w / ksize); + int im_col = w_offset + (w % ksize); + int col_offset = (h * ksize*ksize) + w; + // note that data col is an array of uint64 values, find which uint64 has the bit we want to set + block = block_start + (col_offset/64); + // now find the bit in that block that needs to be set + bit = col_offset % 64; + // finally, set or clear that bit + if (pad_mask_check_pixel(height, width, channels, im_row, im_col, h, pad)) { + pad_mask[block] |= ((uint64_t) 1 << bit); + } else { + pad_mask[block] &= ~((uint64_t) 1 << bit); + } + } + } + } +} + class BinaryConvolveFunction final : public Function { public: @@ -27,16 +101,31 @@ class BinaryConvolveFunction final : public Function // declares our function as a subset of the Function class and maps the dictionary values in BinaryConvolveFunction(const Variable& leftOperand, const Variable& rightOperand, const Dictionary& attributes, const std::wstring& name) : Function({ leftOperand, rightOperand }, Dictionary(attributes), name), Attr(Dictionary(attributes)) - {} + { + w = Attr[w_key].Value(); + h = Attr[h_key].Value(); + size = Attr[size_key].Value(); + stride = Attr[stride_key].Value(); + pad = Attr[pad_key].Value(); + channels = Attr[channels_key].Value(); + filters = Attr[filters_key].Value(); + out_h = convolutional_out_size(h, size, stride, pad); + out_w = convolutional_out_size(w, size, stride, pad); + const NDArrayViewPtr& weight_array = leftOperand.GetValue(); + weight_data = weight_array->DataBuffer(); + binary_weights = (int64_t *) malloc(((size*size*channels)/64)*filters*sizeof(int64_t)); + pad_mask = (int64_t *) malloc((size*size*channels/64)*out_h*out_w*sizeof(int64_t)); + binarize_array(weight_data, size*size*channels*filters, binary_weights); + Executor = new HalideBinaryConvolve(binary_weights, pad_mask, w, h, channels, filters, size, stride, pad); + } private: // simple convolve function that pulls out raw data buffers and passes them into our halide function - static void Convolve(const NDArrayViewPtr& weights, const NDArrayViewPtr& input, const int size, const int stride, const bool pad, const int w, const int h, const int channels, const int num_filters, NDArrayViewPtr& output) + void Convolve(const NDArrayViewPtr& input, NDArrayViewPtr& output) { - auto weightBuffer = weights->DataBuffer(); auto inputBuffer = input->DataBuffer(); auto outBuffer = output->WritableDataBuffer(); - invoke_halide_convolve(weightBuffer, inputBuffer, num_filters, size, channels, pad, stride, w, h, outBuffer); + Executor->realize(inputBuffer, outBuffer); } // forward function definition, needs to parse the data and call into the Convolve function @@ -49,22 +138,6 @@ class BinaryConvolveFunction final : public Function auto leftOperandData = inputValues[0]->Data(); // pull out the activation data from inputValues auto rightOperandData = inputValues[1]->Data(); - // determine the number of filters in the input - auto kernelRank = leftOperandData->Shape().Rank(); - long unsigned int num_filters; - if (kernelRank >= 4) { - num_filters = (long unsigned int)leftOperandData->Shape()[3]; - } else { - num_filters = 1; - } - // extract some basic information that is needed 
by halide - auto channels = leftOperandData->Shape()[2]; - auto w = rightOperandData->Shape()[0]; - auto h = rightOperandData->Shape()[1]; - - auto pad = Attr[padkey].Value(); - auto size = Attr[sizekey].Value(); - auto stride = Attr[stridekey].Value(); // Allocate outputValue if needed auto& outputValue = outputs[this->Output()]; @@ -72,13 +145,13 @@ class BinaryConvolveFunction final : public Function { auto numOutCols = !pad ? (w - size)/stride + 1 : (w - 1)/stride + 1; auto numOutRows = !pad ? (h - size)/stride + 1 : (h - 1)/stride + 1; - outputValue = MakeSharedObject(MakeSharedObject(DataType::Float, NDShape({ numOutRows , numOutCols, num_filters }), computeDevice)); + outputValue = MakeSharedObject(MakeSharedObject(DataType::Float, NDShape({ (long unsigned int) numOutRows, (long unsigned int) numOutCols, (long unsigned int) filters }), computeDevice)); } // extract the output data auto outputData = outputValue->Data(); // pass everything to Halide to compute the result, outputs are directly stored in the outputData buffer - Convolve(leftOperandData, rightOperandData, size, stride, pad, (int)w, (int)h, (int)channels, (int)num_filters, outputData); + Convolve(rightOperandData, outputData); // Let's save the right input's Value in the BackPropSate to be used in the backward pass for computing gradients return MakeSharedObject(this->shared_from_this(), computeDevice, std::unordered_map({ {Inputs()[1], inputValues[1] } })); @@ -103,9 +176,26 @@ class BinaryConvolveFunction final : public Function size_t CurrentVersion() const override { NOT_IMPLEMENTED; } // create a dictionary of attributes with a few specific keys const Dictionary Attr; - const wchar_t* padkey = L"padding"; - const wchar_t* stridekey = L"stride"; - const wchar_t* sizekey = L"size"; + const wchar_t* pad_key = L"padding"; + const wchar_t* stride_key = L"stride"; + const wchar_t* size_key = L"size"; + const wchar_t* w_key = L"w"; + const wchar_t* h_key = L"h"; + const wchar_t* channels_key = L"channels"; + const wchar_t* filters_key = L"filters"; + bool pad; + int stride; + int size; + int w; + int h; + int channels; + int filters; + int out_w; + int out_h; + int64_t *binary_weights; + int64_t *pad_mask; + const float *weight_data; + HalideBinaryConvolve *Executor; // Compute the dimensions of the output variable and return the proper shape and dynamic axes void InferOutputs(std::vector& outputs) override @@ -125,9 +215,9 @@ class BinaryConvolveFunction final : public Function auto w = rightOperand.Shape()[0]; auto h = rightOperand.Shape()[1]; - auto pad = Attr[padkey].Value(); - auto size = Attr[sizekey].Value(); - auto stride = Attr[stridekey].Value(); + auto pad = Attr[pad_key].Value(); + auto size = Attr[size_key].Value(); + auto stride = Attr[stride_key].Value(); // compute the output dimensions auto numOutCols = !pad ? 
(w - size)/stride + 1 : (w - 1)/stride + 1; diff --git a/Source/Extensibility/BinaryConvolutionLib/halide_binary_convolve.h b/Source/Extensibility/BinaryConvolutionLib/halide_binary_convolve.h new file mode 100644 index 00000000000..294a7213fbb --- /dev/null +++ b/Source/Extensibility/BinaryConvolutionLib/halide_binary_convolve.h @@ -0,0 +1,96 @@ +#ifndef HALIDE_BINARY_CONVOLVE +#define HALIDE_BINARY_CONVOLVE + +#include "Halide.h" + +using namespace Halide; + +class HalideBinaryConvolve { + Buffer input; + Func output; + Target t; + Buffer weights; + Buffer pad_mask_buf; + int filters; + int size; + int stride; + bool pad; + int w; + int h; + int channels; + int out_x; + int out_y; + int bin_width; +public: + HalideBinaryConvolve(int64_t *W_in, int64_t *pad_mask, int w, int h, int channels, int filters, int size, int stride, bool pad, bool gpu = false) : + input(Buffer(w,h,channels)), + weights(Buffer(W_in, (size*size*channels - 1)/64 + 1, filters)), + pad_mask_buf(Buffer(pad_mask, (size*size*channels - 1)/64 + 1, (!pad ? (w - size) / stride + 1 : (w - 1)/stride + 1)*(!pad ? (h - size) / stride + 1 : (h - 1)/stride + 1))), + filters(filters), + size(size), + stride(stride), + pad(pad), + w(w), + h(h), + channels(channels), + out_x(!pad ? (w - size) / stride + 1 : (w - 1)/stride + 1), + out_y(!pad ? (h - size) / stride + 1 : (h - 1)/stride + 1), + bin_width((size*size*channels - 1)/64 + 1), + t(get_host_target()) + { + Var x("x"), y("y"), c("c"), f("f"), k("k"); + Func Input("Input"); + Input(x, y, c) = BoundaryConditions::constant_exterior(input, 0)(x, y, c); + + Func binarize_input("binarize_input"), bit_mask("bit_mask"), mask_count("mask_count"); + RDom r(0, 64); + + Expr w_offset = (y % out_x)*stride; + Expr h_offset = ((y / out_x) % out_y) * stride; + + Expr im_row = h_offset + ((64*x + r.x)/size) % size - select(pad, size/2, 0); + Expr im_col = w_offset + (64*x + r.x) % size - select(pad, size/2, 0); + Expr im_chan = (64*x + r.x) / size / size; + + RDom bw(0, bin_width); + + binarize_input(x, y) = sum(select(Input(im_col, im_row, im_chan) > 0, cast(1) << r.x, cast(0)), "compress_inputs"); + //bit_mask(x, y) = sum(select((im_row < 0 || im_col < 0 || im_row >= input.height() || im_col >= input.width()), cast(0) << r.x, cast(1) << r.x), "make_bitmask"); + bit_mask(x, y) = pad_mask_buf(x, y); + mask_count(y) = sum(popcount(~bit_mask(bw.x, y)), "mask_count"); + + Func binarize_weights("binarize_weights"); + //RDom n(0, weights.width()); + //binarize_weights(x, f) = sum(select(weights(64*x + r.x, f) > 0, (cast(1)) << r.x, cast(0)), "compress_weights"); + binarize_weights(x, f) = weights(x, f); + + Func xnor("xnor"); + xnor(k, x, y) = (popcount(bit_mask(k, x) & (binarize_weights(k, y) ^ binarize_input(k, x)))); + + output(x, y) = -((2 * cast(sum(xnor(bw.x, x, y), "accumulate"))) - (64*bin_width) + mask_count(x)); + if (!gpu) { + //output.reorder(y, x); + //output.vectorize(y, 8); + //output.parallel(x, 8); + //binarize_input.compute_at(output, x); + //bit_mask.compute_at(output, x); + output.compute_root(); + output.parallel(y, 8); + output.vectorize(x, 8); + binarize_input.store_root().compute_root(); + binarize_input.vectorize(x, 8); + binarize_input.parallel(y, 8); + //bit_mask.compute_root(); + //t.set_feature(Target::Profile); + } + output.compile_jit(t); + } + + void realize(const float *in_array, float *out_array) { + Buffer outbuf = Buffer(out_array, out_x*out_y, filters); + std::memcpy(input.get()->data(), in_array, w*h*channels*sizeof(float)); + output.realize(outbuf); + } +}; + 
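// The schedule above JIT-compiles an XNOR-popcount pipeline: activations and
// weights are reduced to their sign bits, packed 64 per int64_t word, so a
// dot product of two {-1,+1} vectors becomes a popcount over XNOR'd words.
// A minimal standalone sketch of that arithmetic (plain C++, no Halide; the
// names below are illustrative and are not part of this header):

#include <bitset>
#include <cstdint>
#include <cstdio>
#include <vector>

// Pack the sign bits of `size` floats into 64-bit words (1 = positive),
// mirroring binarize_array() in BinaryConvolveOp.h.
static void pack_signs(const float* x, int size, std::vector<uint64_t>& bits)
{
    bits.assign((size + 63) / 64, 0);
    for (int i = 0; i < size; ++i)
        if (x[i] > 0)
            bits[i / 64] |= (uint64_t)1 << (i % 64);
}

// Dot product of two {-1,+1} vectors recovered from packed sign bits:
// matching lanes contribute +1, differing lanes -1, so
// dot = matches - (size - matches) = 2*matches - size.
static int binary_dot(const std::vector<uint64_t>& a, const std::vector<uint64_t>& b, int size)
{
    int matches = 0;
    for (size_t w = 0; w < a.size(); ++w)
    {
        uint64_t same = ~(a[w] ^ b[w]); // XNOR: 1 where signs agree
        if (w == a.size() - 1 && size % 64 != 0)
            same &= ((uint64_t)1 << (size % 64)) - 1; // mask out unused tail lanes
        matches += (int)std::bitset<64>(same).count();
    }
    return 2 * matches - size;
}

int main()
{
    float u[5] = { 0.5f, -1.0f,  2.0f, -0.1f, 3.0f }; // signs: + - + - +
    float v[5] = { 1.0f,  1.0f, -2.0f, -0.2f, 0.7f }; // signs: + + - - +
    std::vector<uint64_t> ub, vb;
    pack_signs(u, 5, ub);
    pack_signs(v, 5, vb);
    printf("%d\n", binary_dot(ub, vb, 5)); // 3 matching lanes -> 2*3 - 5 = 1
    return 0;
}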
+#endif diff --git a/Source/Extensibility/EvalWrapper/EvalWrapper.vcxproj b/Source/Extensibility/EvalWrapper/EvalWrapper.vcxproj index 8ea46842d9e..11d632d8a75 100644 --- a/Source/Extensibility/EvalWrapper/EvalWrapper.vcxproj +++ b/Source/Extensibility/EvalWrapper/EvalWrapper.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -33,14 +33,12 @@ DynamicLibrary true - v140 true Unicode DynamicLibrary false - v140 true Unicode @@ -64,7 +62,7 @@ $(OutDir) Cntk.Eval-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll + Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll diff --git a/Source/ImageWriterDll/ImageWriterDll.vcxproj b/Source/ImageWriterDll/ImageWriterDll.vcxproj index 4fb88cca2fd..dc2cee6e6e7 100644 --- a/Source/ImageWriterDll/ImageWriterDll.vcxproj +++ b/Source/ImageWriterDll/ImageWriterDll.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -33,13 +33,11 @@ DynamicLibrary true - v140 Unicode DynamicLibrary false - v140 true Unicode @@ -64,7 +62,7 @@ $(SolutionDir)$(Platform)\$(Configuration);$(OpenCvLibPath) - Cntk.Common-$(CntkComponentVersion).lib;$(OpenCvLib);%(AdditionalDependencies) + $(OpenCvLib);%(AdditionalDependencies) diff --git a/Source/ImageWriterDll/ImageWriterDll.vcxproj.filters b/Source/ImageWriterDll/ImageWriterDll.vcxproj.filters index d098fd98736..ef6936a2728 100644 --- a/Source/ImageWriterDll/ImageWriterDll.vcxproj.filters +++ b/Source/ImageWriterDll/ImageWriterDll.vcxproj.filters @@ -2,9 +2,7 @@ - - Misc - + diff --git a/Source/Math/BatchNormalizationEngine.cpp b/Source/Math/BatchNormalizationEngine.cpp index d94a0349efe..a2ea8033591 100644 --- a/Source/Math/BatchNormalizationEngine.cpp +++ b/Source/Math/BatchNormalizationEngine.cpp @@ -6,13 +6,13 @@ #include "stdafx.h" #include "BatchNormalizationEngine.h" #include "CuDnnFactories.h" -#include "Mkl2017DnnCommon.h" +#include "MklDnnCommon.h" namespace Microsoft { namespace MSR { namespace CNTK { -template -void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) +template +void BatchNormEngine::Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance, + InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) { assert(in.GetNumRows() == m_inOutT.GetNumElements()); assert(out.GetNumRows() == m_inOutT.GetNumElements()); @@ -63,9 +63,9 @@ void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const M } } -template -void BatchNormEngine::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, - const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) +template +void BatchNormEngine::Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, + const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) { assert(!savedMean.IsEmpty()); assert(!savedInvStdDev.IsEmpty()); @@ -73,12 +73,13 
@@ void BatchNormEngine::Backward(const Mat& in, const Mat& srcGrad, Mat& BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad); } -template -class CntkBatchNormEngine : public BatchNormEngine +template +class CntkBatchNormEngine : public BatchNormEngine { public: - using Base = BatchNormEngine; - using typename Base::Mat; + using Base = BatchNormEngine; + using typename Base::InoutMat; + using typename Base::StatMat; public: CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT, @@ -99,28 +100,30 @@ class CntkBatchNormEngine : public BatchNormEngine InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override + void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance, + InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override { #ifdef USE_MKL2017DNN if (in.GetCurrentMatrixLocation() == CPU && - ForwardCoreMKL(in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev)) + std::is_same::value && + ForwardCoreMKL(*(const StatMat*)&in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, *(StatMat*)&out, epsilon, savedMean, savedInvStdDev)) return; #endif in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev); } - void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev, - Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override + void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev, + StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override { #ifdef USE_MKL2017DNN if (srcGrad.GetCurrentMatrixLocation() == CPU && - BackwardCoreMKL(in, srcGrad, grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad)) + std::is_same::value && + BackwardCoreMKL(*(const StatMat*)&in, *(const StatMat*)&srcGrad, *(StatMat*)&grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad)) return; #endif if (!accumulateDataGrad) - grad.SetValue((ElemType)0); + grad.SetValue((InoutType)0); srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad); } @@ -147,7 +150,7 @@ class CntkBatchNormEngine : public BatchNormEngine struct MKLScaleShiftAdapter { bool isInput; - std::shared_ptr> mat; + std::shared_ptr> mat; dnnResourceType_t resourceType; size_t numChannels; @@ -155,19 +158,19 @@ class CntkBatchNormEngine : public BatchNormEngine { Clear(); numChannels = n; - mat = std::make_shared>(numChannels, 2, CPUDEVICE); + mat = std::make_shared>(numChannels, 2, CPUDEVICE); isInput = userToPrim; resourceType = rt; } void PrepareForExecution(void* scale, void* bias, void* resources[dnnResourceNumber]) { - ElemType* buffer = mat->Data(); + StatType* buffer = mat->Data(); resources[resourceType] = buffer; if (isInput) { - memcpy(buffer, scale, sizeof(ElemType) * 
numChannels); - memcpy(buffer + numChannels, bias, sizeof(ElemType) * numChannels); + memcpy(buffer, scale, sizeof(StatType) * numChannels); + memcpy(buffer + numChannels, bias, sizeof(StatType) * numChannels); } } @@ -176,9 +179,9 @@ class CntkBatchNormEngine : public BatchNormEngine if (isInput) RuntimeError("Cannot execute output ResourceAdapter for input"); - ElemType* buffer = mat->Data(); - memcpy(scale, buffer, sizeof(ElemType) * numChannels); - memcpy(bias, buffer + numChannels, sizeof(ElemType) * numChannels); + StatType* buffer = mat->Data(); + memcpy(scale, buffer, sizeof(StatType) * numChannels); + memcpy(bias, buffer + numChannels, sizeof(StatType) * numChannels); } void Clear() @@ -194,21 +197,21 @@ class CntkBatchNormEngine : public BatchNormEngine struct PrimitiveContext { - MKLDnnResourceAdapter input; - MKLDnnResourceAdapter output; + MKLDnnResourceAdapter input; + MKLDnnResourceAdapter output; MKLScaleShiftAdapter scaleShift; - std::shared_ptr varianceMat; // variance matrix used for converting InvStdDev + std::shared_ptr varianceMat; // variance matrix used for converting InvStdDev dnnPrimitive_t primitive = nullptr; dnnPrimitiveAttributes_t attributes = nullptr; void Clear() { - if (primitive) { dnnDelete(primitive); primitive = nullptr; } + if (primitive) { dnnDelete(primitive); primitive = nullptr; } input.Clear(); scaleShift.Clear(); output.Clear(); - if (attributes) { dnnPrimitiveAttributesDestroy(attributes); attributes = nullptr; } + if (attributes) { dnnPrimitiveAttributesDestroy(attributes); attributes = nullptr; } } ~PrimitiveContext() @@ -219,7 +222,7 @@ class CntkBatchNormEngine : public BatchNormEngine TensorShape m_shape; size_t m_numSamples; - ElemType m_epsilon; + StatType m_epsilon; public: MKLBatchNormalizationContext() : @@ -233,12 +236,12 @@ class CntkBatchNormEngine : public BatchNormEngine return !!(m_contextFlags & (1 << contextIndex)); } - void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, ElemType epsilon = 0) + void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, StatType epsilon = 0) { int flag = (1 << contextIndex); if (contextIndex == ContextIndex_Backward) { - epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (ElemType)DEFAULT_EPSILON; + epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? 
m_epsilon : (StatType)DEFAULT_EPSILON; } bool same = (shape == m_shape) && (numSamples == m_numSamples) && (epsilon == m_epsilon); @@ -285,10 +288,10 @@ class CntkBatchNormEngine : public BatchNormEngine { case ContextIndex_ForwardInfer: case ContextIndex_ForwardTrain: - CHECK_MKL(dnnLayoutCreate(<UserInput, inoutDim, inoutSizes, inoutStrides)); - CHECK_MKL(dnnLayoutCreate(<UserOutput, inoutDim, inoutSizes, inoutStrides)); - CHECK_MKL(dnnPrimitiveAttributesCreate(&ctx.attributes)); - CHECK_MKL(dnnBatchNormalizationCreateForward_v2( + CHECK_MKL(dnnLayoutCreate(<UserInput, inoutDim, inoutSizes, inoutStrides)); + CHECK_MKL(dnnLayoutCreate(<UserOutput, inoutDim, inoutSizes, inoutStrides)); + CHECK_MKL(dnnPrimitiveAttributesCreate(&ctx.attributes)); + CHECK_MKL(dnnBatchNormalizationCreateForward_v2( &ctx.primitive, ctx.attributes, ltUserInput, @@ -299,10 +302,10 @@ class CntkBatchNormEngine : public BatchNormEngine scaleShiftType = dnnResourceScaleShift; break; case ContextIndex_Backward: - CHECK_MKL(dnnLayoutCreate(<UserInput, inoutDim, inoutSizes, inoutStrides)); - CHECK_MKL(dnnLayoutCreate(<UserOutput, inoutDim, inoutSizes, inoutStrides)); - CHECK_MKL(dnnPrimitiveAttributesCreate(&ctx.attributes)); - CHECK_MKL(dnnBatchNormalizationCreateBackward_v2( + CHECK_MKL(dnnLayoutCreate(<UserInput, inoutDim, inoutSizes, inoutStrides)); + CHECK_MKL(dnnLayoutCreate(<UserOutput, inoutDim, inoutSizes, inoutStrides)); + CHECK_MKL(dnnPrimitiveAttributesCreate(&ctx.attributes)); + CHECK_MKL(dnnBatchNormalizationCreateBackward_v2( &ctx.primitive, ctx.attributes, ltUserInput, @@ -311,16 +314,16 @@ class CntkBatchNormEngine : public BatchNormEngine inputType = dnnResourceDiffDst; outputType = dnnResourceDiffSrc; scaleShiftType = dnnResourceDiffScaleShift; - ctx.varianceMat = std::make_shared(numChannels, 1, CPUDEVICE); + ctx.varianceMat = std::make_shared(numChannels, 1, CPUDEVICE); break; default: RuntimeError("Unexpected context type %d", (int)contextIndex); } - CHECK_MKL(dnnLayoutCreateFromPrimitive(<PrimInput, ctx.primitive, inputType)); + CHECK_MKL(dnnLayoutCreateFromPrimitive(<PrimInput, ctx.primitive, inputType)); ctx.input.Create(ltUserInput, ltPrimInput, inputType, true); - CHECK_MKL(dnnLayoutCreateFromPrimitive(<PrimOutput, ctx.primitive, outputType)); + CHECK_MKL(dnnLayoutCreateFromPrimitive(<PrimOutput, ctx.primitive, outputType)); ctx.output.Create(ltUserOutput, ltPrimOutput, outputType, false); ctx.scaleShift.Create(scaleShiftType, contextIndex != ContextIndex_Backward, numChannels); @@ -338,7 +341,7 @@ class CntkBatchNormEngine : public BatchNormEngine resources[dnnResourceMean] = runMean; resources[dnnResourceVariance] = runVariance; - CHECK_MKL(dnnExecute(ctx.primitive, resources)); + CHECK_MKL(dnnExecute(ctx.primitive, resources)); ctx.output.ConvertOutput(output); } @@ -352,16 +355,16 @@ class CntkBatchNormEngine : public BatchNormEngine ctx.output.PrepareForExecution(grad, resources); ctx.scaleShift.PrepareForExecution(scaleGrad, biasGrad, resources); - std::shared_ptr scaleShiftMat; - scaleShiftMat = std::make_shared(ctx.scaleShift.numChannels, 2, CPUDEVICE); - memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(ElemType)); + std::shared_ptr scaleShiftMat; + scaleShiftMat = std::make_shared(ctx.scaleShift.numChannels, 2, CPUDEVICE); + memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(StatType)); resources[dnnResourceScaleShift] = scaleShiftMat->Data(); // convert from InvStdDev to variance for (size_t i = 0; i < ctx.scaleShift.numChannels; i++) { 
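// (the forward pass stores savedInvStdDev = 1/sqrt(variance + epsilon),
// so variance is recovered here as 1/invStdDev^2 - epsilon; MKL's backward
// primitive expects a plain variance resource rather than an inverse
// standard deviation)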
- ElemType& v = ctx.varianceMat->Data()[i]; - ElemType& s = ((ElemType*)savedInvStdDev)[i]; + StatType& v = ctx.varianceMat->Data()[i]; + StatType& s = ((StatType*)savedInvStdDev)[i]; v = (1 / (s * s) - m_epsilon); } @@ -369,7 +372,7 @@ class CntkBatchNormEngine : public BatchNormEngine resources[dnnResourceMean] = savedMean; resources[dnnResourceVariance] = ctx.varianceMat->Data(); - CHECK_MKL(dnnExecute(ctx.primitive, resources)); + CHECK_MKL(dnnExecute(ctx.primitive, resources)); ctx.output.ConvertOutput(grad); ctx.scaleShift.ConvertOutput(scaleGrad, biasGrad); @@ -377,15 +380,15 @@ class CntkBatchNormEngine : public BatchNormEngine }; MKLBatchNormalizationContext m_mklContext; - std::shared_ptr m_dataGradWorkspace; + std::shared_ptr m_dataGradWorkspace; - bool ForwardCoreMKL(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) + bool ForwardCoreMKL(const StatMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, StatMat& runMean, StatMat& runVariance, + StatMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) { ContextIndex contextIndex = inferenceOnly ? ContextIndex_ForwardInfer : ContextIndex_ForwardTrain; - m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (ElemType)epsilon); + m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (StatType)epsilon); if (inferenceOnly) { @@ -398,33 +401,33 @@ class CntkBatchNormEngine : public BatchNormEngine m_mklContext.Forward(in.Data(), out.Data(), scale.Data(), bias.Data(), savedMean.Data(), savedInvStdDev.Data(), contextIndex); // update savedMean, savedInvStdDev - ElemType OneMinusExpAvgFactor = (ElemType)(1.0 - expAvgFactor); - cblas_axpby((MKL_INT)runMean.GetNumElements(), (ElemType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data()); + StatType OneMinusExpAvgFactor = (StatType)(1.0 - expAvgFactor); + cblas_axpby((MKL_INT)runMean.GetNumElements(), (StatType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data()); // note savedInvStdDev currently hold variance of in.Data(), need to convert to InvStdDev and interpolate - ElemType numReduced = (ElemType)(in.GetNumElements() / runVariance.GetNumElements()); - ElemType bcf = numReduced / (numReduced - 1); + StatType numReduced = (StatType)(in.GetNumElements() / runVariance.GetNumElements()); + StatType bcf = numReduced / (numReduced - 1); for (size_t i = 0; i < runVariance.GetNumElements(); i++) { - ElemType& v = runVariance.Data()[i]; - ElemType& s = savedInvStdDev.Data()[i]; - v = v * OneMinusExpAvgFactor + bcf * s * (ElemType)expAvgFactor; - s = (ElemType)1 / sqrt(s + (ElemType)epsilon); + StatType& v = runVariance.Data()[i]; + StatType& s = savedInvStdDev.Data()[i]; + v = v * OneMinusExpAvgFactor + bcf * s * (StatType)expAvgFactor; + s = (StatType)1 / sqrt(s + (StatType)epsilon); } } return true; } - bool BackwardCoreMKL(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, - const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) + bool BackwardCoreMKL(const StatMat& in, const StatMat& srcGrad, StatMat& grad, const StatMat& scale, + const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) { m_mklContext.Prepare(m_inOutT, m_spatial, srcGrad.GetNumCols(), ContextIndex_Backward); if (accumulateDataGrad) { if 
(!m_dataGradWorkspace) - m_dataGradWorkspace = std::make_shared>(0, 0, CPUDEVICE); + m_dataGradWorkspace = std::make_shared>(0, 0, CPUDEVICE); m_dataGradWorkspace->SetValue(grad); } @@ -432,23 +435,24 @@ class CntkBatchNormEngine : public BatchNormEngine m_mklContext.Backward(in.Data(), srcGrad.Data(), grad.Data(), scale.Data(), savedMean.Data(), savedInvStdDev.Data(), scaleGrad.Data(), biasGrad.Data()); if (accumulateDataGrad) - cblas_axpby((MKL_INT)grad.GetNumElements(), (ElemType)1.0, m_dataGradWorkspace->Data(), (ElemType)1.0, grad.Data()); + cblas_axpby((MKL_INT)grad.GetNumElements(), (StatType)1.0, m_dataGradWorkspace->Data(), (StatType)1.0, grad.Data()); return true; } #endif }; -template class CntkBatchNormEngine; -template class CntkBatchNormEngine; +template class CntkBatchNormEngine; +template class CntkBatchNormEngine; +template class CntkBatchNormEngine; template bool HasFlag(T src, T testFlag) { return ((int)src & (int)testFlag) != 0; } -template -std::unique_ptr> BatchNormEngine::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT, +template +std::unique_ptr> BatchNormEngine::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT, bool spatial, ImageLayoutKind imageLayout, BatchNormEngineKind enabledEngines) { @@ -458,7 +462,7 @@ std::unique_ptr> BatchNormEngine::Create(DEV if (GetMathLibTraceLevel() > 0) fprintf(stderr, "Using CNTK batch normalization engine.\n"); - return std::make_unique>(deviceId, inOutT, spatial, imageLayout); + return std::make_unique>(deviceId, inOutT, spatial, imageLayout); } if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn)) @@ -466,13 +470,14 @@ std::unique_ptr> BatchNormEngine::Create(DEV if (GetMathLibTraceLevel() > 0) fprintf(stderr, "Using cuDNN batch normalization engine.\n"); - return CuDnnBatchNormEngineFactory::Create(deviceId, inOutT, spatial, imageLayout); + return CuDnnBatchNormEngineFactory::Create(deviceId, inOutT, spatial, imageLayout); } RuntimeError("Could not find appropriate batch normalization engine."); } -template class BatchNormEngine; -template class BatchNormEngine; +template class BatchNormEngine; +template class BatchNormEngine; +template class BatchNormEngine; }}} diff --git a/Source/Math/BatchNormalizationEngine.h b/Source/Math/BatchNormalizationEngine.h index b6343dc46aa..2185713a6ef 100644 --- a/Source/Math/BatchNormalizationEngine.h +++ b/Source/Math/BatchNormalizationEngine.h @@ -25,22 +25,23 @@ enum class BatchNormEngineKind #pragma warning(push) #pragma warning(disable : 4251) -template +template class MATH_API BatchNormEngine { public: - using Mat = Matrix; + using InoutMat = Matrix; + using StatMat = Matrix; public: virtual ~BatchNormEngine() {}; - void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev); + void Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance, + InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev); - void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, - Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad); + void Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev, + StatMat& 
scaleGrad, StatMat& biasGrad, bool accumulateDataGrad); - static std::unique_ptr> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT, + static std::unique_ptr> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT, bool spatial, ImageLayoutKind imageLayout, BatchNormEngineKind enabledEngines = BatchNormEngineKind::All); @@ -56,11 +57,11 @@ class MATH_API BatchNormEngine virtual void EnsureCompatible() = 0; // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched - virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0; + virtual void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance, + InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev) = 0; - virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, - Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) = 0; + virtual void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev, + StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) = 0; protected: DEVICEID_TYPE m_deviceId; diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 86f4ecd46ca..e0ae9efa698 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -14,6 +14,7 @@ #include #include #include "QuantizedOperations.h" +#include "half.hpp" //#include "GPUMatrix.h" //#include "CPUSparseMatrix.h" @@ -108,7 +109,9 @@ class MATH_API CPUMatrix : public BaseMatrix const bool needAveMultiplier, const bool initialized); - void AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon); + template + void AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon); + void AdaDeltaFlushTimestamps(size_t cols, ElemType rho, int* timestamps, int currentTimestamp); void Reshape(const size_t numRows, const size_t numCols); @@ -417,16 +420,26 @@ class MATH_API CPUMatrix : public BaseMatrix void AveragePoolingBackward(const CPUMatrix& mpRowCol, const CPUMatrix& mpRowIndices, const CPUMatrix& indices, CPUMatrix& grad, const bool poolIncludePad, bool accumulateGradient) const; - void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, - CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; - void BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, - CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; + template + void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, + CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; + + template + void BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const 
CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, + CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; public: // These functions do not depend on the element type, i.e. you can call them on any element type static int SetNumThreads(int numThreads); static int GetMaxNumThreads(); + enum OptimizationFlag + { + OPT_EVAL_WITH_MKL = 1, // using Intel MKL functions for evaluation performance + }; + static void SetOptimizationFlags(int flags); + static int GetOptimizationFlags(); + static void SetCompatibleMode(); // static BLAS functions @@ -456,6 +469,7 @@ class MATH_API CPUMatrix : public BaseMatrix static void InnerProduct(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const bool isColWise); static ElemType InnerProductOfMatrices(const CPUMatrix& a, const CPUMatrix& b); static void ElementWisePower(ElemType alpha, const CPUMatrix& a, CPUMatrix& c); + static void BatchMatMul(ElemType beta, const CPUMatrix& a, const bool transposeA, const int m, const CPUMatrix& b, const bool transposeB, const int n, CPUMatrix& c, const bool isColWise); static bool AreEqual(const CPUMatrix& a, const CPUMatrix& b, const ElemType threshold = 1e-8); @@ -572,9 +586,37 @@ class MATH_API CPUMatrix : public BaseMatrix void Clear(); void ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1); + +private: + static int m_optimizationFlags; }; typedef CPUMatrix CPUSingleMatrix; typedef CPUMatrix CPUDoubleMatrix; +typedef CPUMatrix CPUHalfMatrix; + +template +void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix& a, CPUMatrix& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 3>& regularStrides, + const SmallVector& reducingOpDims, const array, 3>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, CPUMatrix& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 4>& regularStrides, + const SmallVector& reducingOpDims, const array, 4>& reducingStrides); + +template +void CPUMatrixTensorArgOpImpl(const CPUMatrix& a, CPUMatrix& o, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); }}} diff --git a/Source/Math/CPUMatrixDouble.cpp b/Source/Math/CPUMatrixDouble.cpp index a42c75426c7..9e8a0c7d3e2 100644 --- a/Source/Math/CPUMatrixDouble.cpp +++ b/Source/Math/CPUMatrixDouble.cpp @@ -9,4 +9,5 @@ namespace Microsoft { namespace MSR { namespace CNTK { // explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
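// Note the companion change in CPUMatrixFloat.cpp/CPUMatrixHalf.cpp: each
// per-type translation unit defines the new m_optimizationFlags static with
// its own default (OPT_EVAL_WITH_MKL for float; 0 for double and half). A
// minimal sketch of this per-specialization static pattern (illustrative
// names, not CNTK code):

#include <cstdio>

template <typename T>
struct Mat
{
    static int flags; // one copy of the static per instantiated type
};

// Each explicit specialization chooses its own default value.
template <> int Mat<float>::flags = 1;  // e.g. enable a fast eval path
template <> int Mat<double>::flags = 0; // conservative default

int main()
{
    printf("%d %d\n", Mat<float>::flags, Mat<double>::flags); // prints: 1 0
    return 0;
}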
template class MATH_API CPUMatrix; + template<> int CPUMatrix::m_optimizationFlags = 0; }}} \ No newline at end of file diff --git a/Source/Math/CPUMatrixFloat.cpp b/Source/Math/CPUMatrixFloat.cpp index 574955b06e4..3fba148bed4 100644 --- a/Source/Math/CPUMatrixFloat.cpp +++ b/Source/Math/CPUMatrixFloat.cpp @@ -21,4 +21,5 @@ namespace Microsoft { namespace MSR { namespace CNTK { // explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash. template class MATH_API CPUMatrix; + template<> int CPUMatrix::m_optimizationFlags = CPUMatrix::OPT_EVAL_WITH_MKL; // enable eval MKL optimization by default }}} \ No newline at end of file diff --git a/Source/Math/CPUMatrixHalf.cpp b/Source/Math/CPUMatrixHalf.cpp new file mode 100644 index 00000000000..bd173ffe1e4 --- /dev/null +++ b/Source/Math/CPUMatrixHalf.cpp @@ -0,0 +1,133 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// +#include "stdafx.h" +#include "CPUMatrixImpl.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +// General conversion function with no performance optimization +// this should only be used in CPU half precision +// For performance on inference on CPU, user should convert fp16 model to fp32 first, unless MKL supports half precision +template +static void ConvertBuffer(DstT* dst, const SrcT* src, size_t count) +{ + for (size_t i = 0; i < count; i++) + { + dst[i] = (DstT)src[i]; + } +} + +// specialization to convert from half to float for computation, and then store in half +template <> +void CPUMatrix::MultiplyAndWeightedAdd(half alpha, const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, + half beta, CPUMatrix& c, shared_ptr> pQuantizedMultiplier) +{ + CPUMatrix af(a.GetNumRows(), a.GetNumCols()); + CPUMatrix bf(b.GetNumRows(), b.GetNumCols()); + CPUMatrix cf(c.GetNumRows(), c.GetNumCols()); + + if (alpha != 0) + { + ConvertBuffer(af.Data(), a.Data(), a.GetNumElements()); + ConvertBuffer(bf.Data(), b.Data(), b.GetNumElements()); + } + + if (beta != 0) + { + ConvertBuffer(cf.Data(), c.Data(), c.GetNumElements()); + } + + if (pQuantizedMultiplier) + RuntimeError("Quantized matrix multiply not supported for Half"); + + CPUMatrix::MultiplyAndWeightedAdd((float)alpha, af, transposeA, bf, transposeB, (float)beta, cf, nullptr); + + ConvertBuffer(c.Data(), cf.Data(), c.GetNumElements()); +} + +// specialization to RunTimeError for now due to omp implementation only support build-in type +template <> +void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& softmax, CPUMatrix& c) +{ + RuntimeError("half AssignSoftmaxSum not supported."); +} + +template <> +void CPUMatrix::AssignNCEUnnormalizedEval(const CPUMatrix& a, + const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c) +{ + RuntimeError("half AssignNCEUnnormalizedEval not supported."); +} + +template <> +void CPUMatrix::VectorSum(const CPUMatrix& a, CPUMatrix& c, const bool isColWise) +{ + RuntimeError("half VectorSum not supported."); +} + +template <> +void CPUMatrix::VectorNorm1(CPUMatrix& c, const bool isColWise) const +{ + RuntimeError("half VectorNorm1 not supported."); +} + +template <> +half CPUMatrix::SumOfElements() const +{ + RuntimeError("half SumOfElements not supported."); +} + +template <> +half CPUMatrix::MatrixNorm1() const +{ + RuntimeError("half MatrixNorm1 not supported."); +} + +template <> + half CPUMatrix::FrobeniusNorm() const +{ + RuntimeError("half FrobeniusNorm not 
supported."); +} + +template <> +void CPUMatrix::MaxPoolingBackward(const CPUMatrix& out, const CPUMatrix& in, + const CPUMatrix& mpRowCol, const CPUMatrix& mpRowIndices, const CPUMatrix& indices, + CPUMatrix& grad, bool accumulateGradient) const +{ + RuntimeError("half MaxPoolingBackward not supported."); +} + +template <> +void CPUMatrix::MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height, + const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix& roiData, CPUMatrix& grad, + CPUMatrix& argmax, double spatialScale) const +{ + RuntimeError("half MaxROIPoolingBackward not supported."); +} + +template <> +void CPUMatrix::AveragePoolingBackward(const CPUMatrix& mpRowCol, const CPUMatrix& mpRowIndices, const CPUMatrix& indices, CPUMatrix& grad, const bool poolIncludePad, bool accumulateGradient) const +{ + RuntimeError("half AveragePoolingBackward not supported."); +} + +// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash. +template class MATH_API CPUMatrix; +template<> int CPUMatrix::m_optimizationFlags = 0; + +// instantiate templated methods +template void CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, float learningRate, float rho, float epsilon); +template void CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, double learningRate, double rho, double epsilon); +template void CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, float learningRate, float rho, float epsilon); + +template void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; +template void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; +template void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; + +template void CPUMatrix::BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; +template void CPUMatrix::BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; +template void CPUMatrix::BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; + +}}} diff --git a/Source/Math/CPUMatrixImpl.h b/Source/Math/CPUMatrixImpl.h index a29537d433d..83819cf021e 100644 --- a/Source/Math/CPUMatrixImpl.h +++ b/Source/Math/CPUMatrixImpl.h @@ -114,10 +114,10 @@ static ElemType* NewArray(size_t n) { // We need to allocate possibly one more element for the following reason. 
// At some point we might want to fill a buffer with the result of a random - // number generator. The RNG is oblivious to whether the buffer is on the + // number generator. The RNG is oblivious to whether the buffer is on the // CPU or GPU but it needs to keep an accurate tally of how many numbers it - // has generated. The trouble stems from the fact that generating an odd - // number gaussians on the GPU is not supported so we must always + // has generated. The trouble stems from the fact that generating an odd + // number gaussians on the GPU is not supported so we must always // generate an even number. So since we wouldn't know how to update the tally // we are making this allocate one more element in the worst case. ElemType* p = new ElemType[AsMultipleOf(n, 2)](); @@ -897,7 +897,7 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E { ElemType* bufPtr = Data(); auto& us = *this; - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_column (j, us) @@ -905,7 +905,7 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E cblas_dcopy((int) numRows, reinterpret_cast(pArray + j), (int) numCols, reinterpret_cast(bufPtr + LocateColumn(j)), 1); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_column (j, us) @@ -916,6 +916,10 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E } } } + else + { + RuntimeError("Unsupported data format"); + } } } } @@ -1011,22 +1015,22 @@ void CPUMatrix::SetUniformRandomValue(const ElemType low, const ElemTy std::mt19937_64 generator; generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); - boost::random::uniform_real_distribution r(low, high); + boost::random::uniform_real_distribution r((double)low, (double)high); ElemType* bufPtr = Data(); long m = (long) GetNumElements(); // four-way unrolling for (long i = 0; i < (m & ~3); i += 4) { - bufPtr[i] = r(generator); - bufPtr[i + 1] = r(generator); - bufPtr[i + 2] = r(generator); - bufPtr[i + 3] = r(generator); + bufPtr[i] = (ElemType)r(generator); + bufPtr[i + 1] = (ElemType)r(generator); + bufPtr[i + 2] = (ElemType)r(generator); + bufPtr[i + 3] = (ElemType)r(generator); } // handle remaining stuffs for (long i = m & ~3; i < m; i++) { - bufPtr[i] = r(generator); + bufPtr[i] = (ElemType)r(generator); } } @@ -1041,8 +1045,8 @@ void CPUMatrix::SetUniformRandomValue(RNGHandle& rngHandle, const Elem if (cpuRNGHandle == nullptr) LogicError("rngHandle must be a CPURNGHandle."); - boost::random::uniform_real_distribution r(low, high); - std::generate(Data(), Data() + GetNumElements(), [&cpuRNGHandle, &r]() {return r(cpuRNGHandle->Generator()); }); + boost::random::uniform_real_distribution r((double)low, (double)high); + std::generate(Data(), Data() + GetNumElements(), [&cpuRNGHandle, &r]() {return (ElemType)r(cpuRNGHandle->Generator()); }); } template @@ -1055,9 +1059,9 @@ void CPUMatrix::SetGaussianRandomValue(RNGHandle& rngHandle, const Ele if (cpuRNGHandle == nullptr) LogicError("rngHandle must be a CPURNGHandle."); - boost::random::normal_distribution r(mean, stdev); + boost::random::normal_distribution r((double)mean, (double)stdev); auto n = AsMultipleOf(GetNumElements(), 2); - std::generate(Data(), Data() + n, [&cpuRNGHandle, &r]() {return r(cpuRNGHandle->Generator()); }); + std::generate(Data(), Data() + n, [&cpuRNGHandle, &r]() {return (ElemType)r(cpuRNGHandle->Generator()); }); } template @@ -1070,8 +1074,8 @@ void 
CPUMatrix::SetGumbelRandomValue(RNGHandle& rngHandle, const ElemT if (cpuRNGHandle == nullptr) LogicError("rngHandle must be a CPURNGHandle."); - boost::random::uniform_real_distribution r(0, 1); - std::generate(Data(), Data() + GetNumElements(), [&cpuRNGHandle, &r, loc, scale]() {return loc - scale * log(-log1p(-r(cpuRNGHandle->Generator()))); }); + boost::random::uniform_real_distribution r(0, 1); + std::generate(Data(), Data() + GetNumElements(), [&cpuRNGHandle, &r, loc, scale]() {return (ElemType)(loc - scale * log(-log1p(-r(cpuRNGHandle->Generator())))); }); } @@ -1087,12 +1091,12 @@ void CPUMatrix::SetGaussianRandomValue(const ElemType mean, const Elem auto& us = *this; std::mt19937_64 generator(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); - boost::random::normal_distribution r(mean, sigma); + boost::random::normal_distribution r((double)mean, (double)sigma); // #pragma omp parallel for is not thread safe. Also the results would not be deterministic foreach_coord (i, j, us) { - us(i, j) = r(generator); + us(i, j) = (ElemType)r(generator); } } @@ -1108,7 +1112,7 @@ void CPUMatrix::SetTruncatedNormalRandomValue(const ElemType mean, con auto& us = *this; std::mt19937_64 generator(seed == USE_TIME_BASED_SEED ? (unsigned long)time(NULL) : seed); - boost::random::normal_distribution r(mean, sigma); + boost::random::normal_distribution r((double)mean, (double)sigma); const ElemType high = mean + 2 * sigma; const ElemType low = mean - 2 * sigma; @@ -1117,8 +1121,8 @@ void CPUMatrix::SetTruncatedNormalRandomValue(const ElemType mean, con { ElemType tmp = 0; do - tmp = r(generator); - while (tmp < low || tmp > high ); // Rejection sampling is fine here because the acceptance probability is about 0.9545 + tmp = (ElemType)r(generator); + while (tmp < low || tmp > high ); // Rejection sampling is fine here because the acceptance probability is about 0.9545 us(i, j) = tmp; } } @@ -1136,7 +1140,7 @@ void CPUMatrix::AddGaussianRandomValue(const ElemType mean, const Elem std::mt19937_64 generator; generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed); - boost::random::normal_distribution r(mean, sigma); + boost::random::normal_distribution r((double)mean, (double)sigma); long m = (long) GetNumRows(), n = (long) GetNumCols(); for (long j = 0; j < n; j++) @@ -1144,10 +1148,10 @@ void CPUMatrix::AddGaussianRandomValue(const ElemType mean, const Elem // four-way unrolling for (long i = 0; i < (m & ~3); i += 4) { - us(i, j) = r(generator); - us(i + 1, j) = r(generator); - us(i + 2, j) = r(generator); - us(i + 3, j) = r(generator); + us(i, j) = (ElemType)r(generator); + us(i + 1, j) = (ElemType)r(generator); + us(i + 2, j) = (ElemType)r(generator); + us(i + 3, j) = (ElemType)r(generator); } // handle remaining stuffs for (long i = m & ~3; i < m; i++) @@ -1170,7 +1174,7 @@ void CPUMatrix::SetUniformRandomMask(const ElemType maskRate, const El LogicError("rngHandle must be a CPURNGHandle."); auto& us = *this; - boost::random::uniform_real_distribution r(0, 1); + boost::random::uniform_real_distribution r(0, 1); long m = (long) GetNumRows(), n = (long) GetNumCols(); ElemType v; for (long j = 0; j < n; j++) @@ -1178,20 +1182,20 @@ void CPUMatrix::SetUniformRandomMask(const ElemType maskRate, const El // four-way unrolling for (long i = 0; i < (m & ~3); i += 4) { + v = (ElemType)r(cpuRNGHandle->Generator()); + us(i, j) = v <= maskRate ? (ElemType)0 : scaleValue; v = r(cpuRNGHandle->Generator()); - us(i, j) = v <= maskRate ? 
0 : scaleValue; + us(i + 1, j) = v <= maskRate ? (ElemType)0 : scaleValue; v = r(cpuRNGHandle->Generator()); - us(i + 1, j) = v <= maskRate ? 0 : scaleValue; + us(i + 2, j) = v <= maskRate ? (ElemType)0 : scaleValue; v = r(cpuRNGHandle->Generator()); - us(i + 2, j) = v <= maskRate ? 0 : scaleValue; - v = r(cpuRNGHandle->Generator()); - us(i + 3, j) = v <= maskRate ? 0 : scaleValue; + us(i + 3, j) = v <= maskRate ? (ElemType)0 : scaleValue; } // handle remaining stuffs for (long i = m & ~3; i < m; i++) { - v = r(cpuRNGHandle->Generator()); - us(i, j) = v <= maskRate ? 0 : scaleValue; + v = (ElemType)r(cpuRNGHandle->Generator()); + us(i, j) = v <= maskRate ? (ElemType)0 : scaleValue; } } } @@ -1347,7 +1351,7 @@ void CPUMatrix::Adam(CPUMatrix& gradients, CPUMatrix::RmsProp(CPUMatrix& gradients, } template -void CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon) +template +void CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon) { size_t numColsNeeded = 2 * gradients.GetNumCols(); @@ -1469,7 +1474,7 @@ void CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix::AdaDelta(CPUMatrix& gradients, CPUMatrix::AdaDeltaFlushTimestamps(size_t cols, ElemType rho, int #pragma omp parallel for for (auto col = 0; col < cols; ++col) { - auto decay = std::pow(rho, ElemType(currentTimestamp - timestamps[col])); + ElemType decay = std::pow(rho, ElemType(currentTimestamp - timestamps[col])); auto offset = rows * col; timestamps[col] = 0; for (auto row = 0; row < rows; ++row) @@ -2504,9 +2509,13 @@ CPUMatrix& CPUMatrix::AssignHardmaxOf(const CPUMatrix& CPUMatrix::AssignHardmaxOf(const CPUMatrix& CPUMatrix::AssignHardmaxOf(const CPUMatrix::SumOfAbsElements() const if (IsEmpty()) LogicError("SumOfAbsElements: Matrix is empty."); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast(Data()), 1); } - else + else if (std::is_same::value) { #pragma warning(suppress : 4244) return cblas_sasum((int) GetNumElements(), reinterpret_cast(Data()), 1); } + else + { + RuntimeError("Unsupported data format"); + } } //sum of all elements @@ -3279,7 +3298,7 @@ CPUMatrix& CPUMatrix::ScatterToIndices(const CPUMatrixGetNumCols()); return *this; @@ -3408,7 +3427,7 @@ void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWi { c.RequireSize(1, n); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_column (j, c) @@ -3416,7 +3435,7 @@ void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWi c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast(bufPtr + us.LocateColumn(j)), 1); } } - else + else if(std::is_same::value) { #pragma omp parallel for foreach_column (j, c) @@ -3425,12 +3444,16 @@ void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWi c(0, j) = cblas_snrm2(m, reinterpret_cast(bufPtr + us.LocateColumn(j)), 1); } } + else + { + RuntimeError("Unsupported data format"); + } } else { c.RequireSize(m, 1); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -3438,7 +3461,7 @@ void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWi c(i, 0) = cblas_dnrm2(n, reinterpret_cast(bufPtr + i), m); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -3447,6 +3470,10 @@ void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWi c(i, 0) = 
cblas_snrm2(n, reinterpret_cast(bufPtr + i), m); } } + else + { + RuntimeError("Unsupported data format"); + } } } @@ -3480,7 +3507,7 @@ void CPUMatrix::VectorNormInf(CPUMatrix& c, const bool isCol ElemType v = 0; foreach_row (i, us) { - v = std::max(v, abs(us(i, j))); + v = std::max(v, fabs_(us(i, j))); } c(0, j) = v; } @@ -3495,7 +3522,7 @@ void CPUMatrix::VectorNormInf(CPUMatrix& c, const bool isCol ElemType v = 0; foreach_column (j, us) { - v = std::max(v, abs(us(i, j))); + v = std::max(v, fabs_(us(i, j))); } c(i, 0) = v; } @@ -3688,7 +3715,7 @@ ElemType CPUMatrix::MatrixNormInf() const { #pragma omp critical { - v = std::max(v, abs(us(i, j))); + v = std::max(v, fabs_(us(i, j))); } } return v; @@ -4057,7 +4084,7 @@ void CPUMatrix::Print(const char* matrixName, ptrdiff_t rowFirst, ptrd fprintf(stderr, "...\t"); j = colRange.skipEnd; } - fprintf(stderr, "%.10f\t", us(i, j)); + fprintf(stderr, "%.10f\t", (double)us(i, j)); } if (colRange.end < GetNumCols()) // ... at line end fprintf(stderr, "..."); @@ -4683,9 +4710,9 @@ void CPUMatrix::MaxPoolingBackward(const CPUMatrix& out, con assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows()); if (in(colBase + dcol, sample) >= m) { -#pragma omp atomic +#pragma omp atomic grad(colBase + dcol, sample) += g; - break; + break; } } } @@ -4699,14 +4726,14 @@ void CPUMatrix::MaxPoolingBackward(const CPUMatrix& out, con // corresponding to the ROI and which pixels in that subset should go into the // output location, then takes the max value over that window. // src: Images [W x H x C x N] -// roiData: ROIs [4 x numROIs x N], +// roiData: ROIs [4 x numROIs x N], // dst: Pooled ROIs [PW x PH x C x numROIs x N] // argmax: max positions [PW x PH x C x numROIs x N] // spatialScale ratio of input feature map to the original image. // where PW = Pooled Width, PH = Pooled Height, C = Channels, N = Batch Size template void CPUMatrix::MaxROIPoolingForward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height, - const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix& roiData, CPUMatrix& output, + const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix& roiData, CPUMatrix& output, CPUMatrix& argmax, double spatialScale) const { size_t roiOutputSize = pooledHeight * pooledWidth * channels; @@ -4771,7 +4798,7 @@ void CPUMatrix::MaxROIPoolingForward(const size_t numRois, const size_ // [W x H x C x R x N]; R = ROIs per image size_t outputIdx = roiIdx * roiOutputSize + outw + outh * pooledWidth + c * pooledHeight * pooledWidth; size_t maxidx = 0; - ElemType maxval = isempty ? (ElemType)0 : -FLT_MAX; + ElemType maxval = isempty ? (ElemType)0 : (ElemType)-FLT_MAX; size_t baseIdx = c * height * width; for (size_t h = hstart; h < hend; h++) @@ -4802,7 +4829,7 @@ void CPUMatrix::MaxROIPoolingForward(const size_t numRois, const size_ // this pixel location as the maximum. If so, it increments the gradient term for the input location. template void CPUMatrix::MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height, - const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix& roiData, CPUMatrix& grad, + const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix& roiData, CPUMatrix& grad, CPUMatrix& argmax, double spatialScale) const { // loop over images in the batch. 
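For reference, the SetGumbelRandomValue hunk at the top of this file's changes samples by inverse CDF: if U ~ Uniform(0,1), then loc - scale * log(-log(U)) follows Gumbel(loc, scale), and log1p(-u) is used instead of log(1 - u) for accuracy when u is tiny. A minimal standalone sketch of that derivation (std::mt19937_64 standing in for the CPURNGHandle wrapper):

```cpp
#include <cmath>
#include <cstdio>
#include <random>

// Inverse-CDF sampling: U ~ Uniform(0,1)  =>  loc - scale*log(-log(U)) ~ Gumbel(loc, scale).
// Since 1-U is also Uniform(0,1), -log1p(-u) is a valid (and more accurate) stand-in for -log(u).
int main()
{
    std::mt19937_64 gen(42);
    std::uniform_real_distribution<double> u(0.0, 1.0);
    const double loc = 0.0, scale = 1.0;

    double sum = 0.0;
    const int n = 100000;
    for (int i = 0; i < n; i++)
        sum += loc - scale * std::log(-std::log1p(-u(gen)));

    // Gumbel(0,1) has mean equal to the Euler-Mascheroni constant, ~0.5772.
    std::printf("sample mean = %.3f (expect ~0.577)\n", sum / n);
    return 0;
}
```

The (ElemType) casts added throughout these hunks keep the arithmetic in double and narrow only on store, which is what lets the same template now instantiate for half as well.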
@@ -4989,7 +5016,7 @@ void CPUMatrix::AveragePoolingBackward(const CPUMatrix& mpRowCol, { int dcol = indices(i0 + i, 0); assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows()); -#pragma omp atomic +#pragma omp atomic grad(colBase + dcol, sample) += g; } } @@ -4997,9 +5024,10 @@ void CPUMatrix::AveragePoolingBackward(const CPUMatrix& mpRowCol, } template -void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, - CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, - CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const +template +void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, + CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, + CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const { if (GetNumRows() % scale.GetNumRows() != 0) LogicError("The number of rows of this matrx must be multiple of the number of rows of the scale matrix."); @@ -5021,7 +5049,7 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s { size_t imap = irow / spatialSize; ElemType stdDev = sqrt(runVariance(imap, 0) + epsilon); - out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0); + out(irow, icol) = (ElemType)(scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0)); } } } @@ -5033,16 +5061,17 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s for (long irow = 0; irow < out.GetNumRows(); irow++) { ElemType stdDev = sqrt(runVariance(irow, 0) + epsilon); - out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0); + out(irow, icol) = (ElemType)(scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0)); } } } } template -void CPUMatrix::BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, - const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, - CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const +template +void CPUMatrix::BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, + const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, + CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const { UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(blendFactor), UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad); RuntimeError("Batch normalization training on CPU is not yet implemented."); @@ -5114,15 +5143,19 @@ void CPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix if (pQuantizedMultiplier == nullptr) { - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { cblas_dgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast(a.Data()), lda, reinterpret_cast(b.Data()), ldb, beta, reinterpret_cast(c.Data()), ldc); } - else + else if (std::is_same::value) { #pragma warning(suppress : 4244) cblas_sgemm((CBLAS_ORDER) (int)MatrixOrder::ColMajor, mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast(a.Data()), lda, reinterpret_cast(b.Data()), ldb, beta, reinterpret_cast(c.Data()), ldc); } + else + { + RuntimeError("Unsupported data format"); + } } else { @@ -5200,18 +5233,22 @@ void CPUMatrix::SVD(const CPUMatrix& A, CPUMatrix& #if CNTK_UWP RuntimeError("Error, LAPACKE_*gesvd is not supported for UWP.\n"); #else - if 
(sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { std::vector superb(std::max(std::min(m, n) - 1, 1)); info = LAPACKE_dgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast(A.Data()), (int) lda, reinterpret_cast(SIGMA.Data()), reinterpret_cast(U.Data()), (int) ldu, reinterpret_cast(VT.Data()), (int) ldvt, &superb[0]); } - else + else if (std::is_same::value) { std::vector superb(std::max(std::min(m, n) - 1, 1)); info = LAPACKE_sgesvd((int) MatrixOrder::ColMajor, 'A', 'A', (int) m, (int) n, reinterpret_cast(A.Data()), (int) lda, reinterpret_cast(SIGMA.Data()), reinterpret_cast(U.Data()), (int) ldu, reinterpret_cast(VT.Data()), (int) ldvt, &superb[0]); } + else + { + RuntimeError("Unsupported data format"); + } #endif if (info > 0) @@ -5314,7 +5351,7 @@ CPUMatrix& CPUMatrix::AssignNCEDerivative(const CPUMatrix::AssignNoiseContrastiveEstimation(const CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& if ((int) c.GetNumRows() != m || (int) c.GetNumCols() != n) InvalidArgument("Dimension of matrix c does not match dimension of matrix a."); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { cblas_daxpy(len, alpha, reinterpret_cast(a.Data()), incx, reinterpret_cast(c.Data()), incy); } - else + else if (std::is_same::value) { #pragma warning(suppress : 4244) cblas_saxpy(len, alpha, reinterpret_cast(a.Data()), incx, reinterpret_cast(c.Data()), incy); } + else + { + RuntimeError("Unsupported data format"); + } } else if (a.GetNumElements() == 1) // scalar, add to all elements { @@ -5444,7 +5485,7 @@ void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& ElemType* aBufPtr = a.Data(); ElemType* cBufPtr = c.Data(); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_column (j, c) @@ -5452,7 +5493,7 @@ void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& cblas_daxpy(m, alpha, reinterpret_cast(aBufPtr), 1, reinterpret_cast(cBufPtr + c.LocateColumn(j)), 1); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_column (j, c) @@ -5461,6 +5502,10 @@ void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& cblas_saxpy(m, alpha, reinterpret_cast(aBufPtr), 1, reinterpret_cast(cBufPtr + c.LocateColumn(j)), 1); } } + else + { + RuntimeError("Unsupported data format"); + } } else // row vector, add it to all rows { @@ -5471,7 +5516,7 @@ void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& ElemType* aBufPtr = a.Data(); ElemType* cBufPtr = c.Data(); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -5479,7 +5524,7 @@ void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& cblas_daxpy(n, alpha, reinterpret_cast(aBufPtr), 1, reinterpret_cast(cBufPtr + i), m); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -5488,6 +5533,10 @@ void CPUMatrix::ScaleAndAdd(ElemType alpha, const CPUMatrix& cblas_saxpy(n, alpha, reinterpret_cast(aBufPtr), 1, reinterpret_cast(cBufPtr + i), m); } } + else + { + RuntimeError("Unsupported data format"); + } } } /// c += alpha * (a-b) @@ -5576,7 +5625,7 @@ void CPUMatrix::AddElementToElement(ElemType beta, const CPUMatrix= c.GetNumRows() || cj >= c.GetNumCols()) InvalidArgument("AddElementToElement: index out of range."); - ElemType us = beta ? beta * c(ci, cj) : 0; // do not multiply if beta is 0, could be a NaN + ElemType us = beta ? 
beta * c(ci, cj) : (ElemType)0; // do not multiply if beta is 0, could be a NaN us += a(ai, aj); c(ci, cj) = us; } @@ -5697,15 +5746,19 @@ template { memset(a.Data(), 0, sizeof(ElemType) * len); } - else if (sizeof(ElemType) == sizeof(double)) + else if (std::is_same::value) { cblas_dscal(len, alpha, reinterpret_cast(a.Data()), incx); } - else + else if (std::is_same::value) { #pragma warning(suppress : 4244) cblas_sscal(len, alpha, reinterpret_cast(a.Data()), incx); } + else + { + RuntimeError("Unsupported data format"); + } } /// Matrix multiply with col-major matrices: a = alpha[1,1] * a @@ -5746,7 +5799,7 @@ void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMa ElemType* aBufPtr = a.Data(); ElemType* bBufPtr = b.Data(); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_column (j, c) @@ -5754,7 +5807,7 @@ void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMa c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast(bBufPtr + b.LocateColumn(j)), 1); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_column (j, c) @@ -5763,6 +5816,10 @@ void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMa c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast(aBufPtr + a.LocateColumn(j)), 1, reinterpret_cast(bBufPtr + b.LocateColumn(j)), 1); } } + else + { + RuntimeError("Unsupported data format"); + } } else { @@ -5770,7 +5827,7 @@ void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMa ElemType* aBufPtr = a.Data(); ElemType* bBufPtr = b.Data(); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -5778,7 +5835,7 @@ void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMa c(i, 0) = cblas_ddot(n, reinterpret_cast(aBufPtr + i), m, reinterpret_cast(bBufPtr + i), m); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -5787,6 +5844,10 @@ void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMa c(i, 0) = cblas_sdot(n, reinterpret_cast(aBufPtr + i), m, reinterpret_cast(bBufPtr + i), m); } } + else + { + RuntimeError("Unsupported data format"); + } } } @@ -5806,15 +5867,19 @@ ElemType CPUMatrix::InnerProductOfMatrices(const CPUMatrix& if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1); } - else + else if (std::is_same::value) { #pragma warning(suppress : 4244) return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1); } + else + { + RuntimeError("Unsupported data format"); + } } template @@ -5851,6 +5916,119 @@ void CPUMatrix::ElementWisePower(ElemType alpha, const CPUMatrix +void CPUMatrix::BatchMatMul(ElemType beta, const CPUMatrix& a, const bool transposeA, const int m, const CPUMatrix& b, const bool transposeB, const int n, CPUMatrix& c, const bool isColWise) +{ + if (a.IsEmpty() || b.IsEmpty()) + LogicError("BatchMatMul: one of the input matrices is empty."); + + if (!isColWise) + LogicError("Only column wise is supported."); + + const int aSampleElemNum = (int)a.GetNumRows(); + const int aBatchSize = (int)a.GetNumCols(); + const int bSampleElemNum = (int)b.GetNumRows(); + const int bBatchSize = (int)b.GetNumCols(); + + assert(aSampleElemNum > 0 && 
aBatchSize > 0 && bSampleElemNum > 0 && bBatchSize > 0); + if (aBatchSize != bBatchSize) + InvalidArgument("BatchMatMul: Matrices a and b should have same batch size."); + + int k = aSampleElemNum / m; + int kb = bSampleElemNum / n; + if (k != kb) + InvalidArgument("BatchMatMul: Matrices a's cols number should match Matrices b's rows number."); + + size_t cSampleElemNum = m * n; + + if (beta == 0) + c.RequireSize(cSampleElemNum, aBatchSize); + else + c.VerifySize(cSampleElemNum, aBatchSize); // Can't resize if beta != 0 + +#ifdef USE_OPENBLAS + int lda, ldb, ldc; + CBLAS_TRANSPOSE blasTransA; + CBLAS_TRANSPOSE blasTransB; + lda = transposeA ? k : m; + ldb = transposeB ? n : k; + blasTransA = transposeA ? CblasTrans : CblasNoTrans; + blasTransB = transposeB ? CblasTrans : CblasNoTrans; + ldc = m; + std::vector a_array; + std::vector b_array; + std::vector c_array; + a_array.reserve(aBatchSize); + b_array.reserve(aBatchSize); + c_array.reserve(aBatchSize); + ElemType* aBufPtr = a.Data(); + ElemType* bBufPtr = b.Data(); + ElemType* cBufPtr = c.Data(); + for (size_t i = 0; i < aBatchSize; i++) + { + a_array.push_back(aBufPtr + a.LocateColumn(i)); + b_array.push_back(bBufPtr + b.LocateColumn(i)); + c_array.push_back(cBufPtr + c.LocateColumn(i)); + } + for (size_t i = 0; i < aBatchSize; i++) + { + if (sizeof(ElemType) == sizeof(double)) + { + double alpha = 1.0; + cblas_dgemm((CBLAS_ORDER)(int)MatrixOrder::ColMajor, blasTransA, blasTransB, m, n, k, alpha, reinterpret_cast(a_array[i]), lda, reinterpret_cast(b_array[i]), ldb, double(beta), reinterpret_cast(c_array[i]), ldc); + } + else + { + float alpha = 1.0f; + cblas_sgemm((CBLAS_ORDER)(int)MatrixOrder::ColMajor, blasTransA, blasTransB, m, n, k, alpha, reinterpret_cast(a_array[i]), lda, reinterpret_cast(b_array[i]), ldb, float(beta), reinterpret_cast(c_array[i]), ldc); + } + } + +#else + std::vector m_array(aBatchSize, m); + std::vector n_array(aBatchSize, n); + std::vector k_array(aBatchSize, k); + std::vector lda_array(aBatchSize, transposeA ? k : m); + std::vector ldb_array(aBatchSize, transposeB ? n : k); + std::vector ldc_array(aBatchSize, m); + std::vector group_size(1, aBatchSize); + std::vector transa_array(aBatchSize, transposeA ? CblasTrans : CblasNoTrans); + std::vector transb_array(aBatchSize, transposeB ? 
CblasTrans : CblasNoTrans); + std::vector a_array; + std::vector b_array; + std::vector c_array; + a_array.reserve(aBatchSize); + b_array.reserve(aBatchSize); + c_array.reserve(aBatchSize); + ElemType* aBufPtr = a.Data(); + ElemType* bBufPtr = b.Data(); + ElemType* cBufPtr = c.Data(); + for (size_t i = 0; i < aBatchSize; i++) + { + a_array.push_back(aBufPtr + a.LocateColumn(i)); + b_array.push_back(bBufPtr + b.LocateColumn(i)); + c_array.push_back(cBufPtr + c.LocateColumn(i)); + } + + if (sizeof(ElemType) == sizeof(double)) + { + std::vector alpha_array(group_size[0], 1.0); + std::vector beta_array(group_size[0], double(beta)); + cblas_dgemm_batch(CblasColMajor, &transa_array[0], &transb_array[0], &m_array[0], &n_array[0], &k_array[0], &alpha_array[0], + reinterpret_cast(&a_array[0]), &lda_array[0], reinterpret_cast(&b_array[0]), &ldb_array[0], &beta_array[0], + reinterpret_cast(&c_array[0]), &ldc_array[0], 1, &group_size[0]); + } + else + { + std::vector alpha_array(group_size[0], 1.0f); + std::vector beta_array(group_size[0], float(beta)); + cblas_sgemm_batch(CblasColMajor, &transa_array[0], &transb_array[0], &m_array[0], &n_array[0], &k_array[0], &alpha_array[0], + reinterpret_cast(&a_array[0]), &lda_array[0], reinterpret_cast(&b_array[0]), &ldb_array[0], &beta_array[0], + reinterpret_cast(&c_array[0]), &ldc_array[0], 1, &group_size[0]); + } +#endif +} + template bool CPUMatrix::AreEqual(const CPUMatrix& a, const CPUMatrix& b, const ElemType threshold /*= 1e-8*/) { @@ -5893,7 +6071,7 @@ void CPUMatrix::TensorShuffleScaleAndAdd(ElemType keepWeight, const CP size_t nb = (((t * S + s) * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S x T): k/K and s/S swapped assert(nb < N); // perform the computation - ElemType cval = keepWeight ? keepWeight * pb[nb] : 0; // if weight is 0 then don't bother to read memory (efficiency) or to multiply (NaN-safe) + ElemType cval = keepWeight ? 
keepWeight * pb[nb] : (ElemType)0; // if weight is 0 then don't bother to read memory (efficiency) or to multiply (NaN-safe) cval += scaleFactor * pa[na]; pc[nb] = cval; } @@ -6035,7 +6213,7 @@ void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, ElemType* aBufPtr = a.Data(); ElemType* bBufPtr = b.Data(); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { for (long j = 0; j < n; j++) { @@ -6049,7 +6227,7 @@ void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, } } } - else + else if (std::is_same::value) { for (long j = 0; j < n; j++) { @@ -6063,6 +6241,10 @@ void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, } } } + else + { + RuntimeError("Unsupported data format"); + } } else { @@ -6072,7 +6254,7 @@ void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, ElemType* aBufPtr = a.Data(); ElemType* bBufPtr = b.Data(); - if (sizeof(ElemType) == sizeof(double)) + if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -6080,7 +6262,7 @@ void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast(aBufPtr + i), m, reinterpret_cast(bBufPtr + i), m); } } - else + else if (std::is_same::value) { #pragma omp parallel for foreach_row (i, c) @@ -6089,6 +6271,10 @@ void CPUMatrix::InnerProductWithShiftNeg(const CPUMatrix& a, c(i, 0) = cblas_sdot(n, reinterpret_cast(aBufPtr + i), m, reinterpret_cast(bBufPtr + i), m); } } + else + { + RuntimeError("Unsupported data format"); + } } } @@ -6271,9 +6457,9 @@ void CPUMatrix::RCRFBackwardCompute(const CPUMatrix& alpha, // Calculate alpha in forward-backward calculation. equation (6), (7) in ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf // GPU x dimension corresponds to utterances, y dimension corresponds to phone sequence in each utterance // prob (input): the posterior output from the network -// alpha (output): alpha for forward-backward calculation. -// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance -// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance +// alpha (output): alpha for forward-backward calculation. +// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance +// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance // uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. // uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch // uttBeginFrame(input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance. @@ -6878,388 +7064,20 @@ void CPUMatrix::SetCompatibleMode() // #endif } -// ======================================================================= -// TensorView support -// ======================================================================= - -// To save time, this makes extensive use of templates and macros. 
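The recurring change in the hunks above, replacing `sizeof(ElemType) == sizeof(double)` tests with `std::is_same` chains plus an "Unsupported data format" fallback, exists because half is now a third possible ElemType: a two-way size test would silently route half through the float BLAS branch. A hedged sketch of the pattern with a hypothetical wrapper (assumes a CBLAS header and library are available to link against):

```cpp
#include <cblas.h> // assumed: an OpenBLAS/MKL-style CBLAS header
#include <cstdio>
#include <stdexcept>
#include <type_traits>

// Hypothetical stand-in for the CPUMatrix pattern: dispatch on the exact
// element type, and fail loudly for types with no BLAS kernel (e.g. half)
// instead of letting them fall through to the float branch.
template <class ElemType>
ElemType my_asum(const ElemType* data, int n)
{
    if (std::is_same<ElemType, double>::value)
        return (ElemType)cblas_dasum(n, reinterpret_cast<const double*>(data), 1);
    else if (std::is_same<ElemType, float>::value)
        return (ElemType)cblas_sasum(n, reinterpret_cast<const float*>(data), 1);
    else
        throw std::runtime_error("Unsupported data format"); // mirrors RuntimeError(...)
}

int main()
{
    float v[4] = {1.0f, -2.0f, 3.0f, -4.0f};
    std::printf("%f\n", (double)my_asum(v, 4)); // 10.000000
    return 0;
}
```

With C++11 `std::is_same` chains, every branch still has to compile for every instantiation; the reinterpret_casts keep the dead branches well-formed, which is why the code casts rather than overloads.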
- -// ----------------------------------------------------------------------- -// function to compute the value for a given output location (perform reduction if needed) -// ----------------------------------------------------------------------- - -// perform loop over reduction index m -// This function is declared inside a wrapper struct to allow partial specialization (m = -1). -template -struct TensorOpReduction -{ - // reduction case (non-reduction case is specialized) - static inline ElemType Loop(array pointers, const OPFN& opfn, const ReductionOp& reductionOp, - const SmallVector& reducingOpDims, const array, N>& reducingStrides) - { - array strides; // N-1 because last one is the result pointer, which is unused in reduction - for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled - strides[i] = reducingStrides[i][(size_t) m]; - - double aggregate = TensorOpReduction::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides); - for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;) - { - // advance the pointers - for (size_t i = 0; i < N - 1; i++) - pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here - - // need to descend into one loop deeper - aggregate = reductionOp(aggregate, TensorOpReduction::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides)); - } - // Actually it would be nicer to return double but we keep ElementType so that test don't return different numbers than previous implementation. - return static_cast(aggregate); - } -}; - -// perform loop over reduction index m -// This is the specialized version for m = -1, which terminates the recursion. -template -struct TensorOpReduction -{ - static inline ElemType Loop(array pointers, const OPFN& opfn, const ReductionOp& reductionOp, - const SmallVector&, const array, N>&) - { - return opfn(pointers); // finally we are doing some work!!! - } -}; - -// perform loop over reduction index m, while keeping track of the number of elements and their corresponding indices. -// This function is declared inside a wrapper struct to allow partial specialization (m = -1). 
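Both the TensorOpReduction above and the TensorArgOpReduction that follows rely on the same idiom: the reduction loop nest is unrolled at compile time by recursing on the dimension index m, with an m = -1 specialization terminating the recursion at the element operation. A self-contained miniature of the idiom (illustrative names; with a single template parameter the m = -1 case is a full specialization, while the originals keep extra parameters, which is what makes theirs partial):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

// Miniature of the TensorOpReduction idiom: recurse over dimension index m
// at compile time; the m = -1 specialization ends the recursion.
template <int m>
struct SumOverDims
{
    template <std::size_t R>
    static double Loop(const double* p, const std::array<std::size_t, R>& dims, const std::array<std::ptrdiff_t, R>& strides)
    {
        double agg = 0;
        for (std::size_t i = 0; i < dims[m]; i++) // loop over dimension m ...
            agg += SumOverDims<m - 1>::Loop(p + i * strides[m], dims, strides); // ... and descend
        return agg;
    }
};

template <>
struct SumOverDims<-1>
{
    template <std::size_t R>
    static double Loop(const double* p, const std::array<std::size_t, R>&, const std::array<std::ptrdiff_t, R>&)
    {
        return *p; // innermost: finally do some work
    }
};

int main()
{
    double data[2][3] = {{1, 2, 3}, {4, 5, 6}};        // column-major view: 3 x 2
    std::array<std::size_t, 2> dims = {3, 2};          // dims[0] = inner, dims[1] = outer
    std::array<std::ptrdiff_t, 2> strides = {1, 3};
    std::printf("sum = %g\n", SumOverDims<1>::Loop(&data[0][0], dims, strides)); // 21
    return 0;
}
```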
-template -struct TensorArgOpReduction -{ - static inline std::pair ReduceAll(array pointers, const SmallVector& reducingOpDims, const array, N>& reducingStrides, - ElementWiseOperator reductionOp) - { - size_t counter = 0; - size_t index = 0; - ElemType val = (ElemType)0; - - switch (reducingOpDims.size()) - { - case 3: - val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); - break; - case 2: - val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); - break; - case 1: - val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); - break; - case 0: - val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); - break; - default: - LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)reducingOpDims.size()); - } - - return make_pair(val, index); - } - - // reduction case (non-reduction case is specialized) - static inline ElemType Loop(array pointers, const SmallVector& reducingOpDims, const array, N>& reducingStrides, - ElementWiseOperator reductionOp, size_t& counter, size_t& index) - { - array strides; // N-1 because last one is the result pointer, which is unused in reduction - for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled - strides[i] = reducingStrides[i][(size_t)m]; - - ElemType aggregate = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); - for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;) - { - // advance the pointers - for (size_t i = 0; i < N - 1; i++) - pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here - - ElemType val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); - - bool update = false; - switch (reductionOp) - { - case ElementWiseOperator::opArgmin: - update = (aggregate > val); - break; - case ElementWiseOperator::opArgmax: - update = (aggregate < val); - break; - } - - if (update) - { - aggregate = val; - index = counter - 1; - } - } - - return aggregate; - } -}; - -// perform loop over reduction index m -// This is the specialized version for m = -1, which terminates the recursion. -template -struct TensorArgOpReduction -{ - static inline ElemType Loop(array pointers, - const SmallVector&, const array, N>&, ElementWiseOperator reductionOp, size_t& counter, size_t& index) - { - counter++; - return *pointers[0]; // finally we are doing some work!!! 
- } -}; - -// ----------------------------------------------------------------------- -// perform loop over regular index k for N-nary operations (N counting the output) -// ----------------------------------------------------------------------- - -// perform loop over regular index k and reducing index m for N operands (counting the output) -template -struct TensorOpIteration -{ - static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp, - const SmallVector& regularOpDims, const array, N>& regularStrides, - const SmallVector& reducingOpDims, const array, N>& reducingStrides) - { - // non-scalar case: still nested result loops left - array strides; - for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled - strides[i] = regularStrides[i][(size_t) k]; - for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;) - { - // need to descend into one loop deeper - TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - // advance the pointers - for (size_t i = 0; i < N; i++) - pointers[i] += strides[i]; - } - } -}; - -// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE. -// This is a very common case, e.g. adding vectors or computing the Sigmoid. -template -struct TensorOpIteration -{ - static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp, - const SmallVector& regularOpDims, const array, 3>& regularStrides, - const SmallVector& reducingOpDims, const array, 3>& reducingStrides) - { - ElemType* pa = pointers[0]; - ElemType* pb = pointers[1]; - ElemType* pc = pointers[2]; - size_t K = regularOpDims[0]; - // special-case beta and alpha to allow the compiler to short-circuit it - if (beta != 0) -#pragma omp parallel for - for (int k = 0; k < (int) K; k++) - TensorOpIteration::Loop(beta, array{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - else if (alpha != 1) -#pragma omp parallel for - for (int k = 0; k < (int) K; k++) - TensorOpIteration::Loop(0, array{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - else -#pragma omp parallel for - for (int k = 0; k < (int) K; k++) - TensorOpIteration::Loop(0, array{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - // TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default). - // TODO: The signedness of k (required for omp) causes an extra sign-extend. - // TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it? 
- } -}; -// and unary -template -struct TensorOpIteration -{ - static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp, - const SmallVector& regularOpDims, const array, 2>& regularStrides, - const SmallVector& reducingOpDims, const array, 2>& reducingStrides) - { - ElemType* pa = pointers[0]; - ElemType* pb = pointers[1]; - size_t K = regularOpDims[0]; - // special-case beta and alpha to allow the compiler to short-circuit it - if (beta != 0) -#pragma omp parallel for - for (int k = 0; k < (int) K; k++) - TensorOpIteration::Loop(beta, array{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - else if (alpha != 1) -#pragma omp parallel for - for (int k = 0; k < (int) K; k++) - TensorOpIteration::Loop(0, array{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - else -#pragma omp parallel for - for (int k = 0; k < (int) K; k++) - TensorOpIteration::Loop(0, array{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - } -}; - -template -struct TensorOpIteration -{ - static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp, - const SmallVector&, const array, N>&, - const SmallVector& reducingOpDims, const array, N>& reducingStrides) - { - // we are at element level for the result: perform the op (there may still be reduction) - ElemType val = TensorOpReduction::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides); - // scale - val *= alpha; - // combine with previous value in target matrix, then write it out - auto* pout = pointers.back(); - if (beta != 0) - val += beta * *pout; - // save - *pout = val; - return; - } -}; - -// perform loop over regular index k and reducing index m for N operands (counting the output), the difference -// between TensorOpIteration and TensorArgOpIteration, is that the latter store the index of the result, instead of -// the result. The reason that they aren't combined is because of performance. 
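The scalar TensorOpIteration above guards the output blend with `if (beta != 0)` for the same reason earlier hunks write `keepWeight ? keepWeight * pb[nb] : (ElemType)0`: the target may hold uninitialized memory, and multiplying a NaN by zero yields NaN rather than zero. A two-line demonstration:

```cpp
#include <cstdio>
#include <limits>

int main()
{
    double out = std::numeric_limits<double>::quiet_NaN(); // e.g. uninitialized target memory
    double beta = 0.0, val = 42.0;

    double wrong = val + beta * out;                // 0 * NaN == NaN, poisons the result
    double right = val + (beta ? beta * out : 0.0); // guarded multiply, as in the code above
    std::printf("wrong=%f right=%f\n", wrong, right); // wrong=nan right=42.000000
    return 0;
}
```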
-template -struct TensorArgOpIteration +template +void CPUMatrix::SetOptimizationFlags(int flags) { - static inline void Loop(array pointers, - const SmallVector& regularOpDims, const array, N>& regularStrides, - const SmallVector& reducingOpDims, const array, N>& reducingStrides, ElementWiseOperator reductionOp) - { - // non-scalar case: still nested result loops left - array strides; - for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled - strides[i] = regularStrides[i][(size_t)k]; - for (size_t dim = regularOpDims[(size_t)k]; dim-- > 0;) - { - // need to descend into one loop deeper - TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); - // advance the pointers - for (size_t i = 0; i < N; i++) - pointers[i] += strides[i]; - } - } -}; + m_optimizationFlags = flags; +} -template -struct TensorArgOpIteration +template +int CPUMatrix::GetOptimizationFlags() { - static inline void Loop(array pointers, - const SmallVector&, const array, N>&, - const SmallVector& reducingOpDims, const array, N>& reducingStrides, ElementWiseOperator reductionOp) - { - // we are at element level for the result: perform the op (there may still be reduction) - auto val = TensorArgOpReduction::ReduceAll(pointers, reducingOpDims, reducingStrides, reductionOp); - - auto* pout = pointers.back(); - *pout = (ElemType)val.second; - return; - } -}; - -// ----------------------------------------------------------------------- -// map runtime parameters N to template parameters -// ----------------------------------------------------------------------- - -// tensor operation with k+1 dimensions (-1 means scalar) -template -static void TensorOpWithRegularLoop(ElemType beta, const array& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp, - const SmallVector& regularOpDims, const array, N>& regularStrides, - const SmallVector& reducingOpDims, const array, N>& reducingStrides) -{ - size_t dims = reducingOpDims.size(); - switch (dims) - { - case 2: - return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 1: - return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - case 0: - { - // if all leading dimensions are 1, we can let the compiler do some unrolling - bool leadingAllOne = true; - for (size_t i = 0; i < N; i++) - leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; - if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions - return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - else - return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); - } - default: - LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims); - } -} - -// tensor operation, generalized in number of arguments, operation already provided as a lambda -// This function now expands into different k. 
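TensorOpWithRegularLoop above, and the TensorOpWithFnAndReduction that follows, bridge runtime and compile time: the number of non-flattened dimensions is only known at runtime, so a switch enumerates the small set of supported ranks and forwards each to an instantiation with that rank baked in as a template constant the compiler can unroll. A minimal sketch of the dispatch shape (hypothetical names):

```cpp
#include <cstdio>
#include <stdexcept>

// Compile-time worker: k is a constant here, so loops bounded by k can be unrolled.
template <int k>
static void ProcessRank()
{
    std::printf("instantiated for rank %d\n", k);
}

// Runtime entry point: map the runtime value onto template instantiations.
// Every case must be spelled out, and each one is a separate instantiation.
static void Process(int rank)
{
    switch (rank)
    {
    case 2: return ProcessRank<2>();
    case 1: return ProcessRank<1>();
    case 0: return ProcessRank<0>();
    default: throw std::invalid_argument("unsupported rank");
    }
}

int main()
{
    Process(2);
    return 0;
}
```

Each case costs one more instantiation of the whole loop nest, which is why the source warns to "consider code size impact when adding more cases".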
-template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
-static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
-                                       const array<size_t, N>& offsets,
-                                       const SmallVector& regularOpDims, const array<SmallVector, N>& regularStrides,
-                                       const SmallVector& reducingOpDims, const array<SmallVector, N>& reducingStrides)
-{
-    for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
-        pointers[i] += offsets[i];
-    size_t dims = regularOpDims.size();
-    switch (dims)
-    {
-    // N.B. consider code size impact when adding more cases.
-    case 5:
-        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 4>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-    case 4:
-        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-    case 3:
-        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-    case 2:
-        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-    case 1:
-        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-    case 0:
-        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
-    default:
-        LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
-    }
-}
-
-// tensor operation, generalized in number of arguments, operation already provided as a lambda
-// This function now expands into different reductionOps
-template <class ElemType, typename OPFN, size_t N>
-static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
-                           const array<size_t, N>& offsets,
-                           const SmallVector& regularOpDims, const array<SmallVector, N>& regularStrides,
-                           const SmallVector& reducingOpDims, const array<SmallVector, N>& reducingStrides)
-{
-// BUGBUG: Using always 'double' as type of aggregator even for ElemType==float. Reason: otherwise some e2e tests would fail as historically we
-// used double for the aggregator of sum. But:
-// * for min and max reductions this is meaningless.
-// * It is not consistent with what we do on GPU, where we aggregate on ElemType.
-// * It costs performance.
-// TODO: adapt e2e tests to run with an aggregator of type ElemType.
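The BUGBUG comment above describes real numerical drift rather than style: a float accumulator stops advancing once the running total dwarfs each addend, so sums are aggregated in double even when ElemType is float. A small demonstration of the effect:

```cpp
#include <cstdio>

int main()
{
    const int n = 30000000; // 3e7 additions of 1.0
    float  fsum = 0.0f;
    double dsum = 0.0;
    for (int i = 0; i < n; i++)
    {
        fsum += 1.0f; // stalls at 2^24 = 16777216: 16777216f + 1f rounds back to 16777216f
        dsum += 1.0;
    }
    std::printf("float sum  = %.1f\n", fsum); // 16777216.0
    std::printf("double sum = %.1f\n", dsum); // 30000000.0
    return 0;
}
```

As the comment notes, double aggregation is meaningless for min/max and costs performance; it survives so that e2e baselines keep matching.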
-#define CaseTensorOpWithFnAndReduction(oper)                                                  \
-    case ElementWiseOperator::op##oper:                                                       \
-        return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
-                                          {                                                   \
-                                              return Op##oper(a, b);                          \
-                                          },                                                  \
-                                          offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
-
-    switch (reductionOp)
-    {
-        CaseTensorOpWithFnAndReduction(Sum);
-        CaseTensorOpWithFnAndReduction(LogSum);
-        CaseTensorOpWithFnAndReduction(Min);
-        CaseTensorOpWithFnAndReduction(Max);
-        CaseTensorOpWithFnAndReduction(ElementwiseProduct);
-    default:
-        LogicError("Specified ElementWiseOperator op %d not supported as reduction operation.", (int)reductionOp);
-    }
-}
-
 // -----------------------------------------------------------------------
-// entry points from Matrix.cpp; also map op to a lambda
+// entry points from Matrix.cpp; calls into CPUMatrixTensorOpImpl
 // -----------------------------------------------------------------------
 
 // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
@@ -7270,29 +7088,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
                                    const SmallVector& regularOpDims, const array<SmallVector, 2>& regularStrides,
                                    const SmallVector& reducingOpDims, const array<SmallVector, 2>& reducingStrides)
 {
-    if (reductionOp != ElementWiseOperator::opSum &&
-        reductionOp != ElementWiseOperator::opLogSum &&
-        reductionOp != ElementWiseOperator::opMin &&
-        reductionOp != ElementWiseOperator::opMax &&
-        reductionOp != ElementWiseOperator::opElementwiseProduct)
-        InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
-
-// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
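The Case* macros together with lists like ForAllUnaryOps form an X-macro table: one macro generates a switch case per operator, binding the named Op function into a lambda over the operand pointers. Roughly what one expansion looks like, reduced to two illustrative ops (the enum and Op* names here are stand-ins, not the CNTK ones):

```cpp
#include <cmath>
#include <cstdio>
#include <stdexcept>

enum class Op { Sqrt, Abs };

static double OpSqrt(double v) { return std::sqrt(v); }
static double OpAbs(double v)  { return std::fabs(v); }

// The op table: one X-macro invocation per supported operator.
#define ForAllUnaryOps(Macro) \
    Macro(Sqrt);              \
    Macro(Abs)

template <typename FN>
static void Apply(double* p, size_t n, const FN& fn)
{
    for (size_t i = 0; i < n; i++)
        p[i] = fn(p[i]);
}

static void UnaryOp(double* p, size_t n, Op op)
{
// Each case binds the named function into a lambda and dispatches, mirroring
// the CaseUnaryTensorOp / ForAllUnaryOps pattern in this file.
#define CaseUnaryOp(oper)                                        \
    case Op::oper:                                               \
        return Apply(p, n, [](double v) { return Op##oper(v); })

    switch (op)
    {
        ForAllUnaryOps(CaseUnaryOp);
    default:
        throw std::invalid_argument("unknown op");
    }
}

int main()
{
    double v[3] = {4.0, -9.0, 16.0};
    UnaryOp(v, 3, Op::Sqrt);
    std::printf("%g %g %g\n", v[0], v[1], v[2]); // 2 nan 4 (sqrt of -9 is NaN)
    return 0;
}
```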
-#define CaseUnaryTensorOp(oper) \ - case ElementWiseOperator::op##oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ - { \ - return Op##oper((*(pp[0]))); \ - }, \ - reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) - - array pointers = {a.Data(), Data()}; - switch (op) - { - ForAllUnaryOps(CaseUnaryTensorOp); - default: - LogicError("TensorOp: Unknown unary op code %d.", (int) op); - } + CPUMatrixTensorOpImpl(beta, a, *this, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides @@ -7303,24 +7099,7 @@ void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const SmallVector& regularOpDims, const array, 3>& regularStrides, const SmallVector& reducingOpDims, const array, 3>& reducingStrides) { - if (reductionOp != ElementWiseOperator::opSum) - InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum."); - -#define CaseBinaryTensorOp(oper) \ - case ElementWiseOperator::op##oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ - { \ - return Op##oper((*(pp[0])), (*(pp[1]))); \ - }, \ - reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) - - array pointers = {a.Data(), b.Data(), Data()}; - switch (op) - { - ForAllBinaryOps(CaseBinaryTensorOp); - default: - LogicError("TensorOp: Unknown op binary code %d.", (int) op); - } + CPUMatrixTensorOpImpl(beta, a, b, *this, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides @@ -7331,24 +7110,7 @@ void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const SmallVector& regularOpDims, const array, 4>& regularStrides, const SmallVector& reducingOpDims, const array, 4>& reducingStrides) { - if (reductionOp != ElementWiseOperator::opSum) - InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum."); - -#define CaseTernaryTensorOp(oper) \ - case ElementWiseOperator::op##oper: \ - return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ - { \ - return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \ - }, \ - reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) - - array pointers = {a.Data(), b.Data(), c.Data(), Data()}; - switch (op) - { - ForAllTernaryOps(CaseTernaryTensorOp); - default: - LogicError("TensorOp: Unknown ternary op code %d.", (int) op); - } + CPUMatrixTensorOpImpl(beta, a, b, c, *this, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } template @@ -7357,7 +7119,7 @@ int CPUMatrix::Argmin() const int minArg = -1; ElemType minValue = std::numeric_limits::max(); -#pragma omp parallel +#pragma omp parallel { int localMinArg = -1; ElemType localMinValue = std::numeric_limits::max(); @@ -7400,7 +7162,7 @@ int CPUMatrix::Argmax() const int maxArg = -1; ElemType maxValue = std::numeric_limits::lowest(); -#pragma omp parallel +#pragma omp parallel { int localMaxArg = -1; ElemType localMaxValue = std::numeric_limits::lowest(); @@ -7460,36 +7222,7 @@ void CPUMatrix::TensorArgOp(const CPUMatrix& a, ElementWiseO const SmallVector& regularOpDims, const array, 2>& regularStrides, const SmallVector& reducingOpDims, const array, 
2>& reducingStrides) { - if (reductionOp != ElementWiseOperator::opArgmin && - reductionOp != ElementWiseOperator::opArgmax) - InvalidArgument("TensorOp: Arg reduction operations other than opArgmax, and opArgmin are not implemented."); - - if (GetNumElements() == 1) - { - Data()[0] = (ElemType) a.ArgOp(reductionOp); - } - else - { - const size_t N = 2; - array pointers = { a.Data(), Data() }; - for (size_t i = 0; i < N; i++) - pointers[i] += offsets[i]; - - switch (regularOpDims.size()) - { - case 2: - TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); - break; - case 1: - TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); - break; - case 0: - TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); - break; - default: - LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)regularOpDims.size()); - } - } + CPUMatrixTensorArgOpImpl(a, *this, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } template @@ -7511,7 +7244,7 @@ void CPUMatrix::ScatterValues(ElemType* indices, ElemType* value, Elem //ignore the elements that is not partitioned into this thread if (col % nthread != ithread) continue; - + if (col >= cols) InvalidArgument("ScatterValues: Indices map out of bounds. %ld >= %ld", (long int)col, (long int)cols); @@ -7569,4 +7302,3 @@ template void CPUMatrix::Reshape(const size_t, const size_t); template CPUMatrix::CPUMatrix(const size_t, const size_t, int*, const size_t); }}} - diff --git a/Source/Math/CPUMatrixTensorDouble.cpp b/Source/Math/CPUMatrixTensorDouble.cpp new file mode 100644 index 00000000000..2a087600534 --- /dev/null +++ b/Source/Math/CPUMatrixTensorDouble.cpp @@ -0,0 +1,30 @@ +#include "stdafx.h" +#include "CPUMatrixTensorImpl.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +template +void CPUMatrixTensorOpImpl(double beta, const CPUMatrix& a, CPUMatrix& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(double beta, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 3>& regularStrides, + const SmallVector& reducingOpDims, const array, 3>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(double beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, CPUMatrix& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 4>& regularStrides, + const SmallVector& reducingOpDims, const array, 4>& reducingStrides); + +template +void CPUMatrixTensorArgOpImpl(const CPUMatrix& a, CPUMatrix& o, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +}}} \ No newline at end of file diff --git a/Source/Math/CPUMatrixTensorFloat.cpp b/Source/Math/CPUMatrixTensorFloat.cpp new file mode 100644 index 00000000000..6530535fe2c --- /dev/null +++ b/Source/Math/CPUMatrixTensorFloat.cpp @@ -0,0 +1,30 @@ +#include 
"stdafx.h" +#include "CPUMatrixTensorImpl.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +template +void CPUMatrixTensorOpImpl(float beta, const CPUMatrix& a, CPUMatrix& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(float beta, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 3>& regularStrides, + const SmallVector& reducingOpDims, const array, 3>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(float beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, CPUMatrix& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 4>& regularStrides, + const SmallVector& reducingOpDims, const array, 4>& reducingStrides); + +template +void CPUMatrixTensorArgOpImpl(const CPUMatrix& a, CPUMatrix& o, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +}}} \ No newline at end of file diff --git a/Source/Math/CPUMatrixTensorHalf.cpp b/Source/Math/CPUMatrixTensorHalf.cpp new file mode 100644 index 00000000000..5261da618de --- /dev/null +++ b/Source/Math/CPUMatrixTensorHalf.cpp @@ -0,0 +1,30 @@ +#include "stdafx.h" +#include "CPUMatrixTensorImpl.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +template +void CPUMatrixTensorOpImpl(half beta, const CPUMatrix& a, CPUMatrix& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(half beta, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 3>& regularStrides, + const SmallVector& reducingOpDims, const array, 3>& reducingStrides); + +template +void CPUMatrixTensorOpImpl(half beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, CPUMatrix& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 4>& regularStrides, + const SmallVector& reducingOpDims, const array, 4>& reducingStrides); + +template +void CPUMatrixTensorArgOpImpl(const CPUMatrix& a, CPUMatrix& o, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides); + +}}} \ No newline at end of file diff --git a/Source/Math/CPUMatrixTensorImpl.h b/Source/Math/CPUMatrixTensorImpl.h new file mode 100644 index 00000000000..74059911787 --- /dev/null +++ b/Source/Math/CPUMatrixTensorImpl.h @@ -0,0 +1,556 @@ +// Move some files out of CPUMatrixImpl.h to prevent compiler crash on out-of-heap + +#include "CPUMatrix.h" +#include "TensorOps.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +// 
======================================================================= +// TensorView support +// ======================================================================= + +// To save time, this makes extensive use of templates and macros. + +// ----------------------------------------------------------------------- +// function to compute the value for a given output location (perform reduction if needed) +// ----------------------------------------------------------------------- + +// perform loop over reduction index m +// This function is declared inside a wrapper struct to allow partial specialization (m = -1). +template +struct TensorOpReduction +{ + // reduction case (non-reduction case is specialized) + static inline ElemType Loop(array pointers, const OPFN& opfn, const ReductionOp& reductionOp, + const SmallVector& reducingOpDims, const array, N>& reducingStrides) + { + array strides; // N-1 because last one is the result pointer, which is unused in reduction + for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled + strides[i] = reducingStrides[i][(size_t) m]; + + double aggregate = TensorOpReduction::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides); + for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;) + { + // advance the pointers + for (size_t i = 0; i < N - 1; i++) + pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here + + // need to descend into one loop deeper + aggregate = reductionOp(aggregate, TensorOpReduction::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides)); + } + // Actually it would be nicer to return double but we keep ElementType so that test don't return different numbers than previous implementation. + return static_cast(aggregate); + } +}; + +// perform loop over reduction index m +// This is the specialized version for m = -1, which terminates the recursion. +template +struct TensorOpReduction +{ + static inline ElemType Loop(array pointers, const OPFN& opfn, const ReductionOp& /*reductionOp*/, + const SmallVector&, const array, N>&) + { + return opfn(pointers); // finally we are doing some work!!! + } +}; + +// perform loop over reduction index m, while keeping track of the number of elements and their corresponding indices. +// This function is declared inside a wrapper struct to allow partial specialization (m = -1). 
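The TensorArgOpReduction that follows differs from the value reduction above only in bookkeeping: it threads an element counter and the index of the running best through the recursion, and opArgmin/opArgmax differ solely in the comparison used for the update. The same logic, flattened to a linear scan:

```cpp
#include <cstdio>
#include <utility>

// Same bookkeeping as TensorArgOpReduction, flattened: keep a running best
// value and the index at which it was seen; only the update test differs
// between opArgmax (best < val) and opArgmin (best > val).
static std::pair<double, size_t> ArgMax(const double* p, size_t n)
{
    double best = p[0];
    size_t index = 0;
    for (size_t counter = 1; counter < n; counter++)
    {
        if (best < p[counter]) // opArgmin would test best > p[counter]
        {
            best = p[counter];
            index = counter;
        }
    }
    return {best, index};
}

int main()
{
    double v[5] = {3, 1, 4, 1, 5};
    std::pair<double, size_t> r = ArgMax(v, 5);
    std::printf("max=%g at %zu\n", r.first, r.second); // max=5 at 4
    return 0;
}
```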
+template +struct TensorArgOpReduction +{ + static inline std::pair ReduceAll(array pointers, const SmallVector& reducingOpDims, const array, N>& reducingStrides, + ElementWiseOperator reductionOp) + { + size_t counter = 0; + size_t index = 0; + ElemType val = (ElemType)0; + + switch (reducingOpDims.size()) + { + case 3: + val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); + break; + case 2: + val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); + break; + case 1: + val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); + break; + case 0: + val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); + break; + default: + LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)reducingOpDims.size()); + } + + return make_pair(val, index); + } + + // reduction case (non-reduction case is specialized) + static inline ElemType Loop(array pointers, const SmallVector& reducingOpDims, const array, N>& reducingStrides, + ElementWiseOperator reductionOp, size_t& counter, size_t& index) + { + array strides; // N-1 because last one is the result pointer, which is unused in reduction + for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled + strides[i] = reducingStrides[i][(size_t)m]; + + ElemType aggregate = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); + for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;) + { + // advance the pointers + for (size_t i = 0; i < N - 1; i++) + pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here + + ElemType val = TensorArgOpReduction::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index); + + bool update = false; + switch (reductionOp) + { + case ElementWiseOperator::opArgmin: + update = (aggregate > val); + break; + case ElementWiseOperator::opArgmax: + update = (aggregate < val); + break; + } + + if (update) + { + aggregate = val; + index = counter - 1; + } + } + + return aggregate; + } +}; + +// perform loop over reduction index m +// This is the specialized version for m = -1, which terminates the recursion. +template +struct TensorArgOpReduction +{ + static inline ElemType Loop(array pointers, + const SmallVector&, const array, N>&, ElementWiseOperator /*reductionOp*/, size_t& counter, size_t& /*index*/) + { + counter++; + return *pointers[0]; // finally we are doing some work!!! 
+ } +}; + +// ----------------------------------------------------------------------- +// perform loop over regular index k for N-nary operations (N counting the output) +// ----------------------------------------------------------------------- + +// perform loop over regular index k and reducing index m for N operands (counting the output) +template +struct TensorOpIteration +{ + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides) + { + // non-scalar case: still nested result loops left + array strides; + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + strides[i] = regularStrides[i][(size_t) k]; + for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;) + { + // need to descend into one loop deeper + TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += strides[i]; + } + } +}; + +// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE. +// This is a very common case, e.g. adding vectors or computing the Sigmoid. +template +struct TensorOpIteration +{ + static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp, + const SmallVector& regularOpDims, const array, 3>& regularStrides, + const SmallVector& reducingOpDims, const array, 3>& reducingStrides) + { + ElemType* pa = pointers[0]; + ElemType* pb = pointers[1]; + ElemType* pc = pointers[2]; + size_t K = regularOpDims[0]; + // special-case beta and alpha to allow the compiler to short-circuit it + if (beta != 0) +#pragma omp parallel for + for (int k = 0; k < (int) K; k++) + TensorOpIteration::Loop(beta, array{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else if (alpha != 1) +#pragma omp parallel for + for (int k = 0; k < (int) K; k++) + TensorOpIteration::Loop(0, array{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else +#pragma omp parallel for + for (int k = 0; k < (int) K; k++) + TensorOpIteration::Loop(0, array{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + // TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default). + // TODO: The signedness of k (required for omp) causes an extra sign-extend. + // TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it? 
+    }
+};
+// and unary
+template <class ElemType, typename OPFN, typename ReductionOp>
+struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
+{
+    static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
+                            const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
+                            const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
+    {
+        ElemType* pa = pointers[0];
+        ElemType* pb = pointers[1];
+        size_t K = regularOpDims[0];
+        // special-case beta and alpha to allow the compiler to short-circuit it
+        if (beta != 0)
+#pragma omp parallel for
+            for (int k = 0; k < (int) K; k++)
+                TensorOpIteration<ElemType, OPFN, ReductionOp, 2, false, -1, -1>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+        else if (alpha != 1)
+#pragma omp parallel for
+            for (int k = 0; k < (int) K; k++)
+                TensorOpIteration<ElemType, OPFN, ReductionOp, 2, false, -1, -1>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+        else
+#pragma omp parallel for
+            for (int k = 0; k < (int) K; k++)
+                TensorOpIteration<ElemType, OPFN, ReductionOp, 2, false, -1, -1>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    }
+};
+
+template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
+struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
+{
+    static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
+                            const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
+                            const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
+    {
+        // we are at element level for the result: perform the op (there may still be reduction)
+        ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
+        // scale
+        val *= alpha;
+        // combine with previous value in target matrix, then write it out
+        auto* pout = pointers.back();
+        if (beta != 0)
+            val += beta * *pout;
+        // save
+        *pout = val;
+        return;
+    }
+};
+
+// perform loop over regular index k and reducing index m for N operands (counting the output). The difference
+// between TensorOpIteration and TensorArgOpIteration is that the latter stores the index of the result instead of
+// the result itself. They are kept separate for performance reasons.
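+// For example, an opArgmax reduction over an axis of length 3 holding {2, 7, 5} walks
+// the elements in order, tracking the best value seen so far together with the counter
+// of the element that produced it, and finally writes the winning index (1 here), cast
+// to ElemType, into the output instead of the value 7 itself.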
+template +struct TensorArgOpIteration +{ + static inline void Loop(array pointers, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides, ElementWiseOperator reductionOp) + { + // non-scalar case: still nested result loops left + array strides; + for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + strides[i] = regularStrides[i][(size_t)k]; + for (size_t dim = regularOpDims[(size_t)k]; dim-- > 0;) + { + // need to descend into one loop deeper + TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); + // advance the pointers + for (size_t i = 0; i < N; i++) + pointers[i] += strides[i]; + } + } +}; + +template +struct TensorArgOpIteration +{ + static inline void Loop(array pointers, + const SmallVector&, const array, N>&, + const SmallVector& reducingOpDims, const array, N>& reducingStrides, ElementWiseOperator reductionOp) + { + // we are at element level for the result: perform the op (there may still be reduction) + auto val = TensorArgOpReduction::ReduceAll(pointers, reducingOpDims, reducingStrides, reductionOp); + + auto* pout = pointers.back(); + *pout = (ElemType)val.second; + return; + } +}; + +// ----------------------------------------------------------------------- +// map runtime parameters N to template parameters +// ----------------------------------------------------------------------- + +// tensor operation with k+1 dimensions (-1 means scalar) +template +static void TensorOpWithRegularLoop(ElemType beta, const array& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides) +{ + size_t dims = reducingOpDims.size(); + switch (dims) + { + case 2: + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: + { + // if all leading dimensions are 1, we can let the compiler do some unrolling + bool leadingAllOne = true; + for (size_t i = 0; i < N; i++) + leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; + if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + else + return TensorOpIteration::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + } + default: + LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims); + } +} + +// tensor operation, generalized in number of arguments, operation already provided as a lambda +// This function now expands into different k. 
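+// E.g. a rank-2 output (regularOpDims.size() == 2) dispatches to
+// TensorOpWithRegularLoop<..., 1> below: k is the zero-based index of the outermost
+// regular dimension and -1 denotes a scalar output, so the runtime rank becomes a
+// compile-time constant and the nested per-dimension loops can be unrolled.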
+template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
+static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
+                                       const array<size_t, N>& offsets,
+                                       const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
+                                       const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
+{
+    for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
+        pointers[i] += offsets[i];
+    size_t dims = regularOpDims.size();
+    switch (dims)
+    {
+    // N.B. consider code size impact when adding more cases.
+    case 5:
+        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 4>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    case 4:
+        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    case 3:
+        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    case 2:
+        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    case 1:
+        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    case 0:
+        return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
+    default:
+        LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
+    }
+}
+
+// tensor operation, generalized in number of arguments, operation already provided as a lambda
+// This function now expands into different reductionOps
+template <class ElemType, typename OPFN, size_t N>
+static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
+                           const array<size_t, N>& offsets,
+                           const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
+                           const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
+{
+// BUGBUG: We always use 'double' as the aggregator type, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
+// used double as the aggregator for sums. But:
+// * for min and max reductions this is meaningless.
+// * It is not consistent with what we do on the GPU, where we aggregate on ElemType.
+// * It costs performance.
+// TODO: adapt the e2e tests to run with an aggregator of type ElemType.
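+// (Illustration with made-up numbers: accumulating 10^8 float values of 0.01f in a
+// float aggregator stalls once the running sum's ULP exceeds the addend, landing far
+// below the true 10^6, while a double aggregator stays accurate; switching the
+// aggregator type therefore shifts results enough to invalidate recorded baselines.)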
+#define CaseTensorOpWithFnAndReduction(oper)                                                  \
+    case ElementWiseOperator::op##oper:                                                       \
+        return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
+                                          {                                                   \
+                                              return Op##oper(a, b);                          \
+                                          },                                                  \
+                                          offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
+    switch (reductionOp)
+    {
+        CaseTensorOpWithFnAndReduction(Sum);
+        CaseTensorOpWithFnAndReduction(LogSum);
+        CaseTensorOpWithFnAndReduction(Min);
+        CaseTensorOpWithFnAndReduction(Max);
+        CaseTensorOpWithFnAndReduction(ElementwiseProduct);
+    default:
+        LogicError("Specified ElementWiseOperator op %d not supported as a reduction operation.", (int)reductionOp);
+    }
+}
+
+// -----------------------------------------------------------------------
+// entry points from Matrix.cpp; also map op to a lambda
+// -----------------------------------------------------------------------
+
+// special tensor ops for inference speed
+template <class ElemType>
+bool CPUMatrixSpecialUnaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
+                                       const array<size_t, 2>& offsets,
+                                       const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
+                                       const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
+
+template <class ElemType>
+bool CPUMatrixSpecialBinaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
+                                        const array<size_t, 3>& offsets,
+                                        const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
+                                        const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
+
+template <class ElemType>
+bool CPUMatrixSpecialTernaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
+                                         const array<size_t, 4>& offsets,
+                                         const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
+                                         const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
+
+// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
+// This maps 'op' to a lambda.
+template <class ElemType>
+void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
+                           const array<size_t, 2>& offsets,
+                           const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
+                           const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
+{
+    if (reductionOp != ElementWiseOperator::opSum &&
+        reductionOp != ElementWiseOperator::opLogSum &&
+        reductionOp != ElementWiseOperator::opMin &&
+        reductionOp != ElementWiseOperator::opMax &&
+        reductionOp != ElementWiseOperator::opElementwiseProduct)
+        InvalidArgument("TensorOp: Unary reduction operations other than opSum, opLogSum, opMin, opMax, and opElementwiseProduct are not implemented.");
+
+#ifdef USE_MKL
+    if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
+        CPUMatrixSpecialUnaryTensorOpImpl(beta, a, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
+        return;
+#endif
+
+// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
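+// For reference, one expansion of the macro below: for op == opSqrt the switch in this
+// function effectively becomes
+//
+//     return TensorOpWithFn(beta, pointers, alpha,
+//                           [](const array<ElemType*, 2>& pp) { return OpSqrt(*(pp[0])); },
+//                           reductionOp, offsets, regularOpDims, regularStrides,
+//                           reducingOpDims, reducingStrides);
+//
+// (assuming Sqrt is among the ops enumerated by ForAllUnaryOps, as defined elsewhere in
+// this codebase).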
+#define CaseUnaryTensorOp(oper)                                                         \
+    case ElementWiseOperator::op##oper:                                                 \
+        return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2>& pp) \
+                              {                                                         \
+                                  return Op##oper((*(pp[0])));                          \
+                              },                                                        \
+                              reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
+    array<ElemType*, 2> pointers = {a.Data(), o.Data()};
+    switch (op)
+    {
+        ForAllUnaryOps(CaseUnaryTensorOp);
+    default:
+        LogicError("TensorOp: Unknown unary op code %d.", (int) op);
+    }
+}
+
+// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
+// This maps 'op' to a lambda.
+template <class ElemType>
+void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
+                           const array<size_t, 3>& offsets,
+                           const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
+                           const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
+{
+    if (reductionOp != ElementWiseOperator::opSum)
+        InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");
+
+#ifdef USE_MKL
+    if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
+        CPUMatrixSpecialBinaryTensorOpImpl(beta, a, b, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
+        return;
+#endif
+
+#define CaseBinaryTensorOp(oper)                                                        \
+    case ElementWiseOperator::op##oper:                                                 \
+        return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
+                              {                                                         \
+                                  return Op##oper((*(pp[0])), (*(pp[1])));              \
+                              },                                                        \
+                              reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
+
+    array<ElemType*, 3> pointers = {a.Data(), b.Data(), o.Data()};
+    switch (op)
+    {
+        ForAllBinaryOps(CaseBinaryTensorOp);
+    default:
+        LogicError("TensorOp: Unknown binary op code %d.", (int) op);
+    }
+}
+
+// perform ternary operation 'op' on a, b, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
+// This maps 'op' to a lambda.
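+// A representative ternary op is opCond (a ? b : c, as defined by the Op* helpers
+// elsewhere in this codebase): one element is read from each of the three inputs per
+// output element, using the same traversal machinery as the unary and binary cases,
+// just with N = 4 pointers.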
+template +void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, CPUMatrix& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 4>& regularStrides, + const SmallVector& reducingOpDims, const array, 4>& reducingStrides) +{ + if (reductionOp != ElementWiseOperator::opSum) + InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum."); + +#ifdef USE_MKL + if (!!(CPUMatrix::GetOptimizationFlags() & CPUMatrix::OPT_EVAL_WITH_MKL) && + CPUMatrixSpecialTernaryTensorOpImpl(beta, a, b, c, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)) + return; +#endif + +#define CaseTernaryTensorOp(oper) \ + case ElementWiseOperator::op##oper: \ + return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ + { \ + return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \ + }, \ + reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) + + array pointers = {a.Data(), b.Data(), c.Data(), o.Data()}; + switch (op) + { + ForAllTernaryOps(CaseTernaryTensorOp); + default: + LogicError("TensorOp: Unknown ternary op code %d.", (int) op); + } +} + +template +void CPUMatrixTensorArgOpImpl(const CPUMatrix& a, CPUMatrix& o, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides) +{ + if (reductionOp != ElementWiseOperator::opArgmin && + reductionOp != ElementWiseOperator::opArgmax) + InvalidArgument("TensorOp: Arg reduction operations other than opArgmax, and opArgmin are not implemented."); + + if (o.GetNumElements() == 1) + { + o.Data()[0] = (ElemType) a.ArgOp(reductionOp); + } + else + { + const size_t N = 2; + array pointers = { a.Data(), o.Data() }; + for (size_t i = 0; i < N; i++) + pointers[i] += offsets[i]; + + switch (regularOpDims.size()) + { + case 2: + TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); + break; + case 1: + TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); + break; + case 0: + TensorArgOpIteration::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp); + break; + default: + LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)regularOpDims.size()); + } + } +} + +}}} \ No newline at end of file diff --git a/Source/Math/CPUMatrixTensorSpecial.cpp b/Source/Math/CPUMatrixTensorSpecial.cpp new file mode 100644 index 00000000000..e1dcba149a8 --- /dev/null +++ b/Source/Math/CPUMatrixTensorSpecial.cpp @@ -0,0 +1,236 @@ +#include "stdafx.h" + +#ifdef USE_MKL + +#include "CPUMatrixTensorImpl.h" +#include "mkl_cblas.h" +#include "mkl_vml.h" + +namespace Microsoft { namespace MSR { namespace CNTK { + +template<> +bool CPUMatrixSpecialUnaryTensorOpImpl(float beta, const CPUMatrix& a, CPUMatrix& o, float alpha, ElementWiseOperator op, ElementWiseOperator /*reductionOp*/, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& /*reducingStrides*/) +{ + if (alpha == 1.0f && beta == 0.0f && // for inference + reducingOpDims.size() == 0 && // no reduction + regularStrides[0] == regularStrides[1]) // input/output have the same strides + { + // 
check if it is an elementwise operation with 1:1 input/output mapping and no gap
+        size_t count = 1;
+        for (int rank = 0; rank < regularOpDims.size(); ++ rank)
+        {
+            // 0 stride can only be in the last rank
+            if (regularStrides[0][rank] == 0 && rank != regularStrides[0].size() - 1)
+                return false;
+
+            // if not contiguous in memory, don't optimize
+            if ((ptrdiff_t)count != regularStrides[0][rank] || regularStrides[0][rank] == 0)
+                return false;
+
+            count *= regularOpDims[rank];
+        }
+
+        float* pA = a.Data() + offsets[0];
+        float* pO = o.Data() + offsets[1];
+
+        switch (op)
+        {
+        case ElementWiseOperator::opLinearRectifier:
+            if (pA != pO)
+            {
+                vsAbs((int)count, pA, pO);
+                cblas_saxpby((int)count, 0.5f, pA, 1, 0.5f, pO, 1); // o = (a + abs(a))/2
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+template<>
+bool CPUMatrixSpecialBinaryTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator /*reductionOp*/,
+                                        const array<size_t, 3>& offsets,
+                                        const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
+                                        const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& /*reducingStrides*/)
+{
+    if (alpha == 1.0f && beta == 0.0f && // for inference
+        reducingOpDims.size() == 0 &&    // no reduction
+        (regularStrides[0] == regularStrides[2] ||
+         regularStrides[1] == regularStrides[2])) // one of the inputs has same strides as output
+    {
+        // only support the simple broadcasting case
+        if (regularStrides[0].size() != regularStrides[1].size())
+        {
+            for (int rank = 0; rank < std::min(regularStrides[0].size(), regularStrides[1].size()); ++rank)
+            {
+                if (regularStrides[0][rank] != regularStrides[1][rank])
+                    return false;
+            }
+        }
+
+        // MKL-based optimization of scalar/vector, vector/vector, and matrix/vector operations
+
+        size_t elementCount[3] = { 1, 1, 1 }; // element count for a/b/o
+        for (int rank = 0; rank < regularOpDims.size(); ++ rank)
+        {
+            for (int iOp = 0; iOp < _countof(elementCount); ++ iOp)
+            {
+                // 0 stride can only be in the last rank
+                if (regularStrides[iOp][rank] == 0 && rank != regularStrides[iOp].size() - 1)
+                    return false;
+
+                if (rank >= regularStrides[iOp].size() || regularStrides[iOp][rank] == 0) continue;
+
+                // if not contiguous in memory, don't optimize
+                if (regularStrides[iOp][rank] != (ptrdiff_t)elementCount[iOp])
+                    return false;
+
+                elementCount[iOp] *= regularOpDims[rank];
+            }
+        }
+        size_t aN = elementCount[0];
+        size_t bN = elementCount[1];
+        size_t oN = elementCount[2];
+        float* pA = a.Data() + offsets[0];
+        float* pB = b.Data() + offsets[1];
+        float* pO = o.Data() + offsets[2];
+        int count = (int)oN;
+
+        // scalar/vector
+        if ((aN == oN && bN == 1) || (bN == oN && aN == 1))
+        {
+            float scalar = (aN == 1 ? pA[0] : pB[0]);
+            float* input = (aN == 1 ? 
pB : pA); + + if (input != pO) + memcpy(pO, input, count * sizeof(float)); + + switch (op) + { + case ElementWiseOperator::opElementwiseProduct: + cblas_sscal(count, scalar, pO, 1); + return true; + case ElementWiseOperator::opSum: + cblas_saxpby(count, 1.0f, &scalar, 0, 1.0f, pO, 1); + return true; + case ElementWiseOperator::opDifference: + if (input == pA) + cblas_saxpby(count, -1.0f, &scalar, 0, 1.0f, pO, 1); + else + cblas_saxpby(count, 1.0f, &scalar, 0, -1.0f, pO, 1); + return true; + } + } + // vector/vector (elementwise 1:1) + else if (aN == oN && bN == oN) + { + // elementwise operation with no broadcast/reduction + switch (op) + { + case ElementWiseOperator::opSum: + vsAdd(count, pA, pB, pO); + return true; + case ElementWiseOperator::opElementwiseProduct: + vsMul(count, pA, pB, pO); + return true; + case ElementWiseOperator::opDifference: + vsSub(count, pA, pB, pO); + return true; + } + } + // vector/matrix, i.e. plus/multiply parameter + else if (std::max(aN, bN) == oN) + { + float* pMat = (aN < bN ? pB : pA); + float* pVec = (aN < bN ? pA : pB); + int vecN = (int)std::min(aN, bN); + int numVec = (int)(oN / vecN); + switch (op) + { + case ElementWiseOperator::opSum: + for (int i = 0; i < numVec; ++i) + { + vsAdd(vecN, pMat + i * vecN, pVec, pO + i * vecN); + } + return true; + case ElementWiseOperator::opElementwiseProduct: + for (int i = 0; i < numVec; ++i) + { + vsMul(vecN, pMat + i * vecN, pVec, pO + i * vecN); + } + return true; + } + } + } + return false; +} + +template<> +bool CPUMatrixSpecialTernaryTensorOpImpl(float /*beta*/, const CPUMatrix& /*a*/, const CPUMatrix& /*b*/, const CPUMatrix& /*c*/, CPUMatrix& /*o*/, float /*alpha*/, ElementWiseOperator /*op*/, ElementWiseOperator /*reductionOp*/, + const array& /*offsets*/, + const SmallVector& /*regularOpDims*/, const array, 4>& /*regularStrides*/, + const SmallVector& /*reducingOpDims*/, const array, 4>& /*reducingStrides*/) +{ + return false; +} + +template<> +bool CPUMatrixSpecialUnaryTensorOpImpl(double, const CPUMatrix&, CPUMatrix&, double, ElementWiseOperator, ElementWiseOperator, + const array&, + const SmallVector&, const array, 2>&, + const SmallVector&, const array, 2>&) +{ + return false; +} + +template<> +bool CPUMatrixSpecialBinaryTensorOpImpl(double, const CPUMatrix&, const CPUMatrix&, CPUMatrix&, double, ElementWiseOperator, ElementWiseOperator, + const array&, + const SmallVector&, const array, 3>&, + const SmallVector&, const array, 3>&) +{ + return false; +} + +template<> +bool CPUMatrixSpecialTernaryTensorOpImpl(double, const CPUMatrix&, const CPUMatrix&, const CPUMatrix&, CPUMatrix&, double, ElementWiseOperator, ElementWiseOperator, + const array&, + const SmallVector&, const array, 4>&, + const SmallVector&, const array, 4>&) +{ + return false; +} + +template<> +bool CPUMatrixSpecialUnaryTensorOpImpl(half, const CPUMatrix&, CPUMatrix&, half, ElementWiseOperator, ElementWiseOperator, + const array&, + const SmallVector&, const array, 2>&, + const SmallVector&, const array, 2>&) +{ + return false; +} + +template<> +bool CPUMatrixSpecialBinaryTensorOpImpl(half, const CPUMatrix&, const CPUMatrix&, CPUMatrix&, half, ElementWiseOperator, ElementWiseOperator, + const array&, + const SmallVector&, const array, 3>&, + const SmallVector&, const array, 3>&) +{ + return false; +} + +template<> +bool CPUMatrixSpecialTernaryTensorOpImpl(half, const CPUMatrix&, const CPUMatrix&, const CPUMatrix&, CPUMatrix&, half, ElementWiseOperator, ElementWiseOperator, + const array&, + const SmallVector&, const array, 4>&, + 
const SmallVector&, const array, 4>&) +{ + return false; +} + +}}} + +#endif \ No newline at end of file diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp index a886e457556..bce3af6652f 100644 --- a/Source/Math/CPUSparseMatrix.cpp +++ b/Source/Math/CPUSparseMatrix.cpp @@ -20,6 +20,7 @@ #ifdef LEAKDETECT #include #endif +#include "half.hpp" #pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this @@ -480,7 +481,7 @@ CPUSparseMatrix& CPUSparseMatrix::DoScatterColumnsOf(ElemTyp // TODO: Replace with std::exclusive_scan when we switch to C++17 for (size_t i = 1; i <= GetNumCols(); ++i) SecondaryIndexLocation()[i] = SecondaryIndexLocation()[i - 1] + columnElementCounts[i - 1]; - + size_t offset = a.SecondaryIndexLocation()[0]; // TODO: Does it make sense to parallelize this? for (long j = 0; j < numColsToWrite; j++) @@ -531,7 +532,7 @@ void CPUSparseMatrix::Print(const char* matrixName, ptrdiff_t /*rowSta fprintf(stderr, "\n"); j++; } - fprintf(stderr, "%d:%.f ", unCompressedIndex[i], dataBuffer[i]); + fprintf(stderr, "%d:%.f ", unCompressedIndex[i], (double)dataBuffer[i]); } fprintf(stderr, "\n"); } @@ -721,7 +722,7 @@ void CPUSparseMatrix::SetMatrixFromSBCFormat(const size_t* blockIds, c template ElemType* CPUSparseMatrix::Data() const { - return (Buffer() + + return (Buffer() + ((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0)); } @@ -810,7 +811,7 @@ template void CPUSparseMatrix::RequireSizeAndAllocate(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*= true*/) { RequireSize(numRows, numCols, numNZElemToReserve, matrixFormat, growOnly); - + size_t newCompIndexSize = (numCols > numRows ? numCols : numRows) + 1; bool reallocate = (GetSizeAllocated() < numNZElemToReserve || (GetSizeAllocated() > numNZElemToReserve && !growOnly) || GetCompIndexSize() < newCompIndexSize); @@ -964,7 +965,7 @@ class MultiplyDenseAndSparse{ else if ( denseTimesSparse && transposeA) denseVal = dense( innerIndex, outerIndexDense); else if (!denseTimesSparse && !transposeB) denseVal = dense( innerIndex, outerIndexDense); else if (!denseTimesSparse && transposeB) denseVal = dense(outerIndexDense, innerIndex); - + // Update matrix c. if (denseTimesSparse) @@ -1312,7 +1313,7 @@ void CPUSparseMatrix::InnerProduct(const CPUSparseMatrix& a, } // A helper method used in MomentumSGDUpdate and NesterovAcceleratedMomentumSGDUpdate. -// Modifies the smoothed gradients "c", as well as the current gradients "this" on which this method is invoked. +// Modifies the smoothed gradients "c", as well as the current gradients "this" on which this method is invoked. 
 // Classic momentum (unitGainFactor == 1.0):
 // 1) c = momentum * c + this
 // Unit-gain momentum (unitGainFactor == 1.0 - momentum):
 // 1) c = momentum * c + (1.0 - momentum) * this
@@ -1423,7 +1424,8 @@ ElemType CPUSparseMatrix::Adagrad(CPUMatrix& c, const bool n
 }
 
 template <class ElemType>
-void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp)
+template <class AccumType>
+void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp)
 {
     size_t numColsNeeded = 2 * GetNumCols();
@@ -1441,9 +1443,9 @@ void CPUSparseMatrix::AdaDelta(CPUMatrix& c, CPUMatrix::AdaDelta(CPUMatrix& c, CPUMatrix::SumOfElements() const return sum; }
+// Specialized to RuntimeError for now because the omp implementation only supports built-in types
+template <>
+half CPUSparseMatrix<half>::FrobeniusNorm() const
+{
+    RuntimeError("half FrobeniusNorm not supported.");
+}
+template <>
+half CPUSparseMatrix<half>::SumOfElements() const
+{
+    RuntimeError("half SumOfElements not supported.");
+}
+
 template <class ElemType>
 MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
 {
@@ -1763,8 +1777,54 @@ MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
 template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
 template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
 
+template <class ElemType>
+MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
+{
+    if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
+        NOT_IMPLEMENTED;
+
+    stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
+    stream << sizeof(ElemType);
+    stream << std::wstring(L"nnmatrix"); // Note this is needed for compatibility, and could potentially be an empty string
+
+    size_t nz = us.NzCount(), numRows = us.GetNumRows(), numCols = us.GetNumCols();
+    size_t compressedSize = us.SecondaryIndexCount();
+    int format = us.GetFormat();
+
+    stream << format << nz << numCols << numRows;
+
+    if (nz > 0)
+    {
+        ElemType* dataBuffer = us.NzValues();
+        CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
+        CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
+
+        for (size_t i = 0; i < nz; ++i)
+        {
+            stream << dataBuffer[i];
+        }
+        for (size_t i = 0; i < nz; ++i)
+        {
+            stream << unCompressedIndex[i];
+        }
+        for (size_t i = 0; i < compressedSize; ++i)
+        {
+            stream << compressedIndex[i];
+        }
+    }
+    stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
+
+    return stream;
+}
+
 template class CPUSparseMatrix<float>;
 template class CPUSparseMatrix<double>;
+template class CPUSparseMatrix<half>;
+
+// instantiate learner methods
+template void CPUSparseMatrix<float>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
+template void CPUSparseMatrix<double>::AdaDelta(CPUMatrix<double>& c, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon, int* timestamps, int currentTimestamp);
+template void CPUSparseMatrix<half>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
 
 // We use Matrix as the backing store for QuantizedMatrix
 // Let's explicitly instantiate the methods we need for that purpose
diff --git a/Source/Math/CPUSparseMatrix.h b/Source/Math/CPUSparseMatrix.h
index 122d4cc4598..88c93bed3cb 100644
--- a/Source/Math/CPUSparseMatrix.h
+++ b/Source/Math/CPUSparseMatrix.h
@@ -232,7 +232,9 @@ class MATH_API CPUSparseMatrix : public BaseMatrix
 public:
     void NormalGrad(CPUMatrix<ElemType>&
c, const ElemType momentum, ElemType unitGainFactor); ElemType Adagrad(CPUMatrix& c, const bool needAveMultiplier); - void AdaDelta(CPUMatrix& c, CPUMatrix& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp); + + template + void AdaDelta(CPUMatrix& c, CPUMatrix& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp); public: CPUSparseMatrix& InplaceTruncateTop(const ElemType threshold); diff --git a/Source/Math/CntkBatchNormalization.cuh b/Source/Math/CntkBatchNormalization.cuh index 2a0e16dcf2f..a56011d2c38 100644 --- a/Source/Math/CntkBatchNormalization.cuh +++ b/Source/Math/CntkBatchNormalization.cuh @@ -1,23 +1,11 @@ // // Copyright (c) Microsoft. All rights reserved. +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // #pragma once -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) // 'identifier': unreferenced formal parameter -#pragma warning(disable : 4127) // conditional expression is constant -#pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union -#pragma warning(disable : 4458) // declaration of 'identifier' hides class member -#pragma warning(disable : 4515) // 'namespace': namespace uses itself -#endif -#include -#ifdef _MSC_VER -#pragma warning(pop) -#endif - namespace Microsoft { namespace MSR { namespace CNTK { size_t RoundUpToMultiple(size_t n, size_t blockSize) @@ -40,7 +28,7 @@ cudaError_t GetLastCudaError() #endif return cudaSuccess; } - +/* template __device__ __forceinline__ void LoadValues(const T* src, T dst[U]) { @@ -48,9 +36,17 @@ __device__ __forceinline__ void LoadValues(const T* src, T dst[U]) for (int i = 0; i < U; i++) dst[i] = src[i]; } +*/ +template +__device__ __forceinline__ void LoadValues(const T1* src, T2 dst[U]) +{ +#pragma unroll + for (int i = 0; i < U; i++) + dst[i] = (T2)src[i]; +} template <> -__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2]) +__device__ __forceinline__ void LoadValues<2, float, float>(const float* src, float dst[2]) { // src must be aligned at 8 bytes boundary. assert(reinterpret_cast(src) % (sizeof(dst)) == 0); @@ -60,7 +56,7 @@ __device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst } template <> -__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4]) +__device__ __forceinline__ void LoadValues<4, float, float>(const float* src, float dst[4]) { // src must be aligned at 16 bytes boundary. assert(reinterpret_cast(src) % (sizeof(dst)) == 0); @@ -73,7 +69,7 @@ __device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst dst[2] = v.z; dst[3] = v.w; } - +/* template __device__ __forceinline__ void StoreValues(const T src[U], T* dst) { @@ -81,9 +77,17 @@ __device__ __forceinline__ void StoreValues(const T src[U], T* dst) for (int i = 0; i < U; i++) dst[i] = src[i]; } +*/ +template +__device__ __forceinline__ void StoreValues(const T1 src[U], T2* dst) +{ +#pragma unroll + for (int i = 0; i < U; i++) + dst[i] = (T2)src[i]; +} template <> -__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst) +__device__ __forceinline__ void StoreValues<2, float, float>(const float src[2], float* dst) { // dst must be aligned at 8 bytes boundary. 
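+    // (The float specializations use vectorized float2/float4 loads and stores, which
+    // the hardware requires to be naturally aligned; sizeof(src) is 8 bytes here and
+    // 16 bytes in the 4-wide variant below, hence the alignment asserts.)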
     assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
@@ -94,7 +98,7 @@ __device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
 }
 
 template <>
-__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
+__device__ __forceinline__ void StoreValues<4, float, float>(const float src[4], float* dst)
 {
     // dst must be aligned at 16 bytes boundary.
     assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
@@ -107,12 +111,16 @@ __device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
 }
 
 template <typename T>
-__device__ __forceinline__ T Shuffle(T input, int srcLane)
+__device__ __forceinline__ T Shuffle(T input, int srcLane, unsigned int mask)
 {
 #ifdef __CUDA_ARCH__
     // shfl is supported only on Kepler+
     static_assert(__CUDA_ARCH__ >= 300, "CNTK supports only Kepler GPU architecture or newer.");
+#if CUDA_VERSION >= 9000
+    return cub::ShuffleIndex(input, srcLane, CUB_PTX_WARP_THREADS, mask); // Need cub > 1.7.0
+#else
     return cub::ShuffleIndex(input, srcLane);
+#endif
 #else
     assert(false);
     return input; // keep compiler happy
@@ -136,6 +144,15 @@ namespace Operations
     assert(::isfinite(a) && a > 0);
     return rsqrt(a);
 }
+
+__device__ half RSqrt(half a)
+{
+#if __CUDA_ARCH__ >= 600
+    return hrsqrt(a);
+#else
+    return __float2half(rsqrtf(__half2float(a)));
+#endif
+}
 }
 
 // This function is used to select the correct unroll factor.
@@ -151,6 +168,17 @@ void Call(size_t vectorSize, Targs... args)
     Func<1>::template Call(args...);
 }
 
+template