CSharpExtractor: sample variable pairs per method and write results to a file.

What the patch does
  * Extractor.cs / Utilities.cs: adds --max_contexts and a reservoir sampler that
    caps the number of variable pairs explored per method.
  * Program.cs / Utilities.cs: adds --ofile_name; results are appended to that file
    instead of being printed to stdout.
  * Variable.cs: registers each variable name only once.
  * extract.py / preprocess_csharp.sh: pass --ofile_name through instead of
    redirecting stdout.

Review fixes folded into this revision
  * --max_contexts no longer reuses the short name 'l' (already used by --max_width);
    its help text now says what is actually sampled (variable pairs).
  * ReservoirSample validates its arguments and no longer preallocates numSamples
    slots on every call.
  * Variable de-duplication uses Any() instead of building a temporary list.
  * extract.py sends diagnostics to stderr (it printed the sys.stderr object to
    stdout) and drops the dead 'failed' assignment.
  * preprocess_csharp.sh removes each data file first, because the extractor now
    appends where the old '>' redirection truncated.
  * Whitespace-only line changes were dropped.

diff --git a/CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs b/CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
index 0184b25..98a9d03 100644
--- a/CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
+++ b/CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
@@ -6,6 +6,8 @@
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
+using System.Diagnostics;
+
 
 namespace Extractor
 {
@@ -27,12 +29,14 @@ public class Extractor
         public int WidthLimit { get; set; }
         public string Code { get; set; }
         public bool ShouldHash { get; set; }
+        public int MaxContexts { get; set; }
 
         public Extractor(string code, Options opts)
         {
             LengthLimit = opts.MaxLength;
             WidthLimit = opts.MaxWidth;
             ShouldHash = !opts.NoHash;
+            MaxContexts = opts.MaxContexts;
             Code = code;
         }
 
@@ -104,29 +108,32 @@ private string PathToString(PathFinder.Path path)
             return builder.ToString();
         }
 
         internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
         {
             var finder = new PathFinder(tree, LengthLimit, WidthLimit);
-            foreach (Tuple<Variable, Variable> varPair in
-                Utilities.WeakConcat(Utilities.Choose2(variables),
-                    variables.Select((arg) => new Tuple<Variable, Variable>(arg,arg))))
-            {
-                bool pathToSelf = varPair.Item1 == varPair.Item2;
-
-                foreach(var lhs in varPair.Item1.Leaves)
-                    foreach (var rhs in varPair.Item2.Leaves)
-                    {
-                        if (lhs == rhs)
-                            continue;
-
-                        PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);
-
-                        if (path == null)
-                            continue;
-
-
-                        yield return path;
-                    }
+
+            var allPairs = Utilities.ReservoirSample(Utilities.WeakConcat(Utilities.Choose2(variables),
+                        variables.Select((arg) => new Tuple<Variable, Variable>(arg, arg))), MaxContexts);
+
+            // Iterate over the sampled variable-variable pairs (at most MaxContexts of them).
+            foreach (Tuple<Variable, Variable> varPair in allPairs)
+            {
+                bool pathToSelf = varPair.Item1 == varPair.Item2;
+
+                foreach (var rhs in varPair.Item2.Leaves)
+                    foreach (var lhs in varPair.Item1.Leaves)
+                    {
+
+                        if (lhs == rhs)
+                            continue;
+
+                        PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);
+
+                        if (path == null)
+                            continue;
+
+                        yield return path;
+                    }
             }
         }
 
@@ -167,6 +174,7 @@ public List<String> Extract()
             List<String> results = new List<String>();
             foreach(var method in methods)
             {
+
                 String methodName = method.Identifier.ValueText;
                 Tree methodTree = new Tree(method);
                 var subtokensMethodName = Utilities.SplitToSubtokens(methodName);
@@ -185,9 +193,11 @@ public List<String> Extract()
 
                 foreach (PathFinder.Path path in GetInternalPaths(methodTree))
                 {
-                    contexts.Add(SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
+                    String pathString = SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
                         + "," + MaybeHash(this.PathNodesToString(path))
-                        + "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name));
+                        + "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name);
+                    Debug.WriteLine(path.Left.FullSpan+" "+tokenToVar[path.Left].Name+ "," +this.PathNodesToString(path)+ "," + tokenToVar[path.Right].Name+" "+path.Right.FullSpan);
+                    contexts.Add(pathString);
                 }
 
                 var commentNodes = tree.GetRoot().DescendantTrivia().Where(
diff --git a/CSharpExtractor/CSharpExtractor/Extractor/Program.cs b/CSharpExtractor/CSharpExtractor/Extractor/Program.cs
index c383301..d3b51d3 100644
--- a/CSharpExtractor/CSharpExtractor/Extractor/Program.cs
+++ b/CSharpExtractor/CSharpExtractor/Extractor/Program.cs
@@ -44,9 +44,12 @@ static void Main(string[] args)
 
             results = files.AsParallel().WithDegreeOfParallelism(options.Threads).SelectMany(filename => ExtractSingleFile(filename, options));
 
-            foreach (var res in results)
+            using (StreamWriter sw = new StreamWriter(options.OFileName, append: true))
             {
-                Console.WriteLine(res);
+                foreach (var res in results)
+                {
+                    sw.WriteLine(res);
+                }
             }
         }
     }
diff --git a/CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs b/CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
index 4604fc9..666eec1 100644
--- a/CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
+++ b/CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
@@ -3,6 +3,7 @@
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
+using System.Diagnostics;
 using System.Text.RegularExpressions;
 
 namespace Extractor
@@ -21,11 +22,17 @@ public class Options
         [Option('l', "max_width", Default = 2, HelpText = "Max path length")]
         public int MaxWidth { get; set; }
 
+        [Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")]
+        public String OFileName { get; set; }
+
         [Option('h', "no_hash", Default = false, HelpText = "When enabled, prints the whole path strings (not hashed)")]
         public Boolean NoHash { get; set; }
+
+        [Option("max_contexts", Default = 30000, HelpText = "Max number of variable pairs to sample per method when extracting path contexts. Affects only very large snippets")]
+        public int MaxContexts { get; set; }
     }
 
-    public class Utilities
+    public static class Utilities
     {
         public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" };
         public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
@@ -40,7 +47,55 @@ public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
                 }
             }
         }
+        /// <summary>
+        /// Uniformly samples at most <paramref name="numSamples"/> elements from <paramref name="input"/>
+        /// in a single pass, using reservoir sampling.
+        /// See https://en.wikipedia.org/wiki/Reservoir_sampling
+        /// </summary>
+        /// <typeparam name="T">Type of the sampled elements.</typeparam>
+        /// <param name="input">Sequence to sample from.</param>
+        /// <param name="numSamples">Maximum number of elements to return; must not be negative.</param>
+        /// <returns>Every element of the input when it holds no more than numSamples elements; otherwise numSamples elements chosen uniformly at random.</returns>
+        /// <exception cref="ArgumentNullException">input is null.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">numSamples is negative.</exception>
+        public static IEnumerable<T> ReservoirSample<T>(this IEnumerable<T> input, int numSamples)
+        {
+            if (input == null)
+            {
+                throw new ArgumentNullException(nameof(input));
+            }
+            if (numSamples < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(numSamples), "numSamples must not be negative.");
+            }
+
+            var rng = new Random();
+            // Not presized: this runs once per method, and reserving numSamples slots up front is wasted on small inputs.
+            var sampledElements = new List<T>();
+            int seenElementCount = 0;
+            foreach (var element in input)
+            {
+                seenElementCount++;
+                if (sampledElements.Count < numSamples)
+                {
+                    sampledElements.Add(element);
+                }
+                else
+                {
+                    // The reservoir is full: the new element replaces a random slot with probability numSamples / seenElementCount.
+                    int position = rng.Next(seenElementCount);
+                    if (position < numSamples)
+                    {
+                        sampledElements[position] = element;
+                    }
+                }
+            }
+            Debug.Assert(sampledElements.Count <= numSamples);
+            return sampledElements;
+        }
+
+
         public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
         {
             foreach (T t in enumerable1)
                 yield return t;
diff --git a/CSharpExtractor/CSharpExtractor/Extractor/Variable.cs b/CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
index 0fe8ae3..ff94959 100644
--- a/CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
+++ b/CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
@@ -94,8 +94,13 @@ internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
                 string name = tokenToName[leaf];
                 SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
                 var v = new Variable(name, syntaxTokens, methodTree);
-                results.Add(v);
+
+                // Register each variable name once, even when several leaves share it.
+                if (!results.Any(p => p.Name == name))
+                {
+                    results.Add(v);
+                }
             }
             return results;
         }
 
diff --git a/CSharpExtractor/extract.py b/CSharpExtractor/extract.py
index 104a881..f7a5d98 100644
--- a/CSharpExtractor/extract.py
+++ b/CSharpExtractor/extract.py
@@ -27,35 +27,29 @@ def ParallelExtractDir(args, dir):
 
 def ExtractFeaturesForDir(args, dir, prefix):
     command = ['dotnet', 'run', '--project', args.csproj, '--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
-               '--path', dir, '--threads', str(args.num_threads)]
+               '--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]
+
 
     # print command
     # os.system(command)
     kill = lambda process: process.kill()
-    outputFileName = TMP_DIR + prefix + dir.split('/')[-1]
-    failed = False
-    with open(outputFileName, 'a') as outputFile:
-        sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE)
-        timer = Timer(600000, kill, [sleeper])
-
-        try:
-            timer.start()
-            stdout, stderr = sleeper.communicate()
-        finally:
-            timer.cancel()
-
-        if sleeper.poll() == 0:
-            if len(stderr) > 0:
-                print(sys.stderr, stderr, file=sys.stdout)
-        else:
-            print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time', file=sys.stdout)
-            failed = True
-            subdirs = get_immediate_subdirectories(dir)
-            for subdir in subdirs:
-                ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
-    if failed:
-        if os.path.exists(outputFileName):
-            os.remove(outputFileName)
+    sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
+    timer = Timer(600000, kill, [sleeper])
+
+    try:
+        timer.start()
+        _, stderr = sleeper.communicate()
+    finally:
+        timer.cancel()
+
+    if sleeper.poll() == 0:
+        if len(stderr) > 0:
+            print(stderr.decode(errors='replace'), file=sys.stderr)
+    else:
+        print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr)
+        subdirs = get_immediate_subdirectories(dir)
+        for subdir in subdirs:
+            ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
 
 
 def ExtractFeaturesForDirsList(args, dirs):
@@ -77,12 +71,14 @@ def ExtractFeaturesForDirsList(args, dirs):
 
 
 if __name__ == '__main__':
+
     parser = ArgumentParser()
     parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
     parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
     parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
     parser.add_argument("--csproj", dest="csproj", required=True)
     parser.add_argument("-dir", "--dir", dest="dir", required=False)
+    parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
     args = parser.parse_args()
 
     if args.dir is not None:
@@ -91,5 +87,3 @@ def ExtractFeaturesForDirsList(args, dirs):
         if len(subdirs) == 0:
             to_extract = [args.dir.rstrip('/')]
         ExtractFeaturesForDirsList(args, to_extract)
-
-
diff --git a/preprocess_csharp.sh b/preprocess_csharp.sh
index 54ec490..4a43105 100644
--- a/preprocess_csharp.sh
+++ b/preprocess_csharp.sh
@@ -39,13 +39,16 @@ mkdir -p data
 mkdir -p data/${DATASET_NAME}
 
 echo "Extracting paths from validation set..."
-${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${VAL_DATA_FILE}
+rm -f "${VAL_DATA_FILE}"
+${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${VAL_DATA_FILE}
 echo "Finished extracting paths from validation set"
 echo "Extracting paths from test set..."
-${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TEST_DATA_FILE}
+rm -f "${TEST_DATA_FILE}"
+${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TEST_DATA_FILE}
 echo "Finished extracting paths from test set"
 echo "Extracting paths from training set..."
-${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TRAIN_DATA_FILE}
+rm -f "${TRAIN_DATA_FILE}"
+${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TRAIN_DATA_FILE}
 echo "Finished extracting paths from training set"
 
 TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2v