Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 36 additions & 26 deletions CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;


namespace Extractor
{
Expand All @@ -27,12 +29,14 @@ public class Extractor
public int WidthLimit { get; set; }
public string Code { get; set; }
public bool ShouldHash { get; set; }
public int MaxContexts { get; set; }

public Extractor(string code, Options opts)
public Extractor(string code, Options opts)
{
LengthLimit = opts.MaxLength;
WidthLimit = opts.MaxWidth;
ShouldHash = !opts.NoHash;
MaxContexts = opts.MaxContexts;
Code = code;
}

Expand Down Expand Up @@ -104,29 +108,32 @@ private string PathToString(PathFinder.Path path)
return builder.ToString();
}

internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
{
internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
{
var finder = new PathFinder(tree, LengthLimit, WidthLimit);
foreach (Tuple<Variable, Variable> varPair in
Utilities.WeakConcat(Utilities.Choose2(variables),
variables.Select((arg) => new Tuple<Variable,Variable>(arg,arg))))
{
bool pathToSelf = varPair.Item1 == varPair.Item2;

foreach(var lhs in varPair.Item1.Leaves)
foreach (var rhs in varPair.Item2.Leaves)
{
if (lhs == rhs)
continue;

PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);

if (path == null)
continue;


yield return path;
}

var allPairs = Utilities.ReservoirSample(Utilities.WeakConcat(Utilities.Choose2(variables),
variables.Select((arg) => new Tuple<Variable, Variable>(arg, arg))), MaxContexts);

//iterate over variable-variable pairs
foreach (Tuple<Variable, Variable> varPair in allPairs)
{
bool pathToSelf = varPair.Item1 == varPair.Item2;

foreach (var rhs in varPair.Item2.Leaves)
foreach (var lhs in varPair.Item1.Leaves)
{

if (lhs == rhs)
continue;

PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);

if (path == null)
continue;

yield return path;
}
}
}

Expand Down Expand Up @@ -167,6 +174,7 @@ public List<String> Extract()
List<String> results = new List<string>();

foreach(var method in methods) {

String methodName = method.Identifier.ValueText;
Tree methodTree = new Tree(method);
var subtokensMethodName = Utilities.SplitToSubtokens(methodName);
Expand All @@ -185,10 +193,12 @@ public List<String> Extract()

foreach (PathFinder.Path path in GetInternalPaths(methodTree))
{
contexts.Add(SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
String pathString = SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
+ "," + MaybeHash(this.PathNodesToString(path))
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name));
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name);

Debug.WriteLine(path.Left.FullSpan+" "+tokenToVar[path.Left].Name+ "," +this.PathNodesToString(path)+ "," + tokenToVar[path.Right].Name+" "+path.Right.FullSpan);
contexts.Add(pathString);
}

var commentNodes = tree.GetRoot().DescendantTrivia().Where(
Expand All @@ -206,7 +216,7 @@ public List<String> Extract()
contexts.Add(batch + "," + "COMMENT" + "," + batch);
}
}
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
}
return results;
}
Expand Down
7 changes: 5 additions & 2 deletions CSharpExtractor/CSharpExtractor/Extractor/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,12 @@ static void Main(string[] args)

results = files.AsParallel().WithDegreeOfParallelism(options.Threads).SelectMany(filename => ExtractSingleFile(filename, options));

foreach (var res in results)
using (StreamWriter sw = new StreamWriter(options.OFileName, append: true))
{
Console.WriteLine(res);
foreach (var res in results)
{
sw.WriteLine(res);
}
}
}
}
Expand Down
45 changes: 43 additions & 2 deletions CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
using System.Text.RegularExpressions;

namespace Extractor
Expand All @@ -21,11 +22,17 @@ public class Options
[Option('l', "max_width", Default = 2, HelpText = "Max path length")]
public int MaxWidth { get; set; }

[Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")]
public String OFileName { get; set; }

[Option('h', "no_hash", Default = false, HelpText = "When enabled, prints the whole path strings (not hashed)")]
public Boolean NoHash { get; set; }

/// <summary>
/// Upper bound on the number of path contexts to sample (see the HelpText below).
/// </summary>
// Registered by long name only: the short name 'l' is already used by "max_width",
// and sharing a short name would make "-l" ambiguous between the options.
[Option("max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")]
public int MaxContexts { get; set; }
}

public class Utilities
public static class Utilities
{
public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" };
public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
Expand All @@ -40,7 +47,41 @@ public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
}
}

public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
/// <summary>
/// Samples up to <paramref name="numSamples"/> elements uniformly at random from
/// <paramref name="input"/> in a single pass, using reservoir sampling.
/// See https://en.wikipedia.org/wiki/Reservoir_sampling
/// </summary>
/// <typeparam name="TSource">Element type of the sequence being sampled.</typeparam>
/// <param name="input">Sequence to sample from; it is enumerated exactly once.</param>
/// <param name="numSamples">Maximum number of elements to return; must not be negative.</param>
/// <returns>
/// All elements of <paramref name="input"/>, in their original order, when it contains at most
/// <paramref name="numSamples"/> elements; otherwise exactly <paramref name="numSamples"/>
/// elements chosen at random.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numSamples"/> is negative.</exception>
public static IEnumerable<TSource> ReservoirSample<TSource>(this IEnumerable<TSource> input, int numSamples)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }
    if (numSamples < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numSamples), numSamples, "Number of samples must not be negative.");
    }

    var rng = new Random();

    // Let the reservoir grow on demand instead of reserving numSamples slots up front:
    // the cap is usually far larger than the input, and a very large cap would otherwise
    // allocate (or fail to allocate) a huge backing array even for a tiny sequence.
    var sampledElements = new List<TSource>();
    int seenElementCount = 0;

    foreach (var element in input)
    {
        seenElementCount++;
        if (sampledElements.Count < numSamples)
        {
            // Reservoir not full yet: keep every element.
            sampledElements.Add(element);
        }
        else
        {
            // Reservoir full: the current element replaces a random slot with
            // probability numSamples / seenElementCount.
            int position = rng.Next(seenElementCount);
            if (position < numSamples)
            {
                sampledElements[position] = element;
            }
        }
    }

    Debug.Assert(sampledElements.Count <= numSamples);
    return sampledElements;
}


public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
{
foreach (T t in enumerable1)
yield return t;
Expand Down
11 changes: 9 additions & 2 deletions CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,15 @@ internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
string name = tokenToName[leaf];
SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
var v = new Variable(name, syntaxTokens, methodTree);
results.Add(v);
}

//check if exists
var matches = results.Where(p => p.Name == name).ToList();
bool alreadyExists = (matches.Count != 0);
if (!alreadyExists)
{
results.Add(v);
}
}

return results;
}
Expand Down
49 changes: 22 additions & 27 deletions CSharpExtractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,35 +27,30 @@ def ParallelExtractDir(args, dir):
def ExtractFeaturesForDir(args, dir, prefix):
command = ['dotnet', 'run', '--project', args.csproj,
'--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
'--path', dir, '--threads', str(args.num_threads)]
'--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]


# print command
# os.system(command)
kill = lambda process: process.kill()
outputFileName = TMP_DIR + prefix + dir.split('/')[-1]
failed = False
with open(outputFileName, 'a') as outputFile:
sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE)
timer = Timer(600000, kill, [sleeper])

try:
timer.start()
stdout, stderr = sleeper.communicate()
finally:
timer.cancel()

if sleeper.poll() == 0:
if len(stderr) > 0:
print(sys.stderr, stderr, file=sys.stdout)
else:
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time', file=sys.stdout)
failed = True
subdirs = get_immediate_subdirectories(dir)
for subdir in subdirs:
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
if failed:
if os.path.exists(outputFileName):
os.remove(outputFileName)
sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
timer = Timer(600000, kill, [sleeper])

try:
timer.start()
_, stderr = sleeper.communicate()
finally:
timer.cancel()

if sleeper.poll() == 0:
if len(stderr) > 0:
print(sys.stderr, stderr)
else:
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time')
failed = True
subdirs = get_immediate_subdirectories(dir)
for subdir in subdirs:
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')


def ExtractFeaturesForDirsList(args, dirs):
Expand All @@ -77,12 +72,14 @@ def ExtractFeaturesForDirsList(args, dirs):


if __name__ == '__main__':

parser = ArgumentParser()
parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
parser.add_argument("--csproj", dest="csproj", required=True)
parser.add_argument("-dir", "--dir", dest="dir", required=False)
parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
args = parser.parse_args()

if args.dir is not None:
Expand All @@ -91,5 +88,3 @@ def ExtractFeaturesForDirsList(args, dirs):
if len(subdirs) == 0:
to_extract = [args.dir.rstrip('/')]
ExtractFeaturesForDirsList(args, to_extract)


6 changes: 3 additions & 3 deletions preprocess_csharp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ mkdir -p data
mkdir -p data/${DATASET_NAME}

echo "Extracting paths from validation set..."
${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${VAL_DATA_FILE}
${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${VAL_DATA_FILE}
echo "Finished extracting paths from validation set"
echo "Extracting paths from test set..."
${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TEST_DATA_FILE}
${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TEST_DATA_FILE}
echo "Finished extracting paths from test set"
echo "Extracting paths from training set..."
${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TRAIN_DATA_FILE}
${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TRAIN_DATA_FILE}
echo "Finished extracting paths from training set"

TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2v
Expand Down