Merge pull request #11 from trishume/synstats

Synstats example program
trishume · Jul 5, 2016 · b62ce67 · b62ce67
2 parents 8041454 + 45027fc
commit b62ce67
Show file tree

Hide file tree

Showing 4 changed files with 203 additions and 0 deletions.
diff --git a/Readme.md b/Readme.md
@@ -98,6 +98,35 @@ This way from the time the edit happens to the time the new colouring gets rende
 
 Any time the file is changed the latest cached state is found, the cache is cleared after that point, and a background job is started. Any already running jobs are stopped because they would be working on old state. This way you can just have one thread dedicated to highlighting that is always doing the most up-to-date work, or sleeping.
 
+## Examples Available
+
+There are a number of examples of programs that use `syntect` in the `examples` folder and some code outside the repo:
+
+- `syncat` prints a highlighted file to the terminal using 24-bit colour ANSI escape codes. It demonstrates a simple file highlighting workflow.
+- `synhtml` prints an HTML file that will display the highlighted code. Demonstrates how syntect could be used by web servers and static site generators.
+- `synstats` collects a bunch of statistics about the code in a folder. Includes basic things like line count but also fancier things like number of functions. Demonstrates how `syntect` can be used for code analysis as well as highlighting, and how to use the APIs to parse out the semantic tokenization.
+- [`faiyels`](https://github.com/trishume/faiyels) is a little code minimap visualizer I wrote that uses `syntect` for highlighting.
+
+Here are the stats that `synstats` extracts from `syntect`'s codebase (not including examples and test data) as of [this commit](https://github.com/trishume/syntect/commit/10baa6888f84ea4ae35c746526302a8ff4956eb1):
+```
+################## Stats ###################
+File count:                               19
+Total characters:                     155504
+
+Function count:                          165
+Type count (structs, enums, classes):     64
+
+Code lines (traditional SLOC):          2960
+Total lines (w/ comments & blanks):     4011
+Comment lines (comment but no code):     736
+Blank lines (lines-blank-comment):       315
+
+Lines with a documentation comment:      646
+Total words written in doc comments:    4734
+Total words written in all comments:    5145
+Characters of comment:                 41099
+```
+
 ## License and Acknowledgements
 
 Thanks to [Textmate 2](https://github.com/textmate/textmate) and @defuz's [sublimate](https://github.com/defuz/sublimate) for the existing open source code I used as inspiration and in the case of sublimate's `tmTheme` loader, copy-pasted. All code (including defuz's sublimate code) is released under the MIT license.
diff --git a/examples/synstats.rs b/examples/synstats.rs
@@ -0,0 +1,169 @@
+//! An example of using syntect for code analysis.
+//! Basically a fancy lines of code count program that works
+//! for all languages Sublime Text supports and also counts things
+//! like number of functions and number of types defined.
+//!
+//! Another thing it does that other line count programs can't always
+//! do is properly count comments in embedded syntaxes. For example
+//! JS, CSS and Ruby comments embedded in ERB files.
+extern crate syntect;
+extern crate walkdir;
+use syntect::parsing::{SyntaxSet, ParseState, ScopeStackOp, ScopeStack};
+use syntect::highlighting::{ScopeSelector, ScopeSelectors};
+use syntect::easy::{ScopeRegionIterator};
+
+use std::path::Path;
+use std::io::{BufRead, BufReader};
+use std::fs::File;
+use walkdir::{DirEntry, WalkDir, WalkDirIterator};
+use std::str::FromStr;
+
+#[derive(Debug)] // Scope selectors that `count_line` uses to classify each token of a line.
+struct Selectors {
+    comment: ScopeSelector, // text counted as comment; the default excludes `comment.block.attribute`
+    doc_comment: ScopeSelectors, // documentation comments; only consulted for text that already matched `comment`
+    function: ScopeSelector, // each non-empty token matching this bumps the function count
+    types: ScopeSelectors, // each non-empty token matching any of these bumps the type count
+}
+
+impl Default for Selectors {
+    /// Builds the selector set used by the statistics pass.
+    ///
+    /// All selector strings are fixed literals, so a parse failure here is a
+    /// programming error and panics.
+    fn default() -> Selectors {
+        // Anything scoped as a comment, except attribute-style block comments.
+        let comment = ScopeSelector::from_str("comment - comment.block.attribute").unwrap();
+        // Line and block documentation comments.
+        let doc_comment =
+            ScopeSelectors::from_str("comment.line.documentation, comment.block.documentation").unwrap();
+        // Names of function definitions.
+        let function = ScopeSelector::from_str("entity.name.function").unwrap();
+        // Names of class, struct, enum and other type definitions.
+        let types = ScopeSelectors::from_str(
+            "entity.name.class, entity.name.struct, entity.name.enum, entity.name.type",
+        ).unwrap();
+
+        Selectors {
+            comment: comment,
+            doc_comment: doc_comment,
+            function: function,
+            types: types,
+        }
+    }
+}
+
+#[derive(Debug, Default)] // Running totals accumulated over every file that gets counted.
+struct Stats {
+    selectors: Selectors, // scope selectors used to classify tokens
+    files: usize, // files for which a syntax was found
+    functions: usize, // non-empty tokens matching the `function` selector
+    types: usize, // non-empty tokens matching the `types` selector
+    lines: usize, // every line read, blank ones included
+    chars: usize, // sum of `line.len()` over all lines, i.e. a count of UTF-8 bytes
+    code_lines: usize, // lines with at least one non-whitespace token outside a comment
+    comment_lines: usize, // lines that contain comment text and no code
+    comment_chars: usize, // summed `len()` (bytes) of all comment tokens
+    comment_words: usize, // whitespace-separated comment words made only of alphanumerics, '.' or '\''
+    doc_comment_lines: usize, // lines containing at least one documentation comment token
+    doc_comment_words: usize, // the part of `comment_words` found in documentation comments
+}
+
+/// Prints the collected totals to stdout as a fixed-width table.
+///
+/// Labels are padded to a common width and every value is right-aligned in a
+/// six character column.
+fn print_stats(stats: &Stats) {
+    // Whatever is neither a code line nor a comment-only line is reported as blank.
+    let non_blank_lines = stats.code_lines + stats.comment_lines;
+    let blank_lines = stats.lines - non_blank_lines;
+
+    println!("");
+    println!("################## Stats ###################");
+
+    // Overall size.
+    println!("File count:                           {:>6}", stats.files);
+    println!("Total characters:                     {:>6}", stats.chars);
+    println!("");
+
+    // Definitions found.
+    println!("Function count:                       {:>6}", stats.functions);
+    println!("Type count (structs, enums, classes): {:>6}", stats.types);
+    println!("");
+
+    // Line breakdown.
+    println!("Code lines (traditional SLOC):        {:>6}", stats.code_lines);
+    println!("Total lines (w/ comments & blanks):   {:>6}", stats.lines);
+    println!("Comment lines (comment but no code):  {:>6}", stats.comment_lines);
+    println!("Blank lines (lines-blank-comment):    {:>6}", blank_lines);
+    println!("");
+
+    // Comment details.
+    println!("Lines with a documentation comment:   {:>6}", stats.doc_comment_lines);
+    println!("Total words written in doc comments:  {:>6}", stats.doc_comment_words);
+    println!("Total words written in all comments:  {:>6}", stats.comment_words);
+    println!("Characters of comment:                {:>6}", stats.comment_chars);
+}
+
+/// Decides whether a directory entry is left out of the walk.
+///
+/// Hidden entries (a name starting with `.` that is longer than one character,
+/// so a bare `.` is still walked) and Markdown files (`.md`) are ignored.
+/// Names that are not valid UTF-8 are never ignored.
+fn is_ignored(entry: &DirEntry) -> bool {
+    match entry.file_name().to_str() {
+        Some(name) => {
+            let is_hidden = name.starts_with(".") && name.len() > 1;
+            is_hidden || name.ends_with(".md")
+        }
+        None => false,
+    }
+}
+
+/// Folds one parsed line into `stats`.
+///
+/// `ops` are the scope operations that `ParseState::parse_line` produced for
+/// `line`. `stack` must be the scope stack that was used for the previous lines
+/// of the same file: the ops only describe the changes made on this line, so a
+/// scope opened earlier (for example a block comment spanning several lines) is
+/// only visible when the stack is carried over from line to line.
+fn count_line(ops: &[(usize, ScopeStackOp)], line: &str, stack: &mut ScopeStack, stats: &mut Stats) {
+    stats.lines += 1;
+
+    let mut line_has_comment = false;
+    let mut line_has_doc_comment = false;
+    let mut line_has_code = false;
+    for (s, op) in ScopeRegionIterator::new(ops, line) {
+        // Always apply the op, even for empty regions, so the stack stays in sync.
+        stack.apply(op);
+        if s.is_empty() { // in this case we don't care about blank tokens
+            continue;
+        }
+        if stats.selectors.comment.does_match(stack.as_slice()).is_some() {
+            // Only tokens made purely of alphanumerics, '.' or '\'' count as words,
+            // which leaves out comment markers and punctuation-heavy tokens.
+            let words = s.split_whitespace()
+                .filter(|w| w.chars().all(|c| c.is_alphanumeric() || c == '.' || c == '\''))
+                .count();
+            if stats.selectors.doc_comment.does_match(stack.as_slice()).is_some() {
+                line_has_doc_comment = true;
+                stats.doc_comment_words += words;
+            }
+            stats.comment_chars += s.len();
+            stats.comment_words += words;
+            line_has_comment = true;
+        } else if !s.chars().all(|c| c.is_whitespace()) {
+            line_has_code = true;
+        }
+        if stats.selectors.function.does_match(stack.as_slice()).is_some() {
+            stats.functions += 1;
+        }
+        if stats.selectors.types.does_match(stack.as_slice()).is_some() {
+            stats.types += 1;
+        }
+    }
+    // A line with both code and a trailing comment counts as code only.
+    if line_has_comment && !line_has_code {
+        stats.comment_lines += 1;
+    }
+    if line_has_doc_comment {
+        stats.doc_comment_lines += 1;
+    }
+    if line_has_code {
+        stats.code_lines += 1;
+    }
+}
+
+/// Parses the file at `path` and adds its numbers to `stats`.
+///
+/// Files for which no syntax can be determined are skipped silently. A file
+/// that cannot be opened is skipped with a warning on stderr, and a read error
+/// (such as invalid UTF-8) ends the count for that file with a warning instead
+/// of aborting the whole run; lines read before the error stay counted.
+fn count(ss: &SyntaxSet, path: &Path, stats: &mut Stats) {
+    let syntax = match ss.find_syntax_for_file(path).unwrap_or(None) {
+        Some(syntax) => syntax,
+        None => return
+    };
+    let f = match File::open(path) {
+        Ok(f) => f,
+        Err(err) => {
+            eprintln!("skipping {}: {}", path.display(), err);
+            return;
+        }
+    };
+    stats.files += 1;
+    let mut state = ParseState::new(syntax);
+    // One scope stack per file, shared by all of its lines, so that scopes
+    // spanning several lines are still on the stack when later lines are counted.
+    let mut stack = ScopeStack::new();
+
+    let mut reader = BufReader::new(f);
+    let mut line = String::new();
+    loop {
+        match reader.read_line(&mut line) {
+            Ok(0) => break, // end of file
+            Ok(_) => {}
+            Err(err) => {
+                eprintln!("stopped reading {}: {}", path.display(), err);
+                break;
+            }
+        }
+        {
+            let ops = state.parse_line(&line);
+            stats.chars += line.len();
+            count_line(&ops, &line, &mut stack, stats);
+        }
+        line.clear();
+    }
+}
+
+/// Walks the directory given as the first command line argument (the current
+/// directory when none is given), counts every file that is not ignored and
+/// prints the totals.
+///
+/// Entries that cannot be read during the walk are reported on stderr and
+/// skipped rather than aborting the scan.
+fn main() {
+    let ss = SyntaxSet::load_defaults_newlines(); // note we load the version with newlines
+
+    let path = std::env::args().nth(1).unwrap_or_else(|| String::from("."));
+
+    println!("################## Files ###################");
+    let mut stats = Stats::default();
+    let walker = WalkDir::new(&path).into_iter();
+    for entry in walker.filter_entry(|e| !is_ignored(e)) {
+        let entry = match entry {
+            Ok(entry) => entry,
+            Err(err) => {
+                eprintln!("skipping entry: {}", err);
+                continue;
+            }
+        };
+        if entry.file_type().is_file() {
+            println!("{}", entry.path().display());
+            count(&ss, entry.path(), &mut stats);
+        }
+    }
+
+    print_stats(&stats);
+}
diff --git a/src/easy.rs b/src/easy.rs
@@ -122,6 +122,8 @@ impl<'a> HighlightFile<'a> {
 /// at the top of your `for` loop over this iterator. Now you have a substring of the line and the scope stack
 /// for that token.
 ///
+/// See the `synstats.rs` example for an example of using this iterator.
+///
 /// **Note:** This will often return empty regions, just `continue` after applying the op if you don't want them.
 #[derive(Debug)]
 pub struct ScopeRegionIterator<'a> {

diff --git a/src/parsing/scope.rs b/src/parsing/scope.rs
@@ -345,15 +345,18 @@ impl ScopeStack {
     }
 
     /// Return a slice of the scopes in this stack
+    #[inline]
     pub fn as_slice(&self) -> &[Scope] {
         &self.scopes[..] // full-range borrow of the `scopes` field
     }
 
     /// Return the height/length of this stack
+    #[inline]
     pub fn len(&self) -> usize {
         self.scopes.len() // number of scopes currently held in `scopes`
     }
 
+    #[inline]
     pub fn is_empty(&self) -> bool { // true when the stack holds no scopes
         self.len() == 0
     }