first commit

yssk22 · Aug 14, 2010 · 677b1de · 677b1de
commit 677b1de
Show file tree

Hide file tree

Showing 13 changed files with 2,488 additions and 0 deletions.
diff --git a/History.md b/History.md
@@ -0,0 +1,5 @@
+
+0.0.1 / YYYY-MM-DD
+------------------
+
+* Initial release
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,22 @@
+(The MIT License)
+
+Copyright (c) 2010 Yohei Sasaki &lt;yssk22@gmail.com&gt;
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Readme.md b/Readme.md
@@ -0,0 +1,23 @@
+
+# extractcontent
+
+This module extracts the title and the main content from HTML text.
+
+The algorithm is ported from the original Ruby implementation:
+
+* http://rubyforge.org/projects/extractcontent/
+* http://labs.cybozu.co.jp/blog/nakatani/2007/09/web_1.html
+
+## Usage
+
+    var ex = require('extractcontent')
+    ex.extractFromUrl('http://yssk22.blogspot.com/', function(error, result){ 
+       console.log(result.title); 
+       // -> Relaxed in Japan.
+       console.log(result.content); 
+       // -> last week ... 
+    });
+
+## Install
+
+    npm install extractcontent
diff --git a/lib/extractcontent.js b/lib/extractcontent.js
@@ -0,0 +1,26 @@
+var path = require('path');
+var extractor = require(path.join(__dirname, 'extractor')),
+    fetcher = require(path.join(__dirname, 'fetcher'));
+
+exports.extractFromUrl = function(urlStr, callback , options){
+   fetcher.fetchUrl(urlStr, function(error, code, header, content){
+      if(error){
+         callback(error, null);
+      }else{
+         _extract(content, callback, options);
+      }
+   });
+};
+
+exports.extractFromText = function(text, callback, options){
+   _extract(content, callback, options);
+};
+
/*
 * Runs extractor.extract on `content` and reports the outcome through
 * callback(error, result):
 *   - success: callback(false, result)
 *   - the extractor threw: callback(exception, null)
 *
 * The callback is invoked outside the try block on purpose: an exception
 * raised by the callback itself must propagate to the caller instead of
 * triggering a second, misleading callback invocation.
 */
function _extract(content, callback, options){
   var result;
   try{
      result = extractor.extract(content, options);
   }catch(e){
      callback(e, null);
      return;
   }
   callback(false, result);
}
diff --git a/lib/extractor.js b/lib/extractor.js
@@ -0,0 +1,195 @@
+/*
+ * ported from http://rubyforge.org/projects/extractcontent/
+ */
+
/*
 * Default tuning parameters; exports.extract falls back to these for every
 * key the caller leaves unset.
 *
 *   threshold          score a block must exceed to be added to the body
 *   min_length         blocks whose link-free text is shorter are skipped
 *   decay_factor       multiplier applied to the position factor after
 *                      each scored block
 *   continuous_factor  boost applied to the block following an accepted one
 *   punctuation_weight score added per punctuation match
 *   punctuations       what counts as punctuation
 *   waste_expressions  boilerplate phrases that lower a block's score
 *   debug              not read by the code visible in this file
 */
var DEFAULT_OPTS = {
   threshold : 100,
   min_length : 80,
   decay_factor : 0.73,
   continuous_factor : 1.62,
   punctuation_weight : 10,
   punctuations : /([、。，．！？]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/i,
   waste_expressions : /Copyright|All Rights Reserved/i,
   debug : false
};

// Framesets and meta-refresh redirects: only the title is extracted.
// Declared with `var` so it no longer leaks as an implicit global.
var SKIP_ANALYSIS = /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i;

// Sections explicitly marked weight=ignore. `[\s\S]*?` is used because `.`
// does not match newlines in JavaScript, and such sections usually span
// several lines.
var GOOGLE_ADSENSE_IGNORE = /<!--\s*google_ad_section_start\(weight=ignore\)\s*-->[\s\S]*?<!--\s*google_ad_section_end.*?-->/g;
+
+exports.extract = function(html, options){
+   if(html.match(SKIP_ANALYSIS)){
+      return  {'title': extract_title(html), 'content':''};
+   }
+
+   options = options || {};
+   for(k in DEFAULT_OPTS){
+      if(options[k] == undefined){
+         options[k] = DEFAULT_OPTS[k];
+      }
+   }
+   var title = undefined;
+   var content = undefined;
+   if(html.match(/<\/head\s*>/im)){
+      title = extract_title(RegExp.leftContext);
+      if( title != ''){
+         html = RegExp.rightContext;
+      }else{
+         title = extract_title(html);
+      }
+   }else{
+      title = extract_title(html);
+   }
+
+   html = html.replace(GOOGLE_ADSENSE_IGNORE, '');
+   if(html.match(/<!--\s*google_ad_section_start[^>]*-->/)){
+      //html.match(/<!--\s*google_ad_section_start[^>]*-->[\s\S]*<!--\s*google_ad_section_end.*?-->/img);
+      html = scan(html, /<!--\s*google_ad_section_start[^>]*-->[\s\S]*<!--\s*google_ad_section_end.*?-->/img).join("\n");
+   }
+   html = eliminate_useless_tags(html);
+
+   html = html.replace(/(<h\d\s*>\s*(.*?)\s*<\/h\d\s*>)/ig, function(m){
+      if( RegExp.$2.length >= 3 && title.indexOf(RegExp.$2) >= 0){
+         return "<div>" + RegExp.$2 + "</div>";
+      }else{
+         return RegExp.$2;
+      }
+   });
+
+   var factor = 1.0, continuous = 1.0,
+       body = '', score = 0, bodylist = [];
+   var list = html.split(/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/);
+   for(var i in list){
+      var block = list[i];
+      if( !block ){ continue; }
+
+      block = strip(block);
+      if( has_only_tags(block) ){ continue; }
+
+      if( body.length > 0 ){
+         continuous = continuous / options['continuous_factor'];
+      }
+      var notlinked = eliminate_link(block);
+      if( notlinked.length < options['min_length']){
+         continue;
+      }
+
+      var c = (notlinked.length + scan(notlinked, options.punctuations).length * options.punctuation_weight) * factor;
+      factor = factor * options.decay_factor;
+      var not_body_rate =
+         scan(block, options.waste_expressions).length +
+         scan(block, /amazon[a-z0-9\.\/\-\?&]+-22/ig).length / 2.0;
+
+      if(not_body_rate > 0){
+         c = c + Math.pow(0.72, not_body_rate);
+      }
+      var c1 = c * continuous;
+      if( c1 > options['threshold']){
+         body = body + block + '\n';
+         score = score + c1;
+         continuous = options.continuous_factor;
+      }else if(c > options['threshold']){
+         bodylist.push([body, score]);
+         body = body + block + '\n';
+         score = c;
+         continuous = options.continuous_factor;
+      }
+      bodylist.push(body, score);
+   }
+   var s = 0;
+
+   bodylist.forEach(function(a){
+      if(s < a[1]){
+         content = a[0];
+         s = a[1];
+      }
+   });
+
+   return {
+      'title' : title,
+      'content' : strip(strip_tags(content))
+   };
+};
+
/*
 * Returns the document title found in `st`:
 *   - the text of the first <title> element, surrounding whitespace removed;
 *   - otherwise the tag-stripped text of the first <h1>..<h9> element;
 *   - otherwise ''.
 */
function extract_title(st){
   // `\s*` before </title>: the previous `\s` demanded a whitespace
   // character there, so `<title>Foo</title>` was never recognised.
   var found = /<title[^>]*>\s*(.*?)\s*<\/title\s*>/i.exec(st);
   if( found ){
      return found[1];
   }
   found = /<h\d[^>]*>\s*(.*?)\s*<\/h\d\s*>/i.exec(st);
   if( found ){
      return strip_tags(found[1]);
   }
   return '';
}
+
/*
 * Returns `html` with markup removed that never carries body text:
 *   - script / style / select / noscript elements including their content
 *   - HTML comments
 *   - <!DOCTYPE ...> style declarations
 *   - div elements with class "alpslab-slide", including their content
 *   - the opening tag of divs whose id or class contains "more"
 *
 * Strings are immutable, so each replace result has to be carried forward;
 * the previous code discarded them and returned its input unchanged.
 * `[\s\S]*?` is used where the removed content may span several lines.
 */
function eliminate_useless_tags(html){
   return html
      .replace(/<(script|style|select|noscript)[^>]*>[\s\S]*?<\/\1\s*>/ig, '')
      .replace(/<!--[\s\S]*?-->/g, '')
      .replace(/<![A-Za-z].*?>/g, '')
      .replace(/<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>[\s\S]*?<\/div\s*>/ig, '')
      .replace(/<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/ig, '');
}
+
/*
 * Returns the plain text of `html` with anchors and forms removed.
 *
 * Returns '' instead when the block is mostly navigation, i.e. when fewer
 * than 20 characters of text remain per removed link, or when islinklist
 * classifies the block as a link list.
 */
function eliminate_link(html){
   var count = 0;
   // `[\s\S]*?` rather than `.*?`: anchors and forms whose content spans
   // several lines must be removed (and counted) as well.
   var notlinked = html.replace(/<a\s[^>]*>[\s\S]*?<\/a\s*>/ig, function(){
      count = count + 1;
      return '';
   });
   notlinked = notlinked.replace(/<form\s[^>]*>[\s\S]*?<\/form\s*>/ig, '');
   notlinked = strip_tags(notlinked);
   if(notlinked.length < 20 * count || islinklist(html)){
      return '';
   }
   return notlinked;
}
+
/*
 * Decides whether the block `st` is essentially a list of links.
 *
 * Returns null when `st` contains no ul/dl/ol element. Otherwise returns
 * true when the text outside the lists is short compared with the whole
 * block; the allowed share grows with the fraction of list items that are
 * links (see evaluate_list).
 */
function islinklist(st){
   // Non-global exec yields the first list; `[\s\S]` lets it span lines.
   var found = /<(?:ul|dl|ol)([\s\S]+?)<\/(?:ul|dl|ol)>/i.exec(st);
   if( !found ){
      return null;
   }
   var listpart = found[1];
   // Text that remains once lists and tags are removed, with every run of
   // whitespace collapsed. Only ul/dl are removed here (not ol); that is
   // kept exactly as it was.
   var outside = st.replace(/<(?:ul|dl)([\s\S]+?)<\/(?:ul|dl)>/ig, '')
                   .replace(/<[\s\S]+?>/g, '')
                   .replace(/\s+/g, ' ');
   var list = listpart.split(/<li[^>]*>/);
   list.shift();   // drop whatever precedes the first <li>
   var rate = evaluate_list(list);
   return outside.length <= st.length / (45 / rate);
}
+
/*
 * Rates how link-heavy a list is.
 *
 * list - array of list-item strings.
 *
 * Returns a number between 1 and 10: 9 * r^2 + 1, where r is the fraction
 * of items that contain an <a href=...>. An empty list rates 1, as a number,
 * so that every return value has the same type.
 */
function evaluate_list(list){
   if(list.length == 0){
      return 1;
   }
   var hit = 0;
   list.forEach(function(line){
      if(/<a\s+href=(['"]?)([^"'\s]+)\1/i.test(line)){
         hit = hit + 1;
      }
   });
   var ratio = hit / list.length;
   return 9 * (ratio * ratio) + 1;
}
+
/*
 * Tells whether `st` carries no visible text: true when nothing but
 * whitespace is left after tags and &nbsp; entities have been removed.
 */
function has_only_tags(st){
   var without_tags = st.replace(/<[^>]*>/img, '');
   var text = without_tags.replace(/\&nbsp\;/img, '');
   return strip(text).length == 0;
}
+
/*
 * Returns `st` without leading and trailing whitespace.
 *
 * Uses String.prototype.trim, which strips the same character set as `\s`
 * but without the backtracking the previous lazy regular expression did on
 * long whitespace runs inside the text.
 */
function strip(st){
   return st.trim();
}
+
/*
 * Returns an array with every match of `regex` in `st` ([] when there is
 * none), so that `.length` is the number of occurrences.
 *
 * String#match only returns all matches for a global pattern; for a
 * non-global one it returns the first match followed by its capture groups.
 * Non-global patterns are therefore matched through a global copy; the
 * pattern passed in is never modified.
 */
function scan(st, regex){
   var pattern = regex;
   if(regex instanceof RegExp && !regex.global){
      var flags = (typeof regex.flags === 'string')
         ? regex.flags
         : (regex.ignoreCase ? 'i' : '') + (regex.multiline ? 'm' : '');
      pattern = new RegExp(regex.source, flags + 'g');
   }
   return st.match(pattern) || [];
}
+
/*
 * Returns the string form of `html` with every tag removed.
 * Falsy input (undefined, null, '', 0, ...) yields ''.
 */
function strip_tags(html){
   if( !html ){
      return '';
   }
   return html.toString().replace(/<\/?[^>]+>/img, '');
}
diff --git a/lib/fetcher.js b/lib/fetcher.js
@@ -0,0 +1,49 @@
+var http = require('http'),
+    url = require('url');
+
// Maximum number of redirects followed for one fetch. Declared with `var`
// so it no longer leaks as an implicit global.
var MAX_REDIRECTION = 5;

/*
 * Builds an error constructor that inherits from Error, so instances carry
 * a name, a message and (where the engine supports it) a stack trace, and
 * `instanceof Error` checks in callers succeed.
 */
function define_error(name, default_message){
   function FetchError(message){
      this.name = name;
      this.message = message || default_message;
      if(Error.captureStackTrace){
         Error.captureStackTrace(this, FetchError);
      }
   }
   FetchError.prototype = Object.create(Error.prototype);
   FetchError.prototype.constructor = FetchError;
   return FetchError;
}

// Error types reported through the fetch callback.
var errors = {
   InvalidUrlError : define_error('InvalidUrlError', 'Invalid URL'),
   TooManyRedirectionError : define_error('TooManyRedirectionError', 'Too many redirections')
};
+
+exports.errors = errors;
+exports.fetchUrl = function(urlStr, callback){
+   _fetchUrl(urlStr, 0, callback);
+};
+
/*
 * Issues a GET request for `_urlStr`, following redirects.
 *
 * _urlStr     - absolute URL to fetch.
 * redirection - number of redirects already followed for this fetch.
 * _callback   - called exactly once per fetch:
 *     _callback(error)                                  on failure, where
 *         error is an errors.InvalidUrlError or
 *         errors.TooManyRedirectionError instance;
 *     _callback(isError, code, headers, content, finalUrl) once the body
 *         has been read; isError is true for status codes >= 400 and
 *         finalUrl is the URL finally fetched when at least one redirect
 *         was followed, otherwise null.
 *
 * NOTE(review): http.createClient is a legacy API; confirm it exists on the
 * Node.js version this module is meant to run on.
 */
function _fetchUrl(_urlStr, redirection, _callback){
   if( redirection >= MAX_REDIRECTION ){
      _callback(new errors.TooManyRedirectionError());
      return;   // do not issue the request after reporting the error
   }

   if( typeof _urlStr !== 'string' ){
      _callback(new errors.InvalidUrlError());
      return;
   }
   var target = url.parse(_urlStr);
   // url.parse may leave `host` null or undefined when no host is present.
   if( !target.host ){
      _callback(new errors.InvalidUrlError());
      return;
   }

   var secure = target.protocol == 'https:';
   var client = http.createClient(target.port || (secure ? 443 : 80),
                                  target.hostname, secure);
   // Keep the query string; pathname alone would silently drop it.
   var path = (target.pathname || '/') + (target.search || '');
   var request = client.request('GET', path,
                                {'host': target.hostname });
   request.end();
   request.on("response", function(response){
      var code = response.statusCode;
      var headers = response.headers;
      var location = headers.location;
      if(300 <= code && code < 400 && location){
         // Location may be relative; resolve it against the current URL.
         _fetchUrl(url.resolve(_urlStr, location), redirection + 1, _callback);
         return;
      }
      // Anything else, including a 3xx without a Location header, is
      // delivered to the caller as a normal response.
      var content = '';
      response.on('data', function(chunk){
         content += chunk;
      });
      response.on('end', function(){
         _callback(code >= 400,
                   code, headers, content,
                   redirection > 0 ? _urlStr : null);
      });
   });
}
diff --git a/package.json b/package.json
@@ -0,0 +1,19 @@
+{
+   "name" : "extractcontent"
+,  "version" : "0.1.0"
+,  "description" : "Utility for extracting title and main contents from an HTML text."
+,  "tags" : ["text-analysis", "node", "utility"]
+,  "author" : "Yohei Sasaki <yssk22@gmail.com>"
+,  "repository" : 
+   {  "type" : "git"
+   ,  "url" : "http://github.com/yssk22/extractcontent.git"
+   }
+, "bugs" : { "web" : "http://github.com/yssk22/extractcontent/issues" }
+, "licenses" :
+  [ { "type" : "MIT"
+    , "url" : "http://github.com/yssk22/extractcontent/raw/master/LICENSE"
+    }
+  ]
+, "main" : "./lib/extractcontent"
+
+}