Skip to content
This repository has been archived by the owner on Sep 28, 2018. It is now read-only.

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
yssk22 committed Aug 14, 2010
0 parents commit 677b1de
Show file tree
Hide file tree
Showing 13 changed files with 2,488 additions and 0 deletions.
5 changes: 5 additions & 0 deletions History.md
@@ -0,0 +1,5 @@

0.0.1 / YYYY-MM-DD
------------------

* Initial release
22 changes: 22 additions & 0 deletions LICENSE
@@ -0,0 +1,22 @@
(The MIT License)

Copyright (c) 2010 Yohei Sasaki <yssk22@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 changes: 23 additions & 0 deletions Readme.md
@@ -0,0 +1,23 @@

# extractcontent

This module extracts the title and main content from HTML text.

The algorithm is ported from the original Ruby implementation:

* http://rubyforge.org/projects/extractcontent/
* http://labs.cybozu.co.jp/blog/nakatani/2007/09/web_1.html

## Usage

var ex = require('extractcontent')
ex.extractFromUrl('http://yssk22.blogspot.com/', function(error, result){
console.log(result.title);
// -> Relaxed in Japan.
console.log(result.content);
// -> last week ...
});

## Install

npm install extractcontent
26 changes: 26 additions & 0 deletions lib/extractcontent.js
@@ -0,0 +1,26 @@
var path = require('path');
var extractor = require(path.join(__dirname, 'extractor')),
fetcher = require(path.join(__dirname, 'fetcher'));

exports.extractFromUrl = function(urlStr, callback , options){
fetcher.fetchUrl(urlStr, function(error, code, header, content){
if(error){
callback(error, null);
}else{
_extract(content, callback, options);
}
});
};

exports.extractFromText = function(text, callback, options){
_extract(content, callback, options);
};

/**
 * Runs the extractor on `content` and reports the outcome through `callback`.
 *
 * @param {string} content - HTML source to analyse.
 * @param {function} callback - called exactly once: callback(false, result)
 *   on success, callback(exception, null) when the extractor throws.
 * @param {Object} [options] - extractor options.
 */
function _extract(content, callback, options){
  var result;
  try{
    result = extractor.extract(content, options);
  }catch(e){
    callback(e, null);
    return;
  }
  // Invoked outside the try block so that an exception thrown by the
  // callback itself is not caught here and answered with a second call.
  callback(false, result);
}
195 changes: 195 additions & 0 deletions lib/extractor.js
@@ -0,0 +1,195 @@
/*
* ported from http://rubyforge.org/projects/extractcontent/
*/

/*
 * Default tuning parameters; any key can be overridden through the options
 * argument of exports.extract.
 */
var DEFAULT_OPTS = {
  // a block is accepted as body text when its score exceeds this value
  threshold : 100,
  // blocks whose link-free text is shorter than this are skipped
  min_length : 80,
  // the position factor is multiplied by this after every scored block
  decay_factor : 0.73,
  // continuity bonus: reset to this after an accepted block, divided by it
  // for each following block
  continuous_factor : 1.62,
  // score added per punctuation match
  punctuation_weight : 10,
  // global, so that String#match reports every occurrence rather than the
  // capture groups of the first one
  punctuations : /([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/ig,
  // occurrences of these phrases lower a block's score
  waste_expressions : /Copyright|All Rights Reserved/ig,
  debug : false
};

// Pages matching this (framesets, meta-refresh redirects) are not analysed.
var SKIP_ANALYSIS = /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i;

// Sections marked google_ad_section_start(weight=ignore). [\s\S] is used
// because `.` does not match line breaks in JavaScript and the `m` flag only
// changes the meaning of ^ and $.
var GOOGLE_ADSENSE_IGNORE = /<!--\s*google_ad_section_start\(weight=ignore\)\s*-->[\s\S]*?<!--\s*google_ad_section_end.*?-->/g;

exports.extract = function(html, options){
if(html.match(SKIP_ANALYSIS)){
return {'title': extract_title(html), 'content':''};
}

options = options || {};
for(k in DEFAULT_OPTS){
if(options[k] == undefined){
options[k] = DEFAULT_OPTS[k];
}
}
var title = undefined;
var content = undefined;
if(html.match(/<\/head\s*>/im)){
title = extract_title(RegExp.leftContext);
if( title != ''){
html = RegExp.rightContext;
}else{
title = extract_title(html);
}
}else{
title = extract_title(html);
}

html = html.replace(GOOGLE_ADSENSE_IGNORE, '');
if(html.match(/<!--\s*google_ad_section_start[^>]*-->/)){
//html.match(/<!--\s*google_ad_section_start[^>]*-->[\s\S]*<!--\s*google_ad_section_end.*?-->/img);
html = scan(html, /<!--\s*google_ad_section_start[^>]*-->[\s\S]*<!--\s*google_ad_section_end.*?-->/img).join("\n");
}
html = eliminate_useless_tags(html);

html = html.replace(/(<h\d\s*>\s*(.*?)\s*<\/h\d\s*>)/ig, function(m){
if( RegExp.$2.length >= 3 && title.indexOf(RegExp.$2) >= 0){
return "<div>" + RegExp.$2 + "</div>";
}else{
return RegExp.$2;
}
});

var factor = 1.0, continuous = 1.0,
body = '', score = 0, bodylist = [];
var list = html.split(/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/);
for(var i in list){
var block = list[i];
if( !block ){ continue; }

block = strip(block);
if( has_only_tags(block) ){ continue; }

if( body.length > 0 ){
continuous = continuous / options['continuous_factor'];
}
var notlinked = eliminate_link(block);
if( notlinked.length < options['min_length']){
continue;
}

var c = (notlinked.length + scan(notlinked, options.punctuations).length * options.punctuation_weight) * factor;
factor = factor * options.decay_factor;
var not_body_rate =
scan(block, options.waste_expressions).length +
scan(block, /amazon[a-z0-9\.\/\-\?&]+-22/ig).length / 2.0;

if(not_body_rate > 0){
c = c + Math.pow(0.72, not_body_rate);
}
var c1 = c * continuous;
if( c1 > options['threshold']){
body = body + block + '\n';
score = score + c1;
continuous = options.continuous_factor;
}else if(c > options['threshold']){
bodylist.push([body, score]);
body = body + block + '\n';
score = c;
continuous = options.continuous_factor;
}
bodylist.push(body, score);
}
var s = 0;

bodylist.forEach(function(a){
if(s < a[1]){
content = a[0];
s = a[1];
}
});

return {
'title' : title,
'content' : strip(strip_tags(content))
};
};

/**
 * Returns the document title.
 *
 * The text of the first <title> element is used when there is one; otherwise
 * the tag-stripped text of the first heading (<h1>..<h9>); otherwise ''.
 *
 * @param {string} st - HTML source (or a fragment of it).
 * @returns {string}
 */
function extract_title(st){
  // \s* (not a single \s) before the closing tag, so a title without
  // trailing whitespace is recognised too.
  var titleMatch = st.match(/<title[^>]*>\s*(.*?)\s*<\/title\s*>/i);
  if( titleMatch ){
    return titleMatch[1];
  }
  var headingMatch = st.match(/<h\d[^>]*>\s*(.*?)\s*<\/h\d\s*>/i);
  if( headingMatch ){
    return strip_tags(headingMatch[1]);
  }
  return '';
}

/**
 * Removes markup that never contributes to the main content: script, style,
 * select and noscript elements, comments, <!...> declarations, alpslab-slide
 * divs, and the opening tag of divs whose id/class contains "more".
 *
 * @param {string} html
 * @returns {string} the cleaned markup
 */
function eliminate_useless_tags(html){
  // Strings are immutable, so every replace result has to be assigned back.
  // [\s\S] lets the patterns span line breaks (`.` does not, and the `m`
  // flag only affects ^ and $).
  html = html.replace(/<(script|style|select|noscript)[^>]*>[\s\S]*?<\/\1\s*>/ig, '');
  html = html.replace(/<!--[\s\S]*?-->/g, '');
  html = html.replace(/<![A-Za-z].*?>/g, '');
  html = html.replace(/<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>[\s\S]*?<\/div\s*>/ig, '');
  html = html.replace(/<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/ig, '');
  return html;
}

/**
 * Returns the plain text of a block with anchors, forms and all remaining
 * tags removed.
 *
 * @param {string} html - one block of markup.
 * @returns {string} the link-free text, or '' when the block has less than
 *   20 characters of such text per anchor or islinklist() reports it as a
 *   link list.
 */
function eliminate_link(html){
  var count = 0;
  // [\s\S] so that anchors and forms spanning several lines are removed too.
  var notlinked = html.replace(/<a\s[^>]*>[\s\S]*?<\/a\s*>/ig, function(){
    count = count + 1;
    return '';
  });
  notlinked = notlinked.replace(/<form\s[^>]*>[\s\S]*?<\/form\s*>/ig, '');
  notlinked = strip_tags(notlinked);
  if( notlinked.length < 20 * count || islinklist(html) ){
    return '';
  }
  return notlinked;
}

/**
 * Decides whether a block is essentially a list of links.
 *
 * The first ul/dl/ol list is split into items and rated by evaluate_list();
 * the block counts as a link list when the text outside ul/dl lists is no
 * longer than st.length / (45 / rate).
 *
 * @param {string} st - one block of markup.
 * @returns {boolean} false when the block contains no list.
 */
function islinklist(st){
  // First list only; the group is read from the match result rather than the
  // RegExp.$1 static. [\s\S] lets a list span several lines.
  var listMatch = /<(?:ul|dl|ol)([\s\S]+?)<\/(?:ul|dl|ol)>/i.exec(st);
  if( !listMatch ){
    return false;
  }
  var listpart = listMatch[1];
  var outside = st.replace(/<(?:ul|dl)([\s\S]+?)<\/(?:ul|dl)>/ig, '')
    .replace(/<[\s\S]+?>/g, '')
    .replace(/\s+/g, ' ');
  var list = listpart.split(/<li[^>]*>/);
  list.shift(); // drop whatever precedes the first <li>
  var rate = evaluate_list(list);
  return outside.length <= st.length / (45 / rate);
}

/**
 * Rates how link-heavy a list is.
 *
 * @param {string[]} list - markup of the individual list items.
 * @returns {number} 9 * r^2 + 1, where r is the fraction of items containing
 *   an <a href=...>; ranges from 1 (no links) to 10 (every item is a link).
 *   An empty list rates 1.
 */
function evaluate_list(list){
  if( list.length === 0 ){
    // Numeric, like every other return value of this function.
    return 1;
  }
  var hit = 0;
  list.forEach(function(line){
    if( /<a\s+href=(['"]?)([^"'\s]+)\1/i.test(line) ){
      hit = hit + 1;
    }
  });
  var ratio = hit / list.length;
  return 9 * (ratio * ratio) + 1;
}

/**
 * Tells whether a block has no text of its own, i.e. nothing is left once
 * tags, &nbsp; entities and surrounding whitespace are removed.
 *
 * @param {string} st - one block of markup.
 * @returns {boolean}
 */
function has_only_tags(st){
  var withoutTags = st.replace(/<[^>]*>/img, '');
  var withoutNbsp = withoutTags.replace(/\&nbsp\;/img, '');
  var remaining = strip(withoutNbsp);
  return remaining.length === 0;
}

/**
 * Removes leading and trailing whitespace.
 *
 * @param {string} st
 * @returns {string}
 */
function strip(st){
  // The built-in trim gives the same result as the lazy
  // /^\s*([\s\S]*?)\s*$/ pattern without its backtracking cost on long
  // inputs.
  return st.trim();
}

/**
 * Returns every match of `regex` in `st`, in the manner of Ruby's
 * String#scan.
 *
 * @param {string} st - text to search.
 * @param {RegExp|string} regex - pattern; a string is compiled as a regular
 *   expression source.
 * @returns {string[]} all matched substrings, [] when there is none.
 */
function scan(st, regex){
  var pattern = regex instanceof RegExp ? regex : new RegExp(regex);
  if( !pattern.global ){
    // Without the g flag String#match returns the first match followed by
    // its capture groups, which would corrupt occurrence counts. Search with
    // a global copy and leave the caller's regex untouched.
    var flags = typeof pattern.flags === 'string'
      ? pattern.flags
      : (pattern.ignoreCase ? 'i' : '') + (pattern.multiline ? 'm' : '');
    pattern = new RegExp(pattern.source, flags + 'g');
  }
  return st.match(pattern) || [];
}

/**
 * Removes every HTML tag from the given value.
 *
 * @param {*} html - markup; any truthy value is converted with toString().
 * @returns {string} the text without tags, or '' for a falsy input.
 */
function strip_tags(html){
  if( !html ){
    return '';
  }
  return html.toString().replace(/<\/?[^>]+>/img, '');
}
49 changes: 49 additions & 0 deletions lib/fetcher.js
@@ -0,0 +1,49 @@
var http = require('http'),
url = require('url');

// Redirect count at which a fetch gives up.
var MAX_REDIRECTION = 5;

/*
 * Builds an Error subclass so that failures reported by this module carry a
 * name, a message and a stack trace, and satisfy `instanceof Error`.
 */
function define_error(name, defaultMessage){
  function FetchError(message){
    this.name = name;
    this.message = message || defaultMessage;
    if( Error.captureStackTrace ){
      Error.captureStackTrace(this, FetchError);
    }
  }
  FetchError.prototype = Object.create(Error.prototype, {
    constructor : { value : FetchError, writable : true, configurable : true }
  });
  return FetchError;
}

// Error types passed to the fetch callback.
var errors = {
  InvalidUrlError : define_error('InvalidUrlError', 'invalid URL'),
  TooManyRedirectionError : define_error('TooManyRedirectionError', 'too many redirections')
};

exports.errors = errors;
exports.fetchUrl = function(urlStr, callback){
_fetchUrl(urlStr, 0, callback);
};

/**
 * Issues a GET request for `_urlStr`, following redirects.
 *
 * @param {string} _urlStr - absolute http: or https: URL.
 * @param {number} redirection - number of redirects followed so far.
 * @param {function} _callback - called once, with
 *   (error, statusCode, headers, body, finalUrl):
 *   - error is a TooManyRedirectionError, an InvalidUrlError or a request
 *     error object when no response was obtained (the other arguments are
 *     then absent); otherwise it is the boolean `statusCode >= 400`;
 *   - body is the response text decoded as UTF-8;
 *   - finalUrl is the URL actually fetched when at least one redirect was
 *     followed, null otherwise.
 */
function _fetchUrl(_urlStr, redirection, _callback){
  if( redirection >= MAX_REDIRECTION ){
    _callback(new errors.TooManyRedirectionError());
    return;
  }

  var target = typeof _urlStr === 'string' ? url.parse(_urlStr) : null;
  if( !target || !target.host ){
    _callback(new errors.InvalidUrlError());
    return;
  }

  // A request error can arrive after the response started; make sure the
  // callback is only ever invoked once.
  var finished = false;
  function finish(){
    if( finished ){ return; }
    finished = true;
    _callback.apply(null, arguments);
  }

  var secure = target.protocol === 'https:';
  var transport = secure ? require('https') : http;
  var request = transport.request({
    method : 'GET',
    hostname : target.hostname,
    port : target.port || (secure ? 443 : 80),
    // keep the query string; default to the root path
    path : (target.pathname || '/') + (target.search || '')
  });

  request.on('error', function(e){
    finish(e);
  });

  request.on('response', function(response){
    var code = response.statusCode;
    var headers = response.headers;
    var location = headers.location;

    // A 3xx response without a Location header is handled like any other
    // response.
    if( 300 <= code && code < 400 && location ){
      response.resume(); // discard the redirect body
      finished = true;   // the callback now belongs to the follow-up request
      // Location may be relative to the current URL.
      _fetchUrl(url.resolve(_urlStr, location), redirection + 1, _callback);
      return;
    }

    var content = '';
    // Decode in the stream so multi-byte characters split across chunks are
    // reassembled correctly.
    response.setEncoding('utf8');
    response.on('data', function(chunk){
      content += chunk;
    });
    response.on('end', function(){
      finish(code >= 400,
             code, headers, content,
             redirection > 0 ? _urlStr : null);
    });
  });

  request.end();
}
19 changes: 19 additions & 0 deletions package.json
@@ -0,0 +1,19 @@
{
"name" : "extractcontent"
, "version" : "0.1.0"
, "description" : "Utility for extracting title and main contents from an HTML text."
, "tags" : ["text-analysis", "node", "utility"]
, "author" : "Yohei Sasaki <yssk22@gmail.com>"
, "repository" :
{ "type" : "git"
, "url" : "http://github.com/yssk22/extractcontent.git"
}
, "bugs" : { "web" : "http://github.com/yssk22/extractcontent/issues" }
, "licenses" :
[ { "type" : "MIT"
, "url" : "http://github.com/yssk22/extractcontent/raw/master/LICENSE"
}
]
, "main" : "./lib/extractcontent"

}

0 comments on commit 677b1de

Please sign in to comment.