Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Adding GGReadabilityParser, planning to switch from shelling out to Python scripts
  • Loading branch information...
commit 91543fb5dbc131e6dd9115c7cc3d50a6d88cf858 1 parent c79f6cc
@ttscoff authored
Showing with 620 additions and 0 deletions.
  1. +51 −0 GGReadabilityParser.h
  2. +569 −0 GGReadabilityParser.m
View
51 GGReadabilityParser.h
@@ -0,0 +1,51 @@
+/*
+ Copyright (c) 2012 Curtis Hard - GeekyGoodness
+*/
+#import <Foundation/Foundation.h>
+
+typedef void (^GGReadabilityParserCompletionHandler)( NSString * content ); // success callback: receives the extracted element serialized as a string; renderWithString: invokes it on the main queue
+typedef void (^GGReadabilityParserErrorHandler)( NSError * error ); // failure callback: invoked by errorOut on the main queue and directly by connection:didFailWithError:
+
+// Bit flags controlling the clean-up performed by -renderWithString:.
+// Combine with bitwise OR; each flag is tested with `options & flag`.
+enum {
+    // No clean-up at all. Must be 0: the previous value of -1 has every bit set,
+    // which made "None" switch every option on.
+    GGReadabilityParserOptionNone = 0,
+    GGReadabilityParserOptionRemoveHeader = 1 << 2,   // strip <h1>
+    GGReadabilityParserOptionRemoveHeaders = 1 << 3,  // strip <h2> through <h6>
+    GGReadabilityParserOptionRemoveEmbeds = 1 << 4,   // strip <embed> and <object>
+    GGReadabilityParserOptionRemoveIFrames = 1 << 5,  // strip <iframe>
+    GGReadabilityParserOptionRemoveDivs = 1 << 6,     // strip <div>
+    GGReadabilityParserOptionRemoveImages = 1 << 7,   // strip <img>
+    GGReadabilityParserOptionFixImages = 1 << 8,      // make root-relative <img src> absolute
+    GGReadabilityParserOptionFixLinks = 1 << 9,       // make root-relative <a href> absolute
+    GGReadabilityParserOptionClearStyles = 1 << 10,   // drop every style attribute
+    GGReadabilityParserOptionClearLinkLists = 1 << 11 // drop elements whose id/class looks like nav, comments, related links, ...
+};
+typedef NSInteger GGReadabilityParserOptions;
+
+@interface GGReadabilityParser : NSObject {
+ /* Downloads a URL (or takes a markup string), picks the element that most
+    likely holds the main content, cleans it according to the options, and
+    reports the result through the handler blocks.
+    loadProgress below backs the property of the same name. */
+ float loadProgress;
+
+@private
+ GGReadabilityParserErrorHandler errorHandler; // copied in init; called on failure
+ GGReadabilityParserCompletionHandler completionHandler; // copied in init; called with the extracted markup
+ GGReadabilityParserOptions options; // clean-up flags tested in renderWithString:
+ NSURL * URL; // retained in init; the address requested by render
+ long long dataLength; // expectedContentLength of the latest response
+ NSMutableData * responseData; // bytes accumulated by connection:didReceiveData:
+ NSURLConnection * URLConnection; // created by render; nil until then
+ NSURLResponse * URLResponse; // retained in connection:didReceiveResponse:; its URL is used to fix relative links
+
+}
+ /* Set to 0.1 by init, updated while data arrives, and set to 1.0 just
+    before the completion or error handler runs at the end of renderWithString:. */
+@property ( nonatomic, assign ) float loadProgress;
+ /* Stores the URL, options and copies of both handler blocks. Nothing is
+    downloaded until render is called. */
+- (id)initWithURL:(NSURL *)aURL
+ options:(GGReadabilityParserOptions)parserOptions
+completionHandler:(GGReadabilityParserCompletionHandler)cHandler
+ errorHandler:(GGReadabilityParserErrorHandler)eHandler;
+
+- (void)cancel; // cancels the URL connection created by render, if there is one
+- (void)render; // opens a connection to URL; the downloaded body is decoded and passed to renderWithString:
+- (void)renderWithString:(NSString *)string; // parses the markup off the main thread and reports via the handlers on the main queue
+
+@end
View
569 GGReadabilityParser.m
@@ -0,0 +1,569 @@
+/*
+ Copyright (c) 2012 Curtis Hard - GeekyGoodness
+*/
+
+#import "GGReadabilityParser.h"
+
+// Private API. A class extension (rather than a named category) makes the
+// compiler verify that the main @implementation really provides these methods.
+@interface GGReadabilityParser ()
+
+// Reports the generic "no suitable content" error to the error handler.
+- (void)errorOut;
+
+// Returns the element most likely to contain the main content, or nil.
+- (NSXMLElement *)findBaseLevelContent:(NSXMLElement *)element;
+
+// Heuristic score based on the element's tag name, class names and ids.
+- (NSInteger)scoreElement:(NSXMLElement *)element;
+
+@end
+
+@implementation GGReadabilityParser
+
+@synthesize loadProgress;
+
+// Manual reference counting teardown: give up ownership of every retained or
+// copied ivar and clear the pointer afterwards.
+- (void)dealloc
+{
+    [URL release];
+    URL = nil;
+
+    [URLResponse release];
+    URLResponse = nil;
+
+    [completionHandler release];
+    completionHandler = nil;
+
+    [errorHandler release];
+    errorHandler = nil;
+
+    [responseData release];
+    responseData = nil;
+
+    [URLConnection release];
+    URLConnection = nil;
+
+    [super dealloc];
+}
+
+// Creates a parser for aURL. The handler blocks are copied so they outlive the
+// caller's stack frame; no network activity starts until -render is called.
+- (id)initWithURL:(NSURL *)aURL
+          options:(GGReadabilityParserOptions)parserOptions
+completionHandler:(GGReadabilityParserCompletionHandler)cHandler
+     errorHandler:(GGReadabilityParserErrorHandler)eHandler
+{
+    self = [super init];
+    if( self == nil )
+    {
+        return nil;
+    }
+
+    URL               = [aURL retain];
+    options           = parserOptions;
+    completionHandler = [cHandler copy];
+    errorHandler      = [eHandler copy];
+    responseData      = [NSMutableData new];
+
+    // start slightly above zero so observers see that work has begun
+    [self setLoadProgress:0.1];
+
+    return self;
+}
+
+// Stops an in-flight download. Messaging nil is a no-op, so this is safe to
+// call before -render has created the connection.
+- (void)cancel
+{
+    [URLConnection cancel];
+}
+
+// Reports the generic "no suitable content" failure to the error handler on
+// the main queue. Does nothing when no error handler was supplied.
+- (void)errorOut
+{
+    // calling a nil block crashes, so bail out early when nobody is listening
+    if( errorHandler == nil )
+    {
+        return;
+    }
+
+    dispatch_async( dispatch_get_main_queue(), ^(void)
+    {
+        NSString * errorString = @"Readability was unable to find any suitable content.";
+        NSDictionary * info = [NSDictionary dictionaryWithObject:errorString
+                                                          forKey:NSLocalizedDescriptionKey];
+        NSError * error = [NSError errorWithDomain:@"com.geekygoodness.readability"
+                                              code:1
+                                          userInfo:info];
+        errorHandler( error );
+    });
+}
+
+// Starts downloading URL. The delegate callbacks below collect the body and
+// hand it to -renderWithString: once the load finishes.
+- (void)render
+{
+    // Calling render again must not leak the previous connection or mix its
+    // bytes into the new download.
+    [URLConnection cancel];
+    [URLConnection release];
+    [responseData setLength:0];
+
+    // initWithRequest:delegate: begins loading immediately, so no explicit
+    // -start is needed (the original issued a redundant second start).
+    URLConnection = [[NSURLConnection alloc] initWithRequest:[NSURLRequest requestWithURL:URL]
+                                                    delegate:self];
+}
+
+#pragma mark NSURLConnectionDelegate
+
+// The download failed: forward the connection's error to the error handler.
+- (void)connection:(NSURLConnection *)connection
+  didFailWithError:(NSError *)error
+{
+    // a nil block must not be invoked
+    if( errorHandler != nil )
+    {
+        errorHandler( error );
+    }
+}
+
+// Remembers the response (used later to resolve relative URLs) and its
+// expected length (used for progress reporting).
+- (void)connection:(NSURLConnection *)connection
+didReceiveResponse:(NSURLResponse *)response
+{
+    // This callback can arrive more than once for a single load (for example
+    // after a redirect), so release the previous response instead of leaking it.
+    if( response != URLResponse )
+    {
+        [URLResponse release];
+        URLResponse = [response retain];
+    }
+
+    dataLength = [response expectedContentLength];
+
+    // a new response invalidates whatever body bytes were collected so far
+    [responseData setLength:0];
+}
+
+// Appends the received chunk and advances loadProgress.
+// Progress runs from 0.1 (set in init) up to at most 0.85 while downloading;
+// -renderWithString: sets it to 1.0 when parsing is done.
+- (void)connection:(NSURLConnection *)connection
+    didReceiveData:(NSData *)data
+{
+    [responseData appendData:data];
+
+    // When the server did not announce a length (expectedContentLength is then
+    // not positive), estimate against a nominal 100000 bytes, which is the
+    // divisor the original formula used for this case.
+    float expected = ( dataLength > 0 ) ? (float)dataLength : 100000.0f;
+    float fraction = (float)[responseData length] / expected;
+    if( fraction > 1.0f )
+    {
+        fraction = 1.0f;
+    }
+
+    // map 0..1 of the download onto the 0.1..0.85 slice of the overall job
+    [self setLoadProgress:( 0.1f + fraction * 0.75f )];
+}
+
+// The body is complete: decode it to a string on a background queue and hand
+// it to -renderWithString:. Reports an error if no encoding can decode it.
+- (void)connectionDidFinishLoading:(NSURLConnection *)connection
+{
+    dispatch_async( dispatch_get_global_queue( DISPATCH_QUEUE_PRIORITY_DEFAULT, 0 ), ^(void)
+    {
+        NSString * str = nil;
+
+        // First choice: the charset the server declared, if it names a known encoding.
+        NSString * declaredCharset = [URLResponse textEncodingName];
+        if( declaredCharset != nil )
+        {
+            CFStringEncoding cfEncoding = CFStringConvertIANACharSetNameToEncoding( (CFStringRef)declaredCharset );
+            if( cfEncoding != kCFStringEncodingInvalidId )
+            {
+                NSStringEncoding declared = CFStringConvertEncodingToNSStringEncoding( cfEncoding );
+                str = [[[NSString alloc] initWithData:responseData
+                                             encoding:declared] autorelease];
+            }
+        }
+
+        // Otherwise fall back to the usual suspects, in the original order.
+        const NSStringEncoding fallbacks[] = {
+            NSUTF8StringEncoding,
+            NSMacOSRomanStringEncoding,
+            NSASCIIStringEncoding,
+            NSUTF16StringEncoding
+        };
+        const NSUInteger fallbackCount = sizeof( fallbacks ) / sizeof( fallbacks[0] );
+
+        NSUInteger i;
+        for( i = 0; str == nil && i < fallbackCount; i++ )
+        {
+            str = [[[NSString alloc] initWithData:responseData
+                                         encoding:fallbacks[i]] autorelease];
+        }
+
+        // nothing could decode the bytes
+        if( str == nil )
+        {
+            [self errorOut];
+            return;
+        }
+
+        [self renderWithString:str];
+    });
+}
+
+// Parses `string` as HTML (falling back to XML), locates the element that
+// holds the main content, cleans it up according to `options`, and delivers
+// the resulting markup to the completion handler on the main queue. Any
+// failure along the way is reported through -errorOut instead.
+// When called on the main thread the work is re-dispatched to a background
+// queue and the call returns immediately.
+- (void)renderWithString:(NSString *)string
+{
+    // parsing is expensive; never do it on the main thread
+    if( [NSThread isMainThread] )
+    {
+        dispatch_async( dispatch_get_global_queue( DISPATCH_QUEUE_PRIORITY_HIGH, 0 ), ^(void){
+            [self renderWithString:string];
+        });
+        return;
+    }
+
+    // nothing to parse
+    if( [string length] == 0 )
+    {
+        [self errorOut];
+        return;
+    }
+
+    NSError * error = nil; // required by the NSXML calls; outcomes are judged by the returned values
+
+    // Parse as tidied HTML first, then as tidied XML; keep the first <body> that has children.
+    const NSUInteger parseModes[] = { NSXMLDocumentTidyHTML, NSXMLDocumentTidyXML };
+    const NSUInteger parseModeCount = sizeof( parseModes ) / sizeof( parseModes[0] );
+    NSXMLElement * body = nil;
+    NSUInteger modeIndex;
+    for( modeIndex = 0; modeIndex < parseModeCount; modeIndex++ )
+    {
+        NSXMLDocument * document = [[[NSXMLDocument alloc] initWithXMLString:string
+                                                                      options:parseModes[modeIndex]
+                                                                        error:&error] autorelease];
+        NSXMLElement * candidate = [[document nodesForXPath:@"//body"
+                                                      error:&error] lastObject];
+        if( [candidate childCount] != 0 )
+        {
+            body = candidate;
+            break;
+        }
+    }
+
+    if( body == nil )
+    {
+        [self errorOut];
+        return;
+    }
+
+    // pick the element that most likely wraps the article
+    NSXMLElement * element = [self findBaseLevelContent:body];
+    if( element == nil )
+    {
+        [self errorOut];
+        return;
+    }
+
+    // ---- tag removal -----------------------------------------------------
+    NSMutableArray * tagsToRemove = [NSMutableArray array];
+    if( options & GGReadabilityParserOptionRemoveDivs )
+    {
+        [tagsToRemove addObject:@"div"];
+    }
+    if( options & GGReadabilityParserOptionRemoveEmbeds )
+    {
+        [tagsToRemove addObject:@"embed"];
+        [tagsToRemove addObject:@"object"];
+    }
+    if( options & GGReadabilityParserOptionRemoveIFrames )
+    {
+        [tagsToRemove addObject:@"iframe"];
+    }
+    if( options & GGReadabilityParserOptionRemoveImages )
+    {
+        [tagsToRemove addObject:@"img"];
+    }
+    if( options & GGReadabilityParserOptionRemoveHeader )
+    {
+        [tagsToRemove addObject:@"h1"];
+    }
+    if( options & GGReadabilityParserOptionRemoveHeaders )
+    {
+        // literal list avoids formatting an NSInteger with %ld (wrong width on 32-bit)
+        [tagsToRemove addObjectsFromArray:[NSArray arrayWithObjects:@"h2",@"h3",@"h4",@"h5",@"h6", nil]];
+    }
+
+    // ".//" keeps every query inside the chosen element; a leading "//" is an
+    // absolute path and would walk (and modify) the whole document.
+    for( NSString * tagName in tagsToRemove )
+    {
+        NSArray * doomed = [element nodesForXPath:[NSString stringWithFormat:@".//%@",tagName]
+                                            error:&error];
+        for( NSXMLNode * node in doomed )
+        {
+            [node detach];
+        }
+    }
+
+    // ---- inline styles ---------------------------------------------------
+    if( options & GGReadabilityParserOptionClearStyles )
+    {
+        NSArray * styled = [element nodesForXPath:@".//*[@style]"
+                                            error:&error];
+        for( NSXMLElement * styledElement in styled )
+        {
+            [styledElement removeAttributeForName:@"style"];
+        }
+    }
+
+    // ---- link lists, comment blocks and similar page furniture ------------
+    if( options & GGReadabilityParserOptionClearLinkLists )
+    {
+        NSArray * lookFor = [NSArray arrayWithObjects:@"similar",@"bookmark",@"links",@"social",@"nav",@"comments",@"comment",@"date",@"author",@"time",@"cat",@"related", nil];
+        NSArray * descendants = [element nodesForXPath:@".//*"
+                                                 error:&error];
+        for( NSXMLElement * descendant in descendants )
+        {
+            // None of the keywords contains a space, so searching the whole
+            // attribute value is equivalent to searching each space-separated token.
+            NSString * idValue = [[descendant attributeForName:@"id"] stringValue];
+            NSString * classValue = [[descendant attributeForName:@"class"] stringValue];
+
+            BOOL killElement = NO;
+            for( NSString * matchAgainst in lookFor )
+            {
+                if( ( idValue != nil && [idValue rangeOfString:matchAgainst].location != NSNotFound ) ||
+                    ( classValue != nil && [classValue rangeOfString:matchAgainst].location != NSNotFound ) )
+                {
+                    killElement = YES;
+                    break;
+                }
+            }
+
+            if( killElement )
+            {
+                [descendant detach];
+            }
+        }
+    }
+
+    // ---- make root-relative image and link URLs absolute -------------------
+    NSMutableArray * fixups = [NSMutableArray array];
+    if( options & GGReadabilityParserOptionFixImages )
+    {
+        [fixups addObject:[NSDictionary dictionaryWithObjectsAndKeys:@"img",@"tagName",@"src",@"attributeName",nil]];
+    }
+    if( options & GGReadabilityParserOptionFixLinks )
+    {
+        [fixups addObject:[NSDictionary dictionaryWithObjectsAndKeys:@"a",@"tagName",@"href",@"attributeName",nil]];
+    }
+
+    // Prefer the final response URL; fall back to the URL given at init so that
+    // callers who feed in a string directly do not get "(null)://(null)" prefixes.
+    NSURL * sourceURL = ( [URLResponse URL] != nil ) ? [URLResponse URL] : URL;
+    NSString * scheme = [sourceURL scheme];
+    NSString * host = [sourceURL host];
+
+    if( [fixups count] != 0 && [scheme length] != 0 && [host length] != 0 )
+    {
+        NSString * baseURL = [NSString stringWithFormat:@"%@://%@",scheme,host];
+        if( [sourceURL port] != nil )
+        {
+            // keep a non-default port, otherwise the rewritten URLs point at the wrong server
+            baseURL = [baseURL stringByAppendingFormat:@":%@",[sourceURL port]];
+        }
+
+        for( NSDictionary * fixup in fixups )
+        {
+            NSArray * targets = [element nodesForXPath:[NSString stringWithFormat:@".//%@",[fixup objectForKey:@"tagName"]]
+                                                 error:&error];
+            for( NSXMLElement * target in targets )
+            {
+                NSXMLNode * attribute = [target attributeForName:[fixup objectForKey:@"attributeName"]];
+                NSString * value = [attribute stringValue];
+
+                if( [value hasPrefix:@"//"] )
+                {
+                    // protocol-relative URL: it already names a host, only the scheme is missing
+                    [attribute setStringValue:[NSString stringWithFormat:@"%@:%@",scheme,value]];
+                }
+                else if( [value hasPrefix:@"/"] )
+                {
+                    // root-relative path: prepend scheme://host[:port]
+                    [attribute setStringValue:[NSString stringWithFormat:@"%@%@",baseURL,value]];
+                }
+            }
+        }
+    }
+
+    // ---- deliver -----------------------------------------------------------
+    NSString * returnContents = [element XMLString];
+
+    dispatch_async( dispatch_get_main_queue(), ^(void)
+    {
+        [self setLoadProgress:1.0];
+        if( [returnContents length] == 0 )
+        {
+            [self errorOut];
+            return;
+        }
+        // a nil block must not be invoked
+        if( completionHandler != nil )
+        {
+            completionHandler( returnContents );
+        }
+    });
+}
+
+// Returns the descendant of `element` that most likely wraps the main content,
+// or nil when nothing plausible is found. Strategies, in order:
+//   1. an element whose class or id contains a known "instant win" name;
+//   2. the parent holding the most direct <p> children;
+//   3. for <br>-formatted pages, the parent with the best <br> count plus keyword score;
+//   4. the element with the highest keyword score from -scoreElement:.
+// Side effect: <noscript>, <script> and <form> elements (and, in the <br>
+// case, doubled <br> tags) are removed from the tree.
+- (NSXMLElement *)findBaseLevelContent:(NSXMLElement *)element
+{
+    NSError * error = nil; // required by nodesForXPath:; an empty result is treated like a failure
+
+    // All queries use ".//" so they stay inside the element they are sent to;
+    // a leading "//" is an absolute path that searches the whole document.
+
+    // strip elements that never carry article text
+    NSArray * junkTags = [NSArray arrayWithObjects:@"noscript",@"script",@"form", nil];
+    for( NSString * junkTag in junkTags )
+    {
+        NSArray * junkNodes = [element nodesForXPath:[NSString stringWithFormat:@".//%@",junkTag]
+                                               error:&error];
+        for( NSXMLNode * junkNode in junkNodes )
+        {
+            [junkNode detach];
+        }
+    }
+
+    // 1. instant wins: prefer the match that contains the most paragraphs
+    NSArray * instantWins = [NSArray arrayWithObjects:@"article-body", nil];
+    NSInteger bestParagraphCount = 0;
+    NSXMLElement * foundElement = nil;
+
+    for( NSString * instantWinName in instantWins )
+    {
+        NSString * query = [NSString stringWithFormat:@".//*[contains(@class,'%@') or contains(@id,'%@')]", instantWinName, instantWinName];
+        NSArray * winners = [element nodesForXPath:query
+                                             error:&error];
+        for( NSXMLElement * winElement in winners )
+        {
+            // count paragraphs inside this candidate only, so candidates can be told apart
+            NSInteger count = [[winElement nodesForXPath:@".//p"
+                                                   error:&error] count];
+            if( count > bestParagraphCount )
+            {
+                bestParagraphCount = count;
+                foundElement = winElement;
+            }
+        }
+    }
+
+    if( foundElement != nil )
+    {
+        return foundElement;
+    }
+
+    // 2. the parent with the most direct <p> children
+    NSInteger currentCount = 0;
+    NSXMLElement * tagParent = nil;
+    NSXMLElement * previousParent = nil;
+
+    NSArray * paragraphs = [element nodesForXPath:@".//p"
+                                            error:&error];
+    for( NSXMLElement * paragraph in paragraphs )
+    {
+        NSXMLElement * parent = (NSXMLElement *)[paragraph parent];
+
+        // sibling paragraphs share a parent; its count was just evaluated
+        if( parent == previousParent )
+        {
+            continue;
+        }
+        previousParent = parent;
+
+        NSInteger parentTagsCount = [[parent nodesForXPath:@"p"
+                                                     error:&error] count];
+        if( parentTagsCount > currentCount )
+        {
+            currentCount = parentTagsCount;
+            tagParent = parent;
+        }
+    }
+
+    // 3. old-school pages that separate paragraphs with <br>
+    BOOL usingBR = NO;
+    if( tagParent == nil )
+    {
+        currentCount = 0;
+        usingBR = YES;
+        previousParent = nil;
+
+        NSArray * lineBreaks = [element nodesForXPath:@".//br"
+                                                error:&error];
+        for( NSXMLElement * lineBreak in lineBreaks )
+        {
+            NSXMLElement * parent = (NSXMLElement *)[lineBreak parent];
+            if( parent == previousParent )
+            {
+                continue;
+            }
+            previousParent = parent;
+
+            // direct <br> children plus the keyword score of the parent
+            NSInteger parentTagsCount = [[parent nodesForXPath:@"br"
+                                                         error:&error] count];
+            parentTagsCount += [self scoreElement:parent];
+            if( parentTagsCount > currentCount )
+            {
+                currentCount = parentTagsCount;
+                tagParent = parent;
+            }
+        }
+    }
+
+    if( usingBR && tagParent != nil )
+    {
+        NSInteger textChildren = 0;
+        NSInteger brChildren = 0;
+        for( NSXMLNode * child in [tagParent children] )
+        {
+            if( [child kind] == NSXMLTextKind )
+            {
+                textChildren++;
+            }
+            else if( [[[child name] lowercaseString] isEqualToString:@"br"] )
+            {
+                brChildren++;
+            }
+        }
+
+        // too little text per line break: this is spacing, not content
+        if( textChildren < ( brChildren / 2 ) )
+        {
+            tagParent = nil;
+        }
+        else
+        {
+            // Collapse runs of <br>: remove only those whose immediately preceding
+            // sibling node is also a <br>. The original predicate matched every <br>
+            // that had any earlier <br> sibling, which removed all but the first.
+            NSArray * doubledBreaks = [tagParent nodesForXPath:@".//br[preceding-sibling::node()[1][self::br]]"
+                                                         error:&error];
+            for( NSXMLNode * doubledBreak in doubledBreaks )
+            {
+                [doubledBreak detach];
+            }
+        }
+    }
+
+    // 4. last resort: the element whose name, class or id scores highest.
+    // (The original built a score table but never selected from it, so this
+    // step always produced nil.)
+    if( tagParent == nil )
+    {
+        NSArray * candidates = [element nodesForXPath:@".//*"
+                                                error:&error];
+        NSInteger bestScore = 0;
+        for( NSXMLElement * candidate in candidates )
+        {
+            NSInteger score = [self scoreElement:candidate];
+            if( score > bestScore )
+            {
+                bestScore = score;
+                tagParent = candidate;
+            }
+        }
+    }
+
+    return tagParent;
+}
+
+// Scores how likely `element` is to house the main content, based on keywords
+// that tend to appear in content wrappers:
+//   +150 when the tag name equals a keyword,
+//   +20 for each class token containing a keyword,
+//   +30 for each id token containing a keyword.
+// Returns 0 when nothing matches.
+- (NSInteger)scoreElement:(NSXMLElement *)element
+{
+    NSArray * positiveWords = [NSArray arrayWithObjects:@"post",@"entry",@"content",@"text",@"article",@"story",@"blog", nil];
+
+    // These depend only on the element, so compute them once rather than once per keyword.
+    // A missing attribute yields nil, which makes the loops below no-ops.
+    NSString * tagName = [[element name] lowercaseString];
+    NSArray * classNames = [[[element attributeForName:@"class"] stringValue] componentsSeparatedByString:@" "];
+    NSArray * idNames = [[[element attributeForName:@"id"] stringValue] componentsSeparatedByString:@" "];
+
+    NSInteger score = 0;
+    for( NSString * positiveWord in positiveWords )
+    {
+        if( [tagName isEqualToString:positiveWord] )
+        {
+            score += 150;
+        }
+
+        for( NSString * className in classNames )
+        {
+            if( [className rangeOfString:positiveWord].length != 0 )
+            {
+                score += 20;
+            }
+        }
+
+        for( NSString * idName in idNames )
+        {
+            if( [idName rangeOfString:positiveWord].length != 0 )
+            {
+                score += 30;
+            }
+        }
+    }
+    return score;
+}
+
+@end
Please sign in to comment.
Something went wrong with that request. Please try again.