In [1]:
import nltk
import urllib
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import re
from datetime import datetime, date, time

In [2]:
class Blog(object):
    """Scrape a single blog article and compute simple text statistics on it.

    Every accessor goes through ``parse()``, which downloads ``self.url``
    again on each call; nothing is cached on the instance. The element and
    class names used to locate the title, author, date and body are specific
    to the page layout this class was written against.
    """

    # English stop words in lower, Title and UPPER case, so that filtering
    # also works on tokens that have not been lower-cased (see word_type).
    _BASE_STOPWORDS = stopwords.words('english')
    STOPWORDS = (_BASE_STOPWORDS
                 + [w.title() for w in _BASE_STOPWORDS]
                 + [w.upper() for w in _BASE_STOPWORDS])
    # Image source of the call-to-action banner; paragraphs, images and links
    # that carry it are excluded from the counts.
    ADS_SRC = "https://no-cache.hubspot.com/cta/default/3858309/d5cac7dd-b452-4aaa-b9c6-ded225e29405.png"

    # A token counts as a word if it contains an ASCII letter anywhere or
    # starts with a digit; pure punctuation tokens are dropped.
    _WORD_TOKEN_PATTERN = '^(?=.*[a-zA-Z]|[0-9])'

    # Penn Treebank tags folded into the four broad categories we report.
    _POS_GROUPS = {
        'VB': 'verb', 'VBD': 'verb', 'VBG': 'verb',
        'VBN': 'verb', 'VBP': 'verb', 'VBZ': 'verb',
        'RB': 'adv', 'RBR': 'adv', 'RBS': 'adv',
        'JJ': 'adj', 'JJR': 'adj', 'JJS': 'adj',
        'NN': 'noun', 'NNP': 'noun', 'NNPS': 'noun', 'NNS': 'noun',
    }

    def __init__(self, url):
        """Store the article URL; nothing is downloaded here."""
        self.url = url

    # ------------------------------------------------------------------
    # Fetching and page structure
    # ------------------------------------------------------------------

    def parse(self):
        """Download ``self.url`` and return it as a BeautifulSoup object.

        The page is fetched on every call. The response is closed once its
        body has been read.
        """
        # urlopen moved between Python 2 and 3; resolve it locally so the
        # rest of the file keeps working under either interpreter.
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        response = urlopen(self.url)
        try:
            html = response.read()
        finally:
            response.close()
        return BeautifulSoup(html, 'html.parser')

    def _post_body(self):
        """Return the ``<div class="section post-body">`` element (None if absent)."""
        return self.parse().find("div", {"class": "section post-body"})

    @staticmethod
    def _wraps_ad_image(tag, ads_src):
        """Return True if the first ``<img>`` inside ``tag`` has ``src == ads_src``."""
        img = tag.find('img')
        # .get() rather than ['src'] so an <img> without a src attribute is
        # treated as "not the ad" instead of raising KeyError.
        return img is not None and img.get('src') == ads_src

    def display_html(self):
        """Print the parsed page as indented HTML."""
        # Single-argument call form behaves the same on Python 2 and 3.
        print(self.parse().prettify())

    def title(self):
        """Return the text of the first ``<h1>`` element.

        Raises AttributeError if the page has no ``<h1>``.
        """
        return self.parse().find('h1').get_text()

    def sub_title(self):
        """Return a list with the text of every ``<h2>`` element."""
        return [item.get_text() for item in self.parse().find_all('h2')]

    def num_sub_title(self):
        """Return the number of ``<h2>`` sub-titles."""
        return len(self.sub_title())

    def author(self):
        """Return the text of the first ``<a>`` element with class ``author``.

        Raises AttributeError if no such element exists.
        """
        return self.parse().find("a", {"class" : "author"}).get_text()

    @staticmethod
    def date_format(d):
        """Convert the raw post-date text into a ``datetime.date``.

        The first 10 characters are discarded (a fixed-width prefix in the
        page text; confirm against the live markup). Of what remains, the
        first three characters are kept as the month abbreviation and all
        letters are stripped from the rest (e.g. the tail of a spelled-out
        month name or an ordinal suffix), leaving ``"<Mon> <day>, <year>"``.

        Raises ValueError if the result does not match ``"%b %d, %Y"``.
        """
        d = d[10:]
        d = d[:3] + ' ' + re.sub('[a-zA-Z]', '', d[4:])
        return datetime.strptime(d, "%b %d, %Y").date()

    def post_date(self):
        """Return the article's post date as a ``datetime.date``.

        Reads the first ``<span class="authordate">`` element and passes its
        text to ``date_format``.
        """
        return Blog.date_format(self.parse().find("span", {"class" : "authordate"}).get_text())

    def break_into_paragraph(self, ads_src = ADS_SRC):
        """Return the text of each ``<p>`` in the post body, skipping the ad.

        A paragraph is dropped only when its first ``<a>`` contains an
        ``<img>`` whose ``src`` equals ``ads_src``.
        """
        kept = []
        for paragraph in self._post_body().find_all('p'):
            anchor = paragraph.find('a')
            if anchor is None or not Blog._wraps_ad_image(anchor, ads_src):
                kept.append(paragraph.get_text())
        return kept

    def num_paragraph(self):
        """Return the number of non-ad paragraphs in the post body."""
        return len(self.break_into_paragraph())

    def blog_body(self):
        """Return the article body as one string, paragraphs joined by newlines."""
        return '\n'.join(self.break_into_paragraph())

    # ------------------------------------------------------------------
    # Word counts
    # ------------------------------------------------------------------

    @staticmethod
    def _word_tokens(corpus):
        """Tokenize ``corpus`` and keep only tokens matching ``_WORD_TOKEN_PATTERN``."""
        return [t for t in nltk.word_tokenize(corpus)
                if re.match(Blog._WORD_TOKEN_PATTERN, t)]

    @staticmethod
    def _rounded_mean(values):
        """Return the mean of ``values`` rounded to one decimal; 0.0 if empty."""
        if not values:
            return 0.0
        return round(float(sum(values)) / len(values), 1)

    @staticmethod
    def count_word_freq(corpus, rm_stop_words = False, stopwords = STOPWORDS):
        """Return an ``nltk.FreqDist`` of the lower-cased word tokens in ``corpus``.

        corpus        -- text to analyse; any part of the article can be passed.
        rm_stop_words -- when True, tokens found in ``stopwords`` are excluded.
        stopwords     -- iterable of words to exclude (defaults to STOPWORDS).
        """
        tokens = [t.lower() for t in Blog._word_tokens(corpus)]
        if rm_stop_words:
            # One pass with set lookups instead of repeated list.remove().
            excluded = set(stopwords)
            tokens = [t for t in tokens if t not in excluded]
        return nltk.FreqDist(tokens)

    @staticmethod
    def word_count(corpus, rm_stop_words = False):
        """Return ``[total word count, unique word count]`` for ``corpus``."""
        freq = Blog.count_word_freq(corpus, rm_stop_words)
        return [sum(freq.values()), len(freq)]

    def title_word_freq(self, rm_stop_words = False):
        """Return the word frequency distribution of the title."""
        return self.count_word_freq(self.title(), rm_stop_words)

    def body_word_freq(self, rm_stop_words = False):
        """Return the word frequency distribution of the article body."""
        return self.count_word_freq(self.blog_body(), rm_stop_words)

    def title_word_count(self, rm_stop_words = False):
        """Return ``[total, unique]`` word counts for the title."""
        return self.word_count(self.title(), rm_stop_words)

    def body_word_count(self, rm_stop_words = False):
        """Return ``[total, unique]`` word counts for the article body."""
        return self.word_count(self.blog_body(), rm_stop_words)

    @staticmethod
    def avg_word_count(corpus_list, rm_stop_words = False):
        """Return the mean total word count over a list of texts.

        The result is rounded to one decimal place. An empty ``corpus_list``
        yields 0.0 rather than a division by zero.
        """
        lens = [Blog.word_count(x, rm_stop_words)[0] for x in corpus_list]
        return Blog._rounded_mean(lens)

    def avg_word_count_sub_title(self, rm_stop_words = False):
        """Return the average number of words per sub-title."""
        return Blog.avg_word_count(self.sub_title(), rm_stop_words)

    def avg_word_count_paragraph(self, rm_stop_words = False):
        """Return the average number of words per paragraph."""
        return Blog.avg_word_count(self.break_into_paragraph(), rm_stop_words)

    # ------------------------------------------------------------------
    # Sentences
    # ------------------------------------------------------------------

    @staticmethod
    def break_into_sentence(corpus):
        """Split ``corpus`` into a list of sentences with ``sent_tokenize``."""
        return sent_tokenize(corpus)

    @staticmethod
    def sentence_count(corpus):
        """Return the number of sentences in ``corpus``."""
        return len(Blog.break_into_sentence(corpus))

    def sentence_count_body(self):
        """Return the number of sentences in the article body."""
        return Blog.sentence_count(self.blog_body())

    def avg_sentence_count_paragraph(self):
        """Return the average number of sentences per paragraph (one decimal).

        Returns 0.0 when the body has no paragraphs.
        """
        lens = [Blog.sentence_count(x) for x in self.break_into_paragraph()]
        return Blog._rounded_mean(lens)

    def avg_word_count_in_sentence(self, rm_stop_words = False):
        """Return the average number of words per sentence in the body."""
        # Uses this instance's body; the previous code read a notebook-level
        # variable named ``test`` instead of ``self``.
        return Blog.avg_word_count(Blog.break_into_sentence(self.blog_body()), rm_stop_words)

    # ------------------------------------------------------------------
    # Images and links
    # ------------------------------------------------------------------

    def num_graphics(self, ads_src = ADS_SRC):
        """Return the number of ``<img>`` tags in the post body, excluding the ad."""
        images = self._post_body().find_all('img')
        return len([img for img in images if img.get('src') != ads_src])

    def num_links(self, ads_src = ADS_SRC):
        """Return the number of ``<a>`` tags in the post body, excluding the ad link."""
        anchors = self._post_body().find_all('a')
        return len([a for a in anchors if not Blog._wraps_ad_image(a, ads_src)])

    # ------------------------------------------------------------------
    # Part-of-speech statistics
    # ------------------------------------------------------------------

    @staticmethod
    def consolidate_dictionary(d):
        """Fold per-tag counts into the broad categories we care about.

        Verb, adverb, adjective and noun tags are summed under ``'verb'``,
        ``'adv'``, ``'adj'`` and ``'noun'`` (always present, possibly 0).
        Every other key of ``d`` is copied through unchanged.
        """
        new_d = {'verb': 0, 'noun': 0, 'adj': 0, 'adv': 0}
        for key, value in d.items():
            group = Blog._POS_GROUPS.get(key)
            if group is not None:
                new_d[group] = new_d[group] + value
            else:
                new_d[key] = value
        return new_d

    @staticmethod
    def word_type(corpus, rm_stop_words = False, stopwords = STOPWORDS):
        """Return part-of-speech counts for ``corpus``.

        Tokens are tagged with ``nltk.pos_tag`` and the counts are passed
        through ``consolidate_dictionary``. Tokens keep their original case
        here, so stop-word removal relies on ``stopwords`` containing the
        cased variants. See ``nltk.help.upenn_tagset()`` for the tag set.
        """
        tokens = Blog._word_tokens(corpus)
        if rm_stop_words:
            excluded = set(stopwords)
            tokens = [t for t in tokens if t not in excluded]
        tag_counts = {}
        for _, tag in nltk.pos_tag(tokens):
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
        return Blog.consolidate_dictionary(tag_counts)

    @staticmethod
    def word_type_percentage(corpus, rm_stop_words = False, stopwords = STOPWORDS):
        """Return each word type's share of all tagged tokens, rounded to 2 decimals.

        ``stopwords`` is forwarded to ``word_type``. When no tokens are
        tagged, every share is 0.0 rather than a division by zero.
        """
        word_class = Blog.word_type(corpus, rm_stop_words, stopwords)
        total_words = sum(word_class.values())
        if total_words == 0:
            return {k: 0.0 for k in word_class}
        return {k: round(float(v) / total_words, 2) for k, v in word_class.items()}

    def word_type_percentage_title(self, word_type = 'noun', rm_stop_words = False):
        """Return the share of one word type in the title.

        ``word_type`` is ``'verb'``, ``'noun'``, ``'adj'`` or ``'adv'``, or a
        raw upper-case Penn tag such as ``'WRB'``. Raw tags are present only
        if they occur in the text, so a missing one raises KeyError.
        For the full distribution use ``self.word_type_percentage(self.title())``.
        """
        pct_dict = self.word_type_percentage(self.title(), rm_stop_words)
        return pct_dict[word_type]

    def word_type_percentage_body(self, word_type = 'noun', rm_stop_words = False):
        """Return the share of one word type in the article body.

        ``word_type`` is ``'verb'``, ``'noun'``, ``'adj'`` or ``'adv'``, or a
        raw upper-case Penn tag such as ``'WRB'``. Raw tags are present only
        if they occur in the text, so a missing one raises KeyError.
        For the full distribution use ``self.word_type_percentage(self.blog_body())``.
        """
        pct_dict = self.word_type_percentage(self.blog_body(), rm_stop_words)
        return pct_dict[word_type]

    def key_words(self, rm_stop_words = True):
        """Return the 10 most frequent body words as ``(word, count)`` pairs.

        Pairs are ordered by descending count. Stop words are excluded by
        default. No part-of-speech filtering is applied.
        """
        freq_dict = self.body_word_freq(rm_stop_words)
        sorted_freq = sorted(freq_dict.items(),
                             key = lambda pair: pair[1],
                             reverse = True)
        return sorted_freq[:10]
        
    

In [12]:
# Test with a sample article: build a Blog for the URL below and print its key words and title
# article_url = 'https://blog.aurorasolar.com/expert-qa-why-solar-panel-recycling-matters-and-how-it-can-benefit-the-industry'
# test = Blog(article_url)
# print test.key_words()
# test.title()

In [13]:
# Smoke tests for the individual Blog methods; they use the `test` instance created in the previous cell
# print test.title()
# print test.sub_title()
# print test.num_sub_title()
# print test.author()
# for x in test.break_into_paragraph():
#     print x
#     print '\n'
# print test.blog_body()
# print test.num_paragraph()
# print test.title_word_freq(False)
# print test.title_word_count(True)
# print test.body_word_count(True)
# print test.avg_word_count_sub_title()
# print test.avg_word_count_paragraph()
# print test.break_into_sentence(test.break_into_paragraph()[0])
# print test.sentence_count(test.break_into_paragraph()[0])
# print test.sentence_count(test.blog_body())
# print test.sentence_count_body()
# print test.avg_sentence_count_paragraph()
# print test.avg_word_count_in_sentence()
# print test.num_graphics()
# print test.num_links()
# print test.post_date()
# print test.title()
# print test.word_type_percentage(test.title())
# print test.word_type_percentage(test.blog_body())
# print test.word_type_percentage_title('adj')
# print test.word_type_percentage_body()