In [1]:
# This may get a bit hefty for Notepad, but we'll see.
# We'll control the counts from here:
article_count = 50
word_count    = 25
num_k         = 7

In [2]:
from urllib.request import urlopen
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from bs4 import BeautifulSoup
import re

In [3]:
ignore_list = set( stopwords.words('english')
                  + list(punctuation) 
                  + ['’',"'s","'it","'the","‘","'i","n't",'“','”','–','–','•','…','—'] 
                  + ['i','we','one','two','1','2','3'])

In [4]:
def extract_articles( link_prefix, link_list, extract_function, source_name, count ):
    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            body_paragraphs = extract_function( link_prefix, article_href )
            if not body_paragraphs:
                raise ValueError('No body content found')
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( word_count, word_freq, 0.9, 0.1 )
            
            article_words.append({ 'source':source_name, \
                                   'words': tuple(top_words), \
                                   'href': article_href, \
                                   'text': article_text })
            processed += 1
            
        except:
            #raise
            print('ERROR: ' + article_href)
            continue
        print( "{:5d}: {}".format(processed, article_href) )
        if processed >= count:
            break
            
    return article_words

In [5]:
def count_words( text, ignore_list ):
    word_freq = defaultdict(int)
    for word in word_tokenize(text):
        word_lc = word.lower();
        if word_lc not in ignore_list and word not in ignore_list:
            word_freq[ word_lc ] += 1
            #if word_freq[word_lc] > 2:
                #print('       word at {:3d}: {}'.format(word_freq[word_lc], word_lc) )
    return word_freq

In [6]:
def top_frequencies( n, freq_list, max_cut, min_cut ):
    freq_edit = dict( freq_list )
    max_freq  = float( max(freq_list.values()) )
    
    for word in freq_list.keys():
        word_freq = freq_list[word] / max_freq
        if word_freq >= max_cut or word_freq <= min_cut:
            del freq_edit[ word ]
    #print( freq_edit )
    return nlargest( n, freq_edit, key=freq_edit.get )

In [7]:
def process_links( article_links, count ):
    unique_links = set([ re.sub('#(.*)$', '', href) for href in article_links ])
    use_total    = min( count, unique_links.__len__() )
    print( "%d unique links found, trying to use using %d" % (unique_links.__len__(), use_total) )
    return list( unique_links )    

In [8]:
def soup_up( url ):
    home_content  = urlopen( url ).read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    return home_soup

In [9]:
def crawlDailyMail( count, wordcount, ignore_list ):
    home_soup     = soup_up('http://www.dailymail.co.uk/home/index.html');
    article_links = [ a['href'] for a in home_soup.find_all('a', {'href': re.compile('/news/article-')}) ]
    link_list     = process_links( article_links, count )
    
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', {'class':'mol-para-with-font'}, recursive=False) for a in article_body ]
    
    return extract_articles(  'http://www.dailymail.co.uk', link_list, extract_content, 'daily mail', count )
        
print("Crawling the DM")
dm_vectors = crawlDailyMail( article_count, word_count, ignore_list )
#print( dm_vectors )

Crawling the DM
132 unique links found, trying to use using 50
    1: /news/article-5293055/Las-Vegas-shooters-deadly-arsenal-revealed.html
    2: /news/article-5292825/Cabinet-ministers-told-China-make-money-Brexit.html
    3: /news/article-5292821/New-safety-sheriff-hunt-companies-selling-unsafe-goods.html
    4: /news/article-5293289/BBC-Girl-staff-hired-Rohingya-Muslims-extras.html
    5: /news/article-5293087/Protestors-carried-SAVAGE-signs-2018-Womens-Marches.html
    6: /news/article-5292531/Gazza-seen-enjoying-Leicester-City-vs-Watford-match.html
    7: /news/article-5279521/Brisbane-Lucy-Cottier-reveals-life-multiple-personalities.html
    8: /news/article-5292927/Autism-sufferers-given-Blue-Badges.html
    9: /news/article-5292753/Ukip-leaders-former-mistress-posted-Islamophobic-messages.html
   10: /news/article-5293851/Britain-freezes-coldest-night-TWO-years.html
   11: /news/article-5290299/Why-DJ-Kid-Jensen-revealed-Parkinsons.html
   12: /news/article-5291849/Students-pr

In [10]:
def crawlGuardian( count, wordcount, ignore_list ):
    home_soup     = soup_up('https://www.theguardian.com/uk-news');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/\d{4}/[a-z]{3}/\d{1,2}/') }) ]
    link_list     = process_links( article_links, count )
    
    link_list = list(map( lambda href: href.replace('https://www.theguardian.com', ''), link_list ))
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', recursive=False) for a in article_body ]
    
    return extract_articles( 'https://www.theguardian.com', link_list, extract_content, 'guardian', count )
    
print("Crawling the Guardian")
gd_vectors = crawlGuardian( article_count, word_count, ignore_list )
#print( gd_vectors )   

Crawling the Guardian
54 unique links found, trying to use using 50
    1: /culture/2018/jan/20/british-top-art-galleries-investigate-sexual-misconduct
    2: /uk-news/2018/jan/20/navy-museum-polaris-nuclear-weapons-exhibition-propaganda-cnd
    3: /business/2018/jan/19/frank-field-demands-answers-over-reckless-running-of-carillion
    4: /politics/2018/jan/20/jewish-labour-group-accuses-failing-act-antisemitism
    5: /society/2018/jan/19/channel-4-calls-in-security-experts-after-cathy-newman-suffers-online-abuse
    6: /money/2018/jan/20/milton-keynes-uk-capital-of-right-to-buy-to-let
    7: /politics/2018/jan/13/nigel-farage-rattled-peers-brexit-struggle-now-beginning
    8: /lifeandstyle/2018/jan/21/iwatched-my-son-die-from-cancer-lessons-i-have-learned-sacha-langton-gilks
    9: /politics/2018/jan/17/pmqs-verdict-corbyn-crushes-may-over-carillion
   10: /uk-news/2018/jan/20/sinn-fein-to-be-led-by-a-woman-mary-lou-mcdonald-for-the-first-time-in-its-modern-history
   11: /media/2017

In [19]:
# Here we'll sort out our training data, as we've just expanded the crawler to 
# include the full article text - we'll use this on a SciKit Naive Bayes and see
# how the analysis of the full page compares against our 25 key words.

training_data = gd_vectors + dm_vectors

In [12]:
# So, my version is based on possibly an oversimplification of the maths,
# that we went through at the start of the course. Essentially, that for a 
# NB classififer, we first calculate the measure of each class for a word 
# which is characterised by countClass(word) / countTotal(word)
# this was originally stated as Cspam(T) / Cspam(T) + Cham(t) (Spam Detection example)
# I've (possibly wrongly) adjusted the denominator to be the total - isn't that effectively
# what the sum of the two classes is?
# Then, going through each word, the value of Cclass(word) is multiplied by the value.

# So, having added in the MultinomialBayes, and seeing that it agrees with theirs, I'm
# curious to know why mine isn't (quite) on the same page. IT definitely seems that I'm
# missing something to do with the 'scaling', I wonder if that will sort it out.

class MyBayesNaive():
    
    def __init__( self, training_data ):
        self._training_data = training_data
        self._calculate_totals()
        self._calculate_weights()
        
    # So here we work out Cclass(word) and Ctotal(word)...
    def _calculate_totals( self ):
        self._word_totals     = defaultdict(int)
        self._class_instances = defaultdict(int)
        self._classes         = {}
        
        for vector in self._training_data:

            self._class_instances[ vector['source'] ] += 1

            if not vector['source'] in self._classes:
                self._classes[ vector['source'] ] = defaultdict(int)
            
            for word in vector['words']:
                self._word_totals[ word ] += 1
                self._classes[ vector['source'] ][ word ] += 1
                
    # and then we do the division...
    def _calculate_weights( self ):
        self._word_weights = {}
        
        for _class in self._classes.keys():
            # which happens here...
            # i = number of times word appears in  _class
            # CHANGE: from i/(total word instances) to i/(class instances in training data)
            # because i only repesents a count of instances containing the word,
            # this should work - we then need to scale in the test
            self._word_weights[ _class ] = { \
               word: float(i/self._class_instances[ _class ]) \
                   for word,i in self._classes[_class].items() }
            
            
    def test( self, text, word_count ):
        test_freqs  = count_words( text, ignore_list )
        test_vector = top_frequencies( word_count, test_freqs, 0.9, 0.1 )
        
        result = {}
        for _class in self._classes.keys():
            value = 1
            for word in test_vector:
                if word not in self._word_totals or word not in self._word_weights[ _class ]:
                    # CHANGE: previously, we'd done nothing here - now we divide by a 1000.
                    # Not sure why 1000, but as stated, we're nicking that from them.
                    # Maybe that's the same result as a 50%?
                    # Maybe the same as a very little significance?
                    # Need to investigate the maths.
                    value /= 1e3 # we'll steal this from them...
                    continue;
                value *= self._word_weights[ _class ][ word ]
           
            # CHANGE: added this scale at the end, too... so now the final result
            # is multiplied by the probability of the instance being this class
            result[ _class ] = value * \
                float( self._class_instances[_class] / sum(self._class_instances.values()) )
            
        
        return {source: result[source] for source in sorted( result, key=result.get, reverse=True ) }
        
bn = MyBayesNaive( training_data )

In [13]:
# So, some of their maths is a little different to mine...
# I'm not 100% correct it does what they say it does, but further investigation will
# surely lead us to an answer.

class TheirNaiveBayes():
    
    def __init__( self, training_data ):
        self._sum_frequencies( training_data )
    
    def _sum_frequencies( self, training_data ):
        self._frequencies = {}
        for vector in training_data:
            if vector['source'] not in self._frequencies:
                self._frequencies[ vector['source'] ] = defaultdict(int)
            
            for word in vector['words']:
                self._frequencies[ vector['source'] ][ word ] += 1
                
    def test( self, text, word_count ):
        test_freqs  = count_words( text, ignore_list )
        test_vector = top_frequencies( word_count, test_freqs, 0.9, 0.1 )
        
        class_results = defaultdict( lambda: 1 )
        for word in test_vector:
            for _class in self._frequencies.keys():
                
                if word in self._frequencies[ _class ]:
                    # so this ratio seems to be different to mine.. they calcuate the value of a word 
                    # as being the Cclass(word) / Cclass(all words)
                    # also, this feels computationally expensive - why sum it every time? It's not changing.
                    class_results[ _class ] *= 1e3* self._frequencies[ _class ][ word ] \
                        / float( sum(self._frequencies[_class].values()) )
                else:
                    # This makes sense - if the word isn't there, don't just times by zero
                    # that will instantly reset - I'm curious to know how they arrived at 1e3, though...
                    # ( for comparison, if the word wasn't included, mine skips the multiply )
                    class_results[ _class ] /= 1e3
            
        for _class in self._frequencies.keys():
            # I guess this sort of makes sense - you scale the number against the total number of 
            # words counted in the category - at least, in theory that's what it does.
            # see above, re: being computationally expensive.
            class_results[ _class ] *= float( sum(self._frequencies[_class].values()) ) \
                / ( float(sum(self._frequencies[_class].values())) )
                
        return { s: class_results[s] for s in sorted(class_results, key=class_results.get, reverse=True) }
                
tbn = TheirNaiveBayes( training_data )

In [14]:
# Ok, let's get fancy and try it with a scikit class..
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

import numpy as np
import pandas as pd

pred_array = [ {word: 1 for word in v['words']} for v in training_data ]
targ_array = [ v['source']  for v in training_data ]
vectors    = pd.DataFrame( pred_array, index=targ_array )
vectors.fillna(0, inplace=True )

columns = vectors.axes[1]
classes = vectors.axes[0]

model = MultinomialNB()
model.fit( vectors.values, classes )

def testNaiveBayes( text, word_count ):
    test_freqs  = count_words( text, ignore_list )
    test_words  = top_frequencies( word_count, test_freqs, 0.9, 0.1 )
    test_vector = [  1 if word in test_words else 0 for word in columns ]
    return model.predict( [test_vector] )
    


In [17]:
# ...now lets do that again, but on the full body text.

datarows       = [ {'class': vector['source'], 'text': vector['text']} for vector in training_data ]
dataindex      = [ vector['href'] for vector in training_data ] 
fulltext_frame = pd.DataFrame( datarows, index=dataindex )
fulltext_frame = fulltext_frame.reindex( np.random.permutation(fulltext_frame.index) )

cv         = CountVectorizer()
ft_vectors = cv.fit_transform( fulltext_frame['text'].values )
ft_targets = fulltext_frame['class'].values

ft_model = MultinomialNB()
ft_model.fit( ft_vectors, ft_targets )

def testFullText( text ):
    test_texts   = [ text ]
    test_vectors = cv.transform( test_texts )
    return ft_model.predict( test_vectors )


In [18]:
# Let's test the BBC!
count = 30

print('Getting BBC links...')
bbc_soup  = soup_up('http://www.bbc.com/news')
bbc_links = [ a['href'] for a in bbc_soup.find_all('a', { 'href': re.compile('^/news/(.*)\d{6,}$') }) ]
link_list = process_links( bbc_links, count )

processed = 0;
for test_link in link_list:
    print('Testing ' + test_link )
    test_soup = soup_up('http://www.bbc.com' + test_link)
    print( test_soup.title.text )

    test_body    = test_soup.find('div', {'property':'articleBody'})
    if not test_body:
        count += 1
        print(' -- No test body found, continuing...\n')
        continue
    test_para    = test_body.find_all('p', recursive=False )
    if not test_para:
        print(' -- No paragraphs found, continuing...\n')
        continue
    
    test_text   = ' '.join([ p.text for p in test_para ])
    test_result = bn.test(test_text, word_count )
    nvby_result = testNaiveBayes( test_text, word_count )
    full_result = testFullText( test_text )

    print("   My result: " + ', '.join(["{}: {:2.2e}" \
       .format( source, score * 10000 ) for source, score in test_result.items() ]))
    
    their_result = tbn.test( test_text, word_count )
    print( "Their result: " + ', '.join(["{}: {:2.2e}" \
        .format( source, score ) for source, score in their_result.items() ]))
    
    print("Naive result: " + nvby_result[0] )
    print("FullT result: " + full_result[0] )
    
    print('')
    
    processed += 1
    if( processed >= count ):
        break
    

Getting BBC links...
41 unique links found, trying to use using 30
Testing /news/world-europe-42750584
Why Italians are saying 'No' to takeaway coffee - BBC News
 -- No test body found, continuing...

Testing /news/world-europe-42765092
French President says Donald Trump is not a 'classical politician' - BBC News
 -- No test body found, continuing...

Testing /news/world-us-canada-42651688
Trump's year on Twitter: Who has he criticised and praised the most? - BBC News
   My result: guardian: 3.28e-56, daily mail: 1.31e-58
Their result: guardian: 1.82e-45, daily mail: 1.82e-49
Naive result: guardian
FullT result: daily mail

Testing /news/world-europe-42707957
Putin cast as national saviour ahead of Russia election - BBC News
   My result: daily mail: 9.70e-58, guardian: 2.28e-59
Their result: daily mail: 1.35e-48, guardian: 7.89e-52
Naive result: daily mail
FullT result: daily mail

Testing /news/world-africa-42767234
Ethiopia Waldiya: Five killed by police at religious festival - BBC 

Football president George Weah puts Liberian army to the test - BBC News
 -- No test body found, continuing...

Testing /news/stories-42665317
I advertised for a man to get me pregnant - then I fell in love - BBC News
   My result: daily mail: 9.05e-27, guardian: 5.46e-30
Their result: daily mail: 5.10e-16, guardian: 4.72e-24
Naive result: daily mail
FullT result: daily mail

Testing /news/world-42736495
Why do names matter so much? - BBC News
   My result: daily mail: 2.16e-62, guardian: 1.54e-62
Their result: daily mail: 1.82e-56, guardian: 1.33e-56
Naive result: daily mail
FullT result: guardian

