#### This exercise will classify news articles into tech and non-tech categories with the two separate algorithms.

#### To begin, here's some boilerplate:

In [3]:
import requests
import urllib.request as urlreq
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from heapq import nlargest
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [25]:
def getWaPoText(url,token):
    try:
        req = urlreq.Request(url,headers={'User-Agent': 'Mozilla/5.0'})
        page = urlreq.urlopen(req).read().decode('utf8')
    except:
        return (None,None)
    soup = BeautifulSoup(page,'html.parser')
    if soup is None:
        return (None,None)
    text = ''
    if soup.find_all(token) is not None:
        text = ''.join(map(lambda p: p.text, soup.find_all(token)))
        #TODO: check if this step is necessary.
        soup2 = BeautifulSoup(text,'html.parser')
        if soup2.find_all('p') != []:
            text = ''.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text        

In [5]:
def getNYTText(url,token):
    response = requests.get(url)
    soup = BeautifulSoup(response.content,'html.parser')
    page = str(soup)
    title = soup.find('title').text
    mydivs = soup.findAll('p',{'class':'story-body-text story-content'})
    text = ''.join(map(lambda p: p.text, mydivs))
    return title, text

In [24]:
def scrapeSource(url,magicFrag='2015',scraperFunction=getNYTText,token='None'):
    urlBodies = {}
    request = urlreq.Request(url,headers={'User-Agent': 'Mozilla/5.0'})
    response = urlreq.urlopen(request)
    soup = BeautifulSoup(response,'html.parser')
    numErrors = 0
    for a in soup.findAll('a'):
        try:
            url = a['href']
            if((url not in urlBodies) and
               ((magicFrag is not None and magicFrag in url) or magicFrag is None)):
                body = scraperFunction(url,token)
                if body and len(body) > 0:
                    urlBodies[url] = body
                print(url)
        except:
            numErrors += 1
    return urlBodies

In [131]:
class FrequencySummarizer:
    def __init__(self,min_cut=0.1,max_cut=0.9):
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + 
                              list(punctuation) + 
                              [u"'s",'"'])
    
    def _compute_frequencies(self,word_sent,customStopWords=None):
        freq = defaultdict(int)
        if customStopWords is None:
            stopwords = set(self._stopwords)
        else:
            stopwords = set(customStopWords).union(self._stopwords)
        for sentence in word_sent:
            for word in sentence:
                if word not in stopwords:
                    freq[word] += 1
        if len(freq) > 0:
            m = float(max(freq.values()))
            for word in list(freq.keys()):
                freq[word] = freq[word]/m
                if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                    del freq[word]
        return freq
    
    def extractFeatures(self,article,n,customStopWords=None):
        title = article[0]
        text = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent,customStopWords)
        if n < 0:
            # If the user asks for a negative number of features selected, 
            # then all words are returned.
            return nlargest(len(self._freq.keys()),self._freq,key=self._freq.get)
        else:
            # Otherwise, only return the 'n' largest features.
            return nlargest(n,self._freq,key=self._freq.get)
    
    def extractRawFrequencies(self,article):
        title = article[0]
        text = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
    
    def summarize(self,article,n):
        title = article[0]
        text = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n,ranking,key=ranking.get)
        return [sentences[j] for j in sentences_index]

In [21]:
urlWashingtonPostNonTech = "https://www.washingtonpost.com/sports"
urlNewYorkTimesNonTech = "https://www.nytimes.com/pages/sports/index.html"
urlWashingtonPostTech = "https://www.washingtonpost.com/business/technology"
urlNewYorkTimesTech = "http://www.nytimes.com/pages/technology/index.html"

In [26]:
washingtonPostTechArticles = scrapeSource(urlWashingtonPostTech,
                                          '2016',
                                         getWaPoText,
                                         'article') 
washingtonPostNonTechArticles = scrapeSource(urlWashingtonPostNonTech,
                                          '2016',
                                         getWaPoText,
                                         'article')
                
                
newYorkTimesTechArticles = scrapeSource(urlNewYorkTimesTech,
                                       '2016',
                                       getNYTText,
                                       None)
newYorkTimesNonTechArticles = scrapeSource(urlNewYorkTimesNonTech,
                                       '2016',
                                       getNYTText,
                                       None)


https://www.washingtonpost.com/news/on-leadership/wp/2016/11/23/the-avid-reading-habits-of-trumps-potential-secretary-of-defense-james-mad-dog-mattis/
https://www.washingtonpost.com/news/grade-point/wp/2016/12/01/a-school-official-called-for-compassion-for-the-ohio-state-attacker-people-are-demanding-she-be-fired/
https://www.washingtonpost.com/news/worldviews/wp/2016/12/02/russias-obsession-with-whether-prime-minister-dmitry-medvedev-is-asleep-or-not/
https://www.washingtonpost.com/news/worldviews/wp/2016/12/01/a-missile-strike-kills-the-beloved-clown-of-aleppo/
https://www.washingtonpost.com/news/true-crime/wp/2016/12/02/mysterious-disappearance-of-mother-and-her-two-sons-leads-to-horrific-ending/
https://www.washingtonpost.com/local/the-financial-secrecy-behind-white-nationalist-group-known-for-hail-trump-nazi-salutes/2016/12/01/ae8e2e0a-b7d1-11e6-a677-b608fbb3aaf6_story.html?utm_term=.d95ed03445ac
https://www.washingtonpost.com/news/true-crime/wp/2016/12/01/youre-helping-her-im-goi

In [36]:
articleSummaries = {}
for techUrlDictionary in [newYorkTimesTechArticles, washingtonPostTechArticles]:
    for articleUrl in techUrlDictionary:
        if techUrlDictionary[articleUrl][0] is not None:
            if len(techUrlDictionary[articleUrl][0]) > 0:
                fs = FrequencySummarizer()
                summary = fs.extractFeatures(techUrlDictionary[articleUrl],25)
                articleSummaries[articleUrl] = {'feature-vector':summary,'label':'Tech'}
for nontechUrlDictionary in [newYorkTimesNonTechArticles, washingtonPostNonTechArticles]:
    for articleUrl in nontechUrlDictionary:
        if nontechUrlDictionary[articleUrl][0] is not None:
            if len(nontechUrlDictionary[articleUrl][0]) > 0:
                fs = FrequencySummarizer()
                summary = fs.extractFeatures(nontechUrlDictionary[articleUrl],25)
                articleSummaries[articleUrl] = {'feature-vector':summary,'label':'Non-Tech'}

In [37]:
def getDoxyDonkeyTest(testUrl,token):
    response = requests.get(testUrl)
    soup = BeautifulSoup(response.content,'html.parser')
    page = str(soup)
    title = soup.find('title').text
    mydivs = soup.findAll('div',{'class':token})
    text = ''.join(map(lambda p: p.text, mydivs))
    return title,text

testUrl = "http://doxydonkey.blogspot.in"
testArticle = getDoxyDonkeyTest(testUrl,'post-body')

fs = FrequencySummarizer()
testArticleSummary = fs.extractFeatures(testArticle,25)

In [38]:
similarities = {}
for articleUrl in articleSummaries:
    oneArticleSummary = articleSummaries[articleUrl]['feature-vector']
    similarities[articleUrl] = len(set(testArticleSummary).intersection(set(oneArticleSummary)))

labels = defaultdict(int)
knn = nlargest(5, similarities, key=similarities.get)
for oneNeighbor in knn:
    labels[articleSummaries[oneNeighbor]['label']] += 1
    
nlargest(1,labels,key=labels.get)

['Tech']

In [39]:
cumulativeRawFrequencies = {'Tech':defaultdict(int),'Non-Tech':defaultdict(int)}
trainingData = {'Tech':newYorkTimesTechArticles,'Non-Tech':newYorkTimesNonTechArticles}
for label in trainingData:
    for articleUrl in trainingData[label]:
        if len(trainingData[label][articleUrl][0]) > 0:
            fs = FrequencySummarizer()
            rawFrequencies = fs.extractRawFrequencies(trainingData[label][articleUrl])
            for word in rawFrequencies:
                cumulativeRawFrequencies[label][word] += rawFrequencies[word]

In [42]:
techiness = 1.0
nontechiness = 1.0
for word in testArticleSummary:
    if word in cumulativeRawFrequencies['Tech']:
        techiness *= 1e3*cumulativeRawFrequencies['Tech'][word]/float(sum(
                cumulativeRawFrequencies['Tech'].values()))
    else:
        techiness /= 1e3
    if word in cumulativeRawFrequencies['Non-Tech']:
        nontechiness *= 1e3*cumulativeRawFrequencies['Non-Tech'][word]/float(sum(
                cumulativeRawFrequencies['Non-Tech'].values()))
    else:
        nontechiness /= 1e3

In [45]:
techiness *= float(sum(cumulativeRawFrequencies['Tech'].values()))/(float(sum(
    cumulativeRawFrequencies['Tech'].values())) + float(sum(cumulativeRawFrequencies['Non-Tech'].values())))
nontechiness *= float(sum(cumulativeRawFrequencies['Non-Tech'].values()))/(float(sum(
        cumulativeRawFrequencies['Non-Tech'].values())) + float(sum(cumulativeRawFrequencies['Non-Tech'].values())))

if techiness > nontechiness:
    label = 'Tech'
else:
    label = 'Non-Tech'
print(label, techiness, nontechiness)

Tech 3.163443192158864e-08 8.101507720377187e-27


In [52]:
def getAllDoxyDonkeyPosts(url,links):
    request = urlreq.Request(url)
    response = urlreq.urlopen(request)
    soup = BeautifulSoup(response,'html.parser')
    for a in soup.findAll('a'):
        try:
            url = a['href']
            title = a['title']
            if title == 'Older Posts':
                print(title, url)
                links.append(url)
                getAllDoxyDonkeyPosts(url,links)
        except:
            title = ''
    return

In [54]:
blogUrl = "http://doxydonkey.blogspot.in"
links = []
getAllDoxyDonkeyPosts(blogUrl, links)


Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-09-09T07:34:00-07:00&max-results=7
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-08-28T20:08:00-07:00&max-results=7&start=7&by-date=false
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-08-17T19:24:00-07:00&max-results=7&start=14&by-date=false
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-08-07T20:30:00-07:00&max-results=7&start=21&by-date=false
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-07-26T19:55:00-07:00&max-results=7&start=28&by-date=false
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-07-17T19:47:00-07:00&max-results=7&start=35&by-date=false
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-07-06T19:34:00-07:00&max-results=7&start=42&by-date=false
Older Posts http://doxydonkey.blogspot.in/search?updated-max=2016-06-26T19:36:00-07:00&max-results=7&start=49&by-date=false
Older Posts http://doxydonkey.bl

In [82]:
doxyDonkeyPosts = {}
for link in links:
    content = getDoxyDonkeyTest(link,'post-body')
    print(content)
    doxyDonkeyPosts[link] = content

("Onager's Daily Tech Snippets", '\n\n\nWhat’s Really Missing From the New iPhone: Dazzle\xa0Forget about the headphone jack for a second.\xa0Sure, it’s pretty annoying that Apple’s newest iPhones — the 7 and 7 Plus, which were unveiled in San Francisco on Wednesday and will start shipping to customers on Sept. 16 — will not include a port for plugging in standard earbuds. But you’ll get used to it.\xa0The absence of a jack is far from the worst shortcoming in Apple’s latest product launch. Instead, it’s a symptom of a deeper issue with the new iPhones, part of a problem that afflicts much of the company’s product lineup: Apple’s aesthetics have grown stale.\xa0Apple has squandered its once-commanding lead in hardware and software design. Though the new iPhones include several new features, including water resistance and upgraded cameras, they look pretty much the same as the old ones. The new Apple Watch does too. And as competitors have borrowed and even begun to surpass Apple’s best

In [108]:
documentCorpus = []
for onePost in doxyDonkeyPosts.values():
    documentCorpus.append(onePost[1])

In [132]:
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')
X = vectorizer.fit_transform(documentCorpus)
km = KMeans(n_clusters=5,init='k-means++',max_iter=100,n_init=1,verbose=True)
km.fit(X)

keywords = {}
for i,cluster in enumerate(km.labels_):
    oneDocument = documentCorpus[i]
    fs = FrequencySummarizer()
    summary = fs.extractFeatures(('',oneDocument),
                                100,
                                [u'according',u'also',u'billion',u'like',u'new',u'one',u'year',u'first',u'last'])
    if cluster not in keywords:
        keywords[cluster] = set(summary)
    else:
        keywords[cluster] = keywords[cluster].intersection(set(summary))

Initialization complete
Iteration  0, inertia 109.496
Iteration  1, inertia 56.543
Converged at iteration 1


In [133]:
keywords

{0: {"''",
  '``',
  'business',
  'china',
  'companies',
  'even',
  'facebook',
  'firm',
  'including',
  'investors',
  'market',
  'people',
  'percent',
  'revenue',
  'software',
  'two',
  'users',
  'would',
  'years',
  '—'},
 1: {'amazon',
  'apple',
  'business',
  'companies',
  'market',
  'online',
  'sales',
  'would',
  'years',
  '—'},
 2: set(),
 3: {'apple', 'companies', 'google'},
 4: {"''", 'amazon', 'could', 'mobile', 'people', 'sales', 'service'}}