# Collect: importing RSS feeds

In [153]:
# RSS stands for Rich Site Summary and uses standard web feed formats to publish frequently updated information: 
# blog entries, news headlines, audio, video.

# feedparser parses RSS documnets

import feedparser

In [154]:
usa = feedparser.parse('https://www.reddit.com/r/usa/.rss')
eur = feedparser.parse('https://www.reddit.com/r/europe/.rss')

In [155]:
# The channel elements are available in 'feed' attribute

# The items are available in 'entries' ttribute, which is a list of dictionaries.

# You access items in the list in the same order in which they appear in the
# original feed, so the first item is available in d.entries[0].

In [156]:
# Print the title of the feed
usa['feed']['title']

'We The People'

In [157]:
# Each entry in the feed is a dictionary. Use [0] to print the first entry.
usa['entries'][0]['title']

'Flair!'

In [158]:
# let's print all the titles with their links
for entry in usa['entries']:
    print(entry['title'], entry['link'], sep=': ')
    print()

Flair!: https://www.reddit.com/r/usa/comments/2oxp3f/flair/

DC Heads to 100% Renewable Energy, a Symbolic Move for the Country: https://www.reddit.com/r/usa/comments/bgdkji/dc_heads_to_100_renewable_energy_a_symbolic_move/

Nevada governor signs bill raising clean energy standard: https://www.reddit.com/r/usa/comments/bgc1su/nevada_governor_signs_bill_raising_clean_energy/

Warren unveils $640 billion college debt forgiveness plan: https://www.reddit.com/r/usa/comments/bgdq73/warren_unveils_640_billion_college_debt/

More Americans than ever are leaving the Catholic Church after the sex abuse scandal. Here's why.: https://www.reddit.com/r/usa/comments/bgcl9a/more_americans_than_ever_are_leaving_the_catholic/

I am looking for some subreddits or sites about living in particular states: https://www.reddit.com/r/usa/comments/bg6ts5/i_am_looking_for_some_subreddits_or_sites_about/

North Dakota: Mandan Beautification Effort seeks economic growth through flowers: "its been proven all acros

In this example, we’ll take some data from reddit feeds from usa and europe . We’re going to see if people in both the cities use different words. If they do, what are the words they use? Can the words people
use give us some idea what’s important to people in different countries?

In [159]:
import numpy as np
import re

def createVocabList(dataSet):
    vocabList = set([])
    for document in dataSet:
        vocabList = vocabList | set(document)
    return list(vocabList)

def bagofWords2Vector(vocabList, inputDoc):
    returnVec = [0]*len(vocabList)
    for word in inputDoc:
        try:
            index = vocabList.index(word)
        except ValueError:
            print("The word {} is not contained in the vocabList".format(word))
        else:
            returnVec[index] += 1
    return np.array(returnVec)

def trainNB(trainMatrix, categoryList):
    numofWords = len(trainMatrix[0])
    numofDocuments = len(trainMatrix)
    p1Num = np.ones(numofWords); p0Num = np.ones(numofWords)
    p1Denom = 2.0; p0Denom = 2.0
    for i in range(numofDocuments):
        if categoryList[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix)
        elif categoryList[i] == 0:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix)
    p1Vec = np.log(p1Num/p1Denom)
    p0Vec = np.log(p0Num/p0Denom)
    pSpam = sum(categoryList)/numofDocuments
    return p1Vec, p0Vec, pSpam

def classifyNB(vector2classify, p1Vec, p0Vec, pSpam):
    pNotSpam = 1-pSpam
    p1 = sum(vector2classify * p1Vec) + np.log(pSpam)
    p0 = sum(vector2classify * p0Vec) + np.log(pNotSpam)
    if p1>p0:
        return 1
    else:
        return 0    
        

In [160]:
def textParse(bigString):
    listofTokens = [x for x in re.split('\W+', bigString) if len(x)>2]
    return listofTokens

In [161]:
# returns top x words with most occurence
def calcMostFrequency(vocabList, fullText):
    wordCount = {}
    for word in vocabList:
        wordCount[word] = fullText.count(word)
    sortedDict = sorted(wordCount, key = lambda x: x[1], reverse=True)
    return sortedDict[:2]

In [162]:
# this function takes feeds from two countries to train the model
# and classifies if the feed was from country1 or country2
def localWords(feed1, feed0): # usa-1, eur-0 # the feed should be loaded outside of this function
    docList=[]; classList=[]; fulltext=[]
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['title'])
        docList.append(wordList)
        fulltext.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['title'])
        docList.append(wordList)
        fulltext.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainSet=np.arange(2*minLen)
    testSet = np.random.choice(trainSet, 10)
    trainSet = np.delete(trainSet, testSet)
    trainMat  = []
    trainCatList = []
    for i in trainSet:
        trainMat.append(bagofWords2Vector(vocabList, docList[i]))
        trainCatList.append(classList[i])
    p1Vec, p0Vec, pClass1 = trainNB(trainMat, trainCatList)
    errorCount=0
    for i in testSet:
        wordVec = bagofWords2Vector(vocabList, docList[i])
        if classifyNB(wordVec, p1Vec, p0Vec, pClass1) != classList[i]:
            errorCount += 1
    print("the error rate is {}".format(errorCount/len(testSet)))
    return vocabList, p1Vec, p0Vec
        

In [163]:
localWords(usa, eur)

the error rate is 0.5


(['for',
  'plan',
  'here',
  'recently',
  'will',
  'raise',
  'increase',
  'Ambassador',
  'amid',
  'pilgrim',
  'poland',
  'Stars',
  '651',
  'run',
  'Retiring',
  'strangers',
  'during',
  'single',
  'NASA',
  'Blanc',
  'scandal',
  'From',
  'job',
  'particular',
  'Bans',
  'Spain',
  'from',
  'states',
  'like',
  'living',
  'healthy',
  'boost',
  'stop',
  'maintain',
  'killing',
  'demand',
  'festival',
  'Moody',
  'report',
  'stoning',
  'rise',
  'Europe',
  'fall',
  'Trump',
  'year',
  'leading',
  'mean',
  'raising',
  'Colorado',
  'Plastic',
  'make',
  'month',
  'common',
  'their',
  'popular',
  'planters',
  'Most',
  'Exhaustion',
  'Opportunity',
  'actual',
  'question',
  'change',
  'forgiveness',
  'sites',
  'its',
  'toward',
  'sex',
  'River',
  'that',
  'place',
  'bill',
  'Reportedly',
  'Eastern',
  'way',
  'frontier',
  'Serbia',
  'renewable',
  'affordable',
  'Government',
  'Luxembourg',
  'wood',
  'Chamonix',
  'week',
  '

# Analyze: displaying locally used words

In [181]:
# error in this function

# returns top words used in both the countries feed
# uses the probobality vector to do so
def getTopWords(usa, eur):
    vocabList, p1Vec, p0Vec = localWords(usa, eur)
    topUSA=[]; topEUR = []
    for i in range(len(p1Vec)):
        if p1Vec[i] > -0.7:
            topUSA.append((vocabList[i], p1Vec[i]))
        if p0Vec[i] > -0.7:
            topEUR.append((vocabList[i], p0Vec[i]))
    sortedUSA = sorted(topUSA, key=lambda x: x[1], reverse=True)
    print("----------United States of America------------", '\n')
    for i in sortedUSA:
        print(i[0])
    sortedEUR = sorted(topEUR, key=lambda x: x[1], reverse=True)
    print("----------Europe------------", '\n')
    for i in sortedEUR:
        print(i[0])
    

In [182]:
print(getTopWords(usa, eur))

the error rate is 0.3
----------United States of America------------ 

will
raise
increase
651
Blanc
job
like
healthy
stop
maintain
stoning
Colorado
make
popular
planters
change
its
toward
River
that
affordable
Chamonix
baskets
all
large
minimum
flowing
obsolete
Country
Coloradans
law
flowers
plants
gay
letter
wtsp
through
add
calibrated
bills
person
robot
family
Symbolic
death
House
things
100
Renewable
seeks
teachers
they
Brunei
Bossons
flower
Beautification
can
Effort
pesticides
built
strengthen
Heads
Dakota
your
France
towns
salary
defends
Move
get
hanging
ability
Housing
brand
North
com
want
live
approves
when
Idaho
simple
climate
soon
approach
housing
power
proven
would
Mandan
small
people
could
three
car
country
new
life
Energy
benefit
Glacier
been
Arve
aren
sorts
happened
support
----------Europe------------ 

will
raise
increase
651
Blanc
job
like
healthy
stop
maintain
stoning
Colorado
make
popular
planters
change
its
toward
River
that
affordable
Chamonix
baskets
all
large
min