-
Notifications
You must be signed in to change notification settings - Fork 0
/
TravelPodParser.py
105 lines (82 loc) · 3.9 KB
/
TravelPodParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import re
from datetime import datetime
import BeautifulSoup
import Parser
import Util
import ElasticMappings
class TravelPodBlogParser (Parser.BlogParser):
def parsemaincontent (self):
postBody = self.soup.find(id="post")
postBodyText = ''
for t in postBody.contents:
if type(t) is BeautifulSoup.NavigableString:
postBodyText += t
elif type(t) is BeautifulSoup.Tag and t.name == 'br':
postBodyText += '\n'
firstthumbnailimagediv = self.soup.find('div', attrs={'class':re.compile('^inline-thumb')})
if (firstthumbnailimagediv):
thumbnailimage = firstthumbnailimagediv.findNext('img')['src']
thumbnailimage = thumbnailimage.replace ("xlarge", "large")
self.blog.thumbnailimage = thumbnailimage
self.blog.body = postBodyText
self.blog.title = self.soup.find("meta", {"name": "twitter:name"})['content']
def parselocation(self):
locationStack = []
titleContents = self.soup.find("p", attrs={'class' : 'meta'}).contents
for t in titleContents:
if type(t) is BeautifulSoup.Tag:
if t.name == 'a':
locationStack.append(t.string)
elif t.name == 'span':
self.blog.postdate = datetime.strptime(t.string, '%A, %B %d, %Y')
self.blog.city = ''
self.blog.state = ''
self.blog.country = ''
if (len(locationStack) == 3):
self.blog.country = locationStack[2].lower()
self.blog.state = locationStack[1].lower()
self.blog.city = locationStack[0].lower()
elif (len(locationStack) == 2):
self.blog.country = locationStack[1].lower()
self.blog.state = locationStack[0].lower()
else:
raise NotImplementedError("Single location field not handled yet")
self.isafrica = Util.isafrica(self.blog.country)
if (self.isafrica == False):
self.logger.info(self.blog.country + " is not an African country")
def getauthorurl (self):
authorHref = self.soup.find("div", attrs={'class':'profile'}).findNext ('a')['href']
authorLink = "http://www.travelpod.com" + authorHref
return authorLink
def parsetrip (self):
self.blog.trip = 'http://www.travelpod.com' + self.soup.find("a", attrs={'title' : 'See more entries in this travel blog'})['href']
class TravelPodAuthorParser (Parser.AuthorParser):
def parselogsummary (self):
self.author.username = self.soup.find("meta", {"property":"og:title"})['content']
summary = self.soup.find("div", attrs={'class' : 'bubble'}).contents
for t in summary:
if type(t) is BeautifulSoup.Tag:
if t.name == 'span':
blogcount = str(t.string).split(' ')[0]
self.author.blogcount = blogcount.translate(None, ",")
self.author.photo = self.soup.find(id="profile_pic")['src']
class TravelPodAuthorTripParser (Parser.Parser):
def getitemid(self):
return False
def parsebloglinks (self):
self.bloglist = []
for div in self.soup.findAll("div", {"class":"blog_data"}):
self.bloglist.append(div.contents[1]['href'])
return self.bloglist
class TravelPodMainSectionParser (Parser.Parser):
def getitemid(self):
return False
def loaditem(self, forcereindex = True, cookiedict = None):
cookiedict = {'tweb_order':'time'} #sort by time so we can parse latest first (instead of most popular)
Parser.Parser.loaditem(self,forcereindex, cookiedict)
def parsebloglinks (self):
self.bloglist = []
for div in self.soup.findAll('div', attrs={'class':re.compile('^blog_info$')}):
if (Util.istextenglish(div.findAll('p')[1].text)):
self.bloglist.append(div.findNext ('a')['href']);
return self.bloglist