Skip to content

Commit

Permalink
Update description
Browse files Browse the repository at this point in the history
  • Loading branch information
zecoyote007 committed May 7, 2015
1 parent 4f46819 commit 7410fc9
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import urllib2
import urllib
import re
import HTMLParser
from lxml import etree
from lxml import html
from pyquery import PyQuery as pq

#Montpellier
Expand All @@ -18,8 +18,10 @@
'Cookie': 'TASession=%1%V2ID.25F62354D4F10DF7FFE74EAFC0FF81E3*SQ.2*LS.UserReviewController*GR.34*TCPAR.13*TBR.55*EXEX.39*ABTR.73*PPRP.83*PHTB.6*FS.76*CPU.50*HS.popularity*ES.popularity*AS.popularity*DS.5*SAS.popularity*FPS.oldFirst*DF.0*LP.%2FRestaurantSearch%3Fgeo%3D1221643*MS.19*RMS.-1*TRA.true*LD.1221643*FBH.2; SessionTest=true; CommercePopunder=SuppressUnload; ServerPool=X; TATravelInfo=V2*A.2*MG.-1*HP.2*FL.3*RS.-1; CM=%1%HanaPersist%2C%2C-1%7Cpu_vr2%2C%2C-1%7Ct4b-pc%2C%2C-1%7CHanaSession%2C%2C-1%7CFtrSess%2C%2C-1%7CRCPers%2C%2C-1%7CHomeAPers%2C%2C-1%7C+r_lf_1%2C%2C-1%7CWShadeSeen%2C%2C-1%7CRCSess%2C%2C-1%7C+r_lf_2%2C%2C-1%7Cpu_vr1%2C%2C-1%7CFtrPers%2C%2C-1%7CHomeASess%2C%2C-1%7CBPPers%2C1%2C1431507941%7CBPSess%2C1%2C-1%7CPU_quick1%2C%2C-1%7CPU_quick2%2C%2C-1%7Cvr_npu2%2C%2C-1%7CLastPopunderId%2C94-610-34217%2C-1%7Cpssamex%2C%2C-1%7Cvr_npu1%2C%2C-1%7Cbrandsess%2C%2C-1%7CCCPers%2C%2C-1%7CCCSess%2C%2C-1%7CWAR_RESTAURANT_FOOTER_SESSION%2C%2C-1%7Cbrandpers%2C%2C-1%7Csesssticker%2C%2C-1%7C%24%2C%2C-1%7Ct4b-sc%2C%2C-1%7Cviator_2%2C%2C-1%7CWarPopunder_Session%2C%2C-1%7Csess_rev%2C%2C-1%7Csessamex%2C%2C-1%7Cviator_1%2C%2C-1%7CWarPopunder_Persist%2C%2C-1%7CSaveFtrPers%2C%2C-1%7Cr_ta_2%2C%2C-1%7Cr_ta_1%2C%2C-1%7CSaveFtrSess%2C%2C-1%7Cpers_rev%2C%2C-1%7CRBASess%2C%2C-1%7Cperssticker%2C%2C-1%7CMetaFtrSess%2C%2C-1%7CRBAPers%2C%2C-1%7CWAR_RESTAURANT_FOOTER_PERSISTANT%2C%2C-1%7CMetaFtrPers%2C%2C-1%7C; TASSK=enc%3ARNniFej67m%2BOS2WmzABITVIvwBRyE34H5yY%2FWEDeRgajeI4bPVxeHAdUPz3chjcM; TAUnique=%1%enc%3AIhZfEIVqUMIbljTEYWK8objYfixl0BnB06b1atePIxQ2jHwltRJPGQ%3D%3D; roybatty=AAABTSh5M36KsTYAzgyHlHu%2BCh%2BGxCeEiP6F8w%3D%3D%2C1'}

email_regex = re.compile(r'(\b[\w.]+@+[\w.]+.+[\w.]\b)')
h = HTMLParser.HTMLParser()

def clean(input):
input = h.unescape(input)
input = make_unicode(input)
input = input.strip(' \t\n\r')
return input
Expand Down Expand Up @@ -48,8 +50,7 @@ def parse_page(root):
for el in root(".listing a.property_title"):
page_url = "http://www.tripadvisor.fr" + el.get("href")
page = get_url(page_url)
name = strip_tags(page("#HEADING_GROUP h1").html())
name = clean(html.fromstring(name).text)
name = clean(strip_tags(page("#HEADING_GROUP h1").html()))
rating = strip_tags(page(".sprite-rating_rr_fill").attr("alt"))
ranking = strip_tags(page(".slim_ranking").html())
hours = strip_tags(page(".hoursOverlay").html())
Expand Down

0 comments on commit 7410fc9

Please sign in to comment.