import pycurl
import thread
import time
import config as cc
from threading import Thread
success=[] #list of uids with urls for which we need to set last_fetch=now() and next_fetch="2012-12-12"
failure=[] #list of uids with urls for which we need to set last_fetch=now() and is_disabled=1
update=[] #list of uids and urls separated by :::
insert=[] #list of urls with product id to be inserted
invalid_domains=[]
thread_started=[]
#urls=["https://www.acuprice.com/Products/Overview/?id=M005926853:::247:::247"]
urls=["com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586&cvsfe=6&cvsfhu=343131303836&cvsfurl=http%3A%2F%2Fwww.gandermountain.com%2Fmodperl%2Ftrack.cgi%3FGSHOP_411086%2B%2Fmodperl%2Fproduct%2Fdetails.cgi%3Fpdesc%3DSlumberjack_Big_Cot%26i%3D411086%26r%3Dview%26aID%3D505B3\t123"]*200
#urls=open("urls").readlines()
#urls=urls[:10000]
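
# Crawler pops one record off the shared urls list, fetches the url with pycurl,
# appends the response body to <filename>.uss and records the outcome in the
# success/failure/update/insert lists above.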
class Crawler(Thread):
    def __init__(self, id):
        Thread.__init__(self)
        self.id = id

    def run(self):
        # do the work in run() so that start()/join() actually drive the crawl
        self.crawl()

    def crawl(self):
        try:
            thread_started.append("ok")
            try:
                # each entry in urls is tab-separated: domain \t filename(int) \t url \t product_id(int)
                domain, filename, url, product_id = urls.pop().split("\t")
                domain = domain.strip()
                if domain not in invalid_domains:
                    filename=int(filename.strip())
                    url=str(url.strip())
                    product_id=int(product_id.strip())
                    fname = str(filename)+".uss"
                    c = pycurl.Curl()
                    c.fp = open(fname,"a+")
                    c.setopt(pycurl.FOLLOWLOCATION, 1)
                    c.setopt(pycurl.MAXREDIRS, 5)
                    c.setopt(pycurl.CONNECTTIMEOUT, 30)
                    c.setopt(pycurl.TIMEOUT, 300)
                    c.setopt(pycurl.NOSIGNAL, 1)
                    c.setopt(pycurl.URL, url)
                    c.setopt(pycurl.WRITEDATA, c.fp)
                    c.perform()
                    original_url = str(url)
                    final_url = str(c.getinfo(pycurl.EFFECTIVE_URL))
                    c.fp.close()
                    if not c.errstr():
                        if original_url==final_url:
                            success.append(str(filename))
                        else:
                            update.append(str(filename)+":::"+final_url)
                            insert.append(str(original_url)+":::"+str(product_id))
                    else:
                        response_code = str(c.getinfo(pycurl.HTTP_CODE))
                        pattern = str(filename)+":::"+response_code
                        print "failure", pattern
                        failure.append(pattern)
                else:
                    failure.append(str(filename))

            except Exception, ex:
                # filename may not be defined if the pop()/split itself failed
                try:
                    failure.append(str(filename))
                except NameError:
                    pass
                # pycurl.error args are (errcode, errmsg); code 6 = could not resolve host
                if isinstance(ex, pycurl.error) and ex[0]==6:
                    invalid_domains.append(domain)
                print "Error:", ex

            try:
                thread_started.pop()
            except Exception, ex:
                print "Error:", ex
                pass
        except Exception, ex:
            print ex

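# worker loop: repeatedly construct a Crawler, run it to completion,
# and bump the shared cc.id counter from the config module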
def run(pid, *args):
    print "Core thread", pid
    while True:
        t=Crawler(cc.id)
        t.start()
        t.join()
        cc.id+=1

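# spawn 100 OS-level worker threads via the low-level thread module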
x=0
while x<100:
    th="thread no:"+str(x)
    thread.start_new_thread(run,(th,2))
    x+=1


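# wait until the shared urls list has drained below 1000 entries before flushing results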
while len(urls) > 1000:
    time.sleep(10)
print "Got out of the loop", len(urls)

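# drain each result list into its log file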
s=open("success.log","w+")
f=open("failure.log","w+")
i=open("insert.log","w+")
u=open("update.log","a+")
invalid=open("invalid_domains.log","a+")
while len(success)>0:
    s.write(success.pop()+"\n")
s.close()
while len(failure)>0:
    f.write(failure.pop()+"\n")
f.close()
while len(insert)>0:
    i.write(insert.pop()+"\n")
i.close()
while len(update)>0:
    u.write(update.pop()+"\n")
u.close()
while len(invalid_domains)>0:
    invalid.write(invalid_domains.pop()+"\n")
invalid.close()