Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9d43a1b
commit b493ff5
Showing
14 changed files
with
357 additions
and
1 deletion.
There are no files selected for viewing
Empty file.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import pycurl
import thread
import time
import config as cc            # cc.id: shared counter used to label Crawler instances
from threading import Thread

# Shared, unsynchronized work/result lists; drained into *.log files at the
# bottom of this script.
success=[]          # uids whose URL fetched OK: set last_fetch=now(), next_fetch="2012-12-12"
failure=[]          # uids whose URL failed: set last_fetch=now(), is_disabled=1
update=[]           # "uid:::final_url" entries (fields separated by ":::") for redirected URLs
insert=[]           # "url:::product_id" entries to be inserted
invalid_domains=[]  # domains that could not be resolved; later jobs for them are skipped
thread_started=[]   # one "ok" token per crawl currently in flight (best-effort counter)
#urls=["https://www.acuprice.com/Products/Overview/?id=M005926853:::247:::247"]
# Work queue; each entry is tab-separated: domain \t filename \t url \t product_id
urls=["com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586&cvsfe=6&cvsfhu=343131303836&cvsfurl=http%3A%2F%2Fwww.gandermountain.com%2Fmodperl%2Ftrack.cgi%3FGSHOP_411086%2B%2Fmodperl%2Fproduct%2Fdetails.cgi%3Fpdesc%3DSlumberjack_Big_Cot%26i%3D411086%26r%3Dview%26aID%3D505B3\t123"]
#urls=open("urls").readlines()
#urls=urls[:10000]
class Crawler(Thread): | ||
def __init__(self, id): | ||
#print id | ||
Thread.__init__(self) | ||
self.crawl() | ||
|
||
def crawl(self): | ||
try: | ||
thread_started.append("ok") | ||
try: | ||
#required urls(list) format: url(str/split):::filename(int):::product_id(int) | ||
domain, filename, url, product_id = urls.pop().split("\t") | ||
domain = domain.strip() | ||
if domain not in invalid_domains: | ||
filename=int(filename.strip()) | ||
url=str(url.strip()) | ||
product_id=int(product_id.strip()) | ||
fname = str(filename)+".uss" | ||
c = pycurl.Curl() | ||
c.fp = open(fname,"a+") | ||
c.setopt(pycurl.FOLLOWLOCATION, 1) | ||
c.setopt(pycurl.MAXREDIRS, 5) | ||
c.setopt(pycurl.CONNECTTIMEOUT, 30) | ||
c.setopt(pycurl.TIMEOUT, 300) | ||
c.setopt(pycurl.NOSIGNAL, 1) | ||
c.setopt(pycurl.URL, url) | ||
c.setopt(pycurl.WRITEDATA, c.fp) | ||
c.perform() | ||
original_url = str(url) | ||
final_url = str(c.getinfo(pycurl.EFFECTIVE_URL)) | ||
c.fp.close() | ||
if not c.errstr(): | ||
if(original_url==final_url): | ||
success.append(str(filename)) | ||
else: | ||
update.append(str(filename)+":::"+final_url) | ||
insert.append(str(original_url)+":::"+str(product_id)) | ||
else: | ||
print "oye" | ||
response_code = str(c.getinfo(pycurl.HTTP_CODE)) | ||
pattern = filename+":::"+response_code+chr(10) | ||
print "failure", pattern | ||
failure.append(pattern) | ||
else: | ||
failure.append(str(filename)+chr(10)) | ||
|
||
except Exception, ex: | ||
failure.append(str(filename)+chr(10)) | ||
print "===", int(ex[0]) | ||
if(ex[0]==6): | ||
invalid_domains.append(domain) | ||
print "Eerror:", ex | ||
pass | ||
|
||
try: | ||
thread_started.pop() | ||
except Exception, ex: | ||
print "Error:", ex | ||
pass | ||
except Exception, ex: | ||
print ex | ||
|
||
def run(pid, *args): | ||
print "Core thread", pid | ||
while True: | ||
t=Crawler(cc.id) | ||
t.start() | ||
t.join() | ||
cc.id+=1 | ||
|
||
# Spawn 100 dispatcher threads; each runs run() and feeds Crawler workers
# from the shared `urls` queue.  (The extra argument 2 is ignored by run().)
x=0
while x<100:
    th="thread no:"+str(x)
    thread.start_new_thread(run,(th,2))
    x+=1


# Wait until the queue is (almost) drained.
# NOTE(review): this proceeds while up to 1000 jobs are still queued and
# in-flight fetches are unfinished, so the logs below may be written too
# early -- confirm the 1000 threshold is intentional.
while len(urls) > 1000:
    time.sleep(10)
    pass
print "O got out of the loop", len(urls)

# Drain each shared result list into its log file, one entry per line.
s=open("success.log","w+")
f=open("failure.log","w+")
i=open("insert.log","w+")
u=open("update.log","a+")              # append mode, unlike the others
invalid= open("invalid_domains.log","a+")
while len(success)>0:
    s.write(success.pop()+chr(10))
s.close()
while len(failure)>0:
    f.write(failure.pop()+chr(10))
f.close()
while len(insert)>0:
    i.write(insert.pop()+chr(10))
i.close()
while len(update)>0:
    u.write(update.pop()+chr(10))
u.close()
while len(invalid_domains)>0:
    invalid.write(invalid_domains.pop()+chr(10))
invalid.close()
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
com.singlefeed.reporting\t111\thttp://reporting.singlefeed.com/r/?cvsfa=2586&cvsfe=6&cvsfhu=343131303836&cvsfurl=http%3A%2F%2Fwww.gandermountain.com%2Fmodperl%2Ftrack.cgi%3FGSHOP_411086%2B%2Fmodperl%2Fproduct%2Fdetails.cgi%3Fpdesc%3DSlumberjack_Big_Cot%26i%3D411086%26r%3Dview%26aID%3D505B3\t123 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import socket | ||
from urlparse import urlparse | ||
|
||
class abc: | ||
def __init__(self): | ||
urls = ['http://docs.python.org/release/3.1.3/library/urllib.parse.html'] | ||
self.run(urls) | ||
|
||
def create_socket(self): | ||
return socket.socket(socket.AF_INET, socket.SOCK_STREAM) | ||
|
||
def remove_socket(self, sock): | ||
sock.close() | ||
del sock | ||
|
||
def run(self, urls): | ||
for url in urls: | ||
scheme, self.host, path, params, query, fragment = urlparse(url) | ||
self.uri = url[url.find(self.host)+len(self.host):] | ||
self.send_request(self.create_socket()) | ||
|
||
def send_request(self, sock): | ||
l = '' | ||
print self.uri, self.host | ||
line1 = "GET %s HTTP/1.1"%(self.uri) | ||
line2 = "Host: %s"%(self.host) | ||
line3 = "Connection: close" | ||
for line in (line1, line2, line3): | ||
print "--", line | ||
l+= (line + "\r\n") | ||
sock.send(l) | ||
sock.send("\r\n") | ||
|
||
# Run the client immediately at import/execution time.
a = abc()


# Dead scratch code from an earlier experiment (connects to en.wikipedia.org
# and dumps the response), kept as an unused module-level string literal.
'''
sock = create_socket()
print "Connecting"
sock.connect( ('en.wikipedia.org', 80) )
print "Sending Request"
import socket
sock = socket.socket()
sock.connect(('en.wikipedia.org', 80))
for line in (
    "GET /wiki/List_of_HTTP_header_fields HTTP/1.1",
    "Host: en.wikipedia.org",
    "Connection: close",
):
    sock.send(line + "\r\n")
sock.send("\r\n")
while True:
    content = sock.recv(1024)
    if content:
        print content
    else:
        break'''
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import socket | ||
from urlparse import urlparse | ||
|
||
class abc: | ||
def __init__(self): | ||
urls = ['http://docs.python.org/release/3.1.3/library/urllib.parse.html'] | ||
self.run(urls) | ||
|
||
def create_socket(self): | ||
return socket.socket(socket.AF_INET, socket.SOCK_STREAM) | ||
|
||
def remove_socket(self, sock): | ||
sock.close() | ||
del sock | ||
|
||
def run(self, urls): | ||
for url in urls: | ||
scheme, self.host, path, params, query, fragment = urlparse(url) | ||
self.uri = url[url.find(self.host)+len(self.host):] | ||
self.send_request(self.create_socket()) | ||
|
||
def send_request(self, sock): | ||
l = [] | ||
print self.uri, self.host | ||
line1 = "GET %s HTTP/1.1"%(self.uri) | ||
line2 = "Host: %s"%(self.host) | ||
line3 = "Connection: close" | ||
for line in (line1, line2, line3): | ||
print "--", line | ||
l+= (line + "\r\n") | ||
sock.send(l) | ||
sock.send("\r\n") | ||
|
||
# Run the client immediately at import/execution time.
a = abc()


# Dead scratch code from an earlier experiment (connects to en.wikipedia.org
# and dumps the response), kept as an unused module-level string literal.
'''
sock = create_socket()
print "Connecting"
sock.connect( ('en.wikipedia.org', 80) )
print "Sending Request"

import socket
sock = socket.socket()
sock.connect(('en.wikipedia.org', 80))

for line in (
    "GET /wiki/List_of_HTTP_header_fields HTTP/1.1",
    "Host: en.wikipedia.org",
    "Connection: close",
):
    sock.send(line + "\r\n")
sock.send("\r\n")

while True:
    content = sock.recv(1024)
    if content:
        print content
    else:
        break'''