From 05d3c4df03ebb882aedb09b66189fd5f44ef061a Mon Sep 17 00:00:00 2001 From: theopolisme Date: Wed, 31 Jul 2013 15:07:15 -0500 Subject: [PATCH] Use the API to get external links, correct UTM regex --- nomoretracking.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nomoretracking.py b/nomoretracking.py index 921a337..2372552 100755 --- a/nomoretracking.py +++ b/nomoretracking.py @@ -17,13 +17,12 @@ # CC-BY-SA Theopolisme -URL = re.compile(r"""((?:\w+:)?\/\/[^<>\[\]\s"]+)""",flags=re.UNICODE|re.DOTALL) -UTM = re.compile(r"""[\?&]utm_.*?=.*?(?=\s|&|$)""",flags=re.UNICODE|re.DOTALL) +UTM = re.compile(r"""[\?&]utm_.*?=.*?(?=\s|&|$|])""",flags=re.UNICODE|re.DOTALL) def process(page): contents = page.edit() contents_compare = contents - links = re.findall(URL,contents) + links = site.api('parse',text=contents,prop="externallinks")['parse']['externallinks'] for link in links: if link.find("utm") != -1: html_doc = requests.get(link).text