|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +__author__ = 'Umputun' |
| 4 | + |
| 5 | +import logging |
| 6 | +import sys |
| 7 | +import time |
| 8 | +from datetime import datetime |
| 9 | +from email import utils |
| 10 | +import os |
| 11 | +import socket |
| 12 | +import feedparser |
| 13 | +from plumbum import cli |
| 14 | +import pymongo |
| 15 | + |
| 16 | +from config.config import feeds |
| 17 | +from config.config import settings |
| 18 | + |
| 19 | + |
# Send log records to stdout at INFO level by default; the --dbg switch of each
# subcommand lowers the root level to DEBUG.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s', stream=sys.stdout)

# logging.basicConfig() returns None, so the name is bound to the actual root
# logger instead of capturing that None.
root = logging.getLogger()

# Default socket timeout, in seconds, applied before feeds are fetched.
NET_TIMEOUT = 15
| 22 | + |
| 23 | + |
class App(cli.Application):
    """Feed-master utility"""

    # Top-level command; the "update" and "generate" subcommands attach to it
    # through the App.subcommand decorator.
    PROGNAME = 'feed-master'
    VERSION = '1.1'
| 29 | + |
| 30 | + |
@App.subcommand("update")
class UpdateItems(cli.Application):
    '''Update mongo from sources'''

    # Default MongoDB address ("host:port"); overridden by --mongo / -m.
    mongo_host = "127.0.0.1:27017"

    # Entries published less than this many seconds ago (or dated in the
    # future) are re-stamped with the time they were loaded.
    _FRESH_WINDOW_SEC = 2 * 60 * 60

    @cli.switch("--dbg", help="enable debug")
    def set_log_level(self):
        """Lower the root logger level to DEBUG."""
        logging.root.setLevel(logging.DEBUG)

    @cli.switch(["--mongo", "-m"], str, help="set mongo server")
    def set_mongo(self, mongo_host):
        """Remember the MongoDB address given on the command line."""
        logging.debug("set mongo=%s", mongo_host)
        self.mongo_host = mongo_host

    def _open_collection(self):
        """Connect to MongoDB and return the feed_master.feed collection.

        "host:port" is split on the last colon only; anything else (a bare
        host name, a mongodb:// URI) is handed to MongoClient unchanged.
        """
        host, sep, port = self.mongo_host.rpartition(":")
        if sep and port.isdigit() and "://" not in self.mongo_host:
            client = pymongo.MongoClient(host, int(port))
        else:
            client = pymongo.MongoClient(self.mongo_host)
        return client["feed_master"]["feed"]

    def _store_item(self, db, item):
        """Save one feed entry keyed by its enclosure URL.

        Returns True when a new record was written, False when the entry has
        no usable enclosure or is already stored.
        """
        import calendar

        title = item.get('title', '')
        enclosure = next(
            (link for link in item.get('links', []) if link.get('rel') == 'enclosure'),
            None)
        if enclosure is None or not enclosure.get('href'):
            # An entry without an enclosure has no key to store it under;
            # skip it instead of aborting the whole run.
            logging.warning("no enclosure, skip %s", title)
            return False

        href = enclosure['href']
        if db.find_one({"_id": href}):
            logging.debug("already here, skip %s", href)
            return False

        now = datetime.now()
        parsed = item.get('published_parsed')
        if parsed:
            # feedparser documents *_parsed dates as UTC struct_time values;
            # timegm() converts them correctly, whereas mktime() would treat
            # them as local time and skew the comparison with now().
            pub_dtime = datetime.fromtimestamp(calendar.timegm(parsed))
        else:
            pub_dtime = now

        if (now - pub_dtime).total_seconds() < self._FRESH_WINDOW_SEC:
            # for items in the future or close enough to now - reset timestamp
            logging.debug("timestamp adjusted to now for %s - %s", title, pub_dtime)
            pub_dtime = now

        mrec = {"_id": href, 'enclosure': enclosure, 'title': title,
                'description': item.get('description', ''), 'published': pub_dtime}
        # NOTE(review): Collection.save() appears to be deprecated/removed in
        # newer PyMongo releases - confirm against the deployed version.
        db.save(mrec)
        logging.info("new item %s %s", title, pub_dtime)
        return True

    def main(self):
        """Fetch every configured feed and store entries not seen before."""
        logging.info("feed loading initiated")
        socket.setdefaulttimeout(NET_TIMEOUT)

        db = self._open_collection()
        db.create_index([("published", -1)])

        new_items = 0
        for (name, feed) in feeds:
            logging.debug("loading %s - %s", name, feed)
            d = feedparser.parse(feed)
            for item in d['entries'][:settings['max_items_per_feed']]:
                if self._store_item(db, item):
                    new_items += 1
        logging.info("feed loading completed, new items=%d", new_items)
| 82 | + |
| 83 | + |
def format_datetime_rfc2822(dt):
    """Format a naive datetime as an RFC 2822 date string.

    The datetime is read as local time (time.mktime) and the resulting
    string is expressed in UTC with a "-0000" zone suffix.
    """
    epoch_seconds = time.mktime(dt.timetuple())
    return utils.formatdate(epoch_seconds)
| 86 | + |
| 87 | + |
@App.subcommand("generate")
class GenerateFeed(cli.Application):
    '''Generate RSS feed'''

    # Defaults; overridden by --mongo / -m and --file / -f.
    mongo_host = "127.0.0.1:27017"
    feed_file = "feed.xml"

    @cli.switch("--dbg", help="enable debug")
    def set_log_level(self):
        """Lower the root logger level to DEBUG."""
        logging.root.setLevel(logging.DEBUG)

    @cli.switch(["--mongo", "-m"], str, help="set mongo server")
    def set_mongo(self, mongo_host):
        """Remember the MongoDB address given on the command line."""
        logging.debug("set mongo=%s", mongo_host)
        self.mongo_host = mongo_host

    @cli.switch(["--file", "-f"], str, help="set feed file")
    def set_feed_file(self, feed_file):
        """Remember the output path given on the command line."""
        logging.info("set feed file=%s", feed_file)
        self.feed_file = feed_file

    def _open_collection(self):
        """Connect to MongoDB and return the feed_master.feed collection.

        "host:port" is split on the last colon only; anything else (a bare
        host name, a mongodb:// URI) is handed to MongoClient unchanged.
        """
        host, sep, port = self.mongo_host.rpartition(":")
        if sep and port.isdigit() and "://" not in self.mongo_host:
            client = pymongo.MongoClient(host, int(port))
        else:
            client = pymongo.MongoClient(self.mongo_host)
        return client["feed_master"]["feed"]

    @staticmethod
    def _text(value):
        """Return value as unicode text; byte strings are decoded as UTF-8."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return u"%s" % (value,)

    @classmethod
    def _element(cls, tag, value):
        """Return one '<tag>value</tag>' line with the value XML-escaped."""
        from xml.sax.saxutils import escape

        return u"<%s>%s</%s>\n" % (tag, escape(cls._text(value)), tag)

    @classmethod
    def _render_item(cls, item):
        """Return the complete <item> element for one stored record.

        Raises (e.g. KeyError) when the record lacks a required field, so the
        caller can drop the item before anything is written for it.
        """
        from xml.sax.saxutils import quoteattr

        enclosure = item['enclosure']
        return u"".join([
            u"<item>\n",
            cls._element("title", item['title']),
            cls._element("description", item['description']),
            cls._element("link", item['_id']),
            cls._element("pubDate", format_datetime_rfc2822(item['published'])),
            cls._element("guid", item['_id']),
            # quoteattr() escapes the value and adds the surrounding quotes.
            u'<enclosure length=%s type="audio/mpeg" url=%s/>\n' % (
                quoteattr(cls._text(enclosure['length'])),
                quoteattr(cls._text(enclosure['href']))),
            u"</item>\n",
        ])

    def main(self):
        """Write the newest stored items as an RSS 2.0 file.

        Output goes to '<feed_file>.tmp' first and is renamed over the target
        once complete.
        """
        import io

        logging.info("feed generation initiated")

        db = self._open_collection()
        items = list(db.find().sort("published", -1).limit(settings['max_items_total']))
        # The newest item dates the channel; an empty collection falls back to
        # the current time instead of failing on a missing record.
        last_date = format_datetime_rfc2822(items[0]['published'] if items else datetime.now())

        rss_header = (
            u'\n<rss xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" '
            u'xmlns:content="http://purl.org/rss/1.0/modules/content/"\n'
            u'xmlns:media="http://search.yahoo.com/mrss/" '
            u'xmlns:yt="http://gdata.youtube.com/schemas/2007"\n'
            u'xmlns:atom="http://www.w3.org/2005/Atom" '
            u'xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">\n'
            u'\n<channel>\n\n'
        )

        total_items = 0
        tmp_name = self.feed_file + ".tmp"
        # Text is encoded once, at the file boundary, so every write below
        # deals in unicode only.
        with io.open(tmp_name, "w", encoding="utf-8") as out:
            out.write(rss_header)
            out.write(self._element("title", settings['info']['title']))
            out.write(self._element("description", settings['info']['description']))
            out.write(self._element("link", settings['info']['link']))
            out.write(self._element("pubDate", last_date))
            out.write(self._element("language", settings['language']))
            out.write(u"<generator>feed-master by Umputun</generator>\n")

            for item in items:
                # Render the whole item before writing, so a bad record is
                # dropped cleanly and never leaves a half-written <item>.
                try:
                    entry = self._render_item(item)
                except Exception as e:
                    logging.warning("failed to write %s, error=%s", item, e)
                    continue
                out.write(entry)
                total_items += 1

            out.write(u'</channel>\n</rss>\n')

        os.rename(tmp_name, self.feed_file)
        logging.info("feed generation completed, total feeds=%d, total items=%d", len(feeds), total_items)
| 157 | + |
| 158 | + |
# Script entry point: hand the command line to the top-level application,
# which dispatches to the registered subcommands.
if __name__ == '__main__':
    App.run()
0 commit comments