Merge branch 'master' of https://github.com/unkn0w7n/calibre

unkn0w7n · Feb 2, 2024 · eebec37 · eebec37
2 parents c48d070 + 26f78ac
commit eebec37
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 28 deletions.
diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe
@@ -167,6 +167,8 @@ class NatGeo(BasicNewsRecipe):
         for article in soup.findAll('article'):
             a = article.find('a')
             url = a['href']
+            if url.startswith('/'):
+                url = 'https://www.nationalgeographic.com' + url
             section = self.tag_to_string(article.find(**classes('SectionLabel')))
             if section.startswith('Paid Content'):
                 continue

diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe
@@ -146,6 +146,8 @@ class NatGeo(BasicNewsRecipe):
         for article in soup.findAll('article'):
             a = article.find('a')
             url = a['href']
+            if url.startswith('/'):
+                url = 'https://www.nationalgeographic.com' + url
             title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated')))
             ans.append({'title': title, 'url': url})
             self.log(title, '  ', url)

diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe
@@ -156,12 +156,14 @@ class NatGeo(BasicNewsRecipe):
             title = self.tag_to_string(photoart)
             url = photoart['href']
             if url.startswith('/'):
-                url = 'https://www.nationalgeographic.com' + photoart['href']
+                url = 'https://www.nationalgeographic.com' + url
             ans2.append(('Photo Essay', [{'title': title, 'url': url}]))
         for gird in soup.findAll(attrs={'class':'GridPromoTile'}):
             for article in soup.findAll('article'):
                 a = article.find('a')
                 url = a['href']
+                if url.startswith('/'):
+                    url = 'https://www.nationalgeographic.com' + url
                 if '/graphics/' in url:
                     continue
                 section = self.tag_to_string(article.find(**classes('SectionLabel')))

diff --git a/recipes/science_news.recipe b/recipes/science_news.recipe
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 sciencenews.org
 '''
 
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
 import datetime
 import re
 
@@ -16,13 +16,13 @@ class ScienceNewsIssue(BasicNewsRecipe):
                    " in all fields of science. This recipe downloads all the articles from the latest issue.")
     category = u'Science, Technology, News'
     publisher = u'Society for Science & the Public'
-    oldest_article = 14
     language = 'en'
-    max_articles_per_feed = 50
     no_stylesheets = True
     use_embedded_content = False
-    timefmt = ' [%A, %d %B, %Y]'
     auto_cleanup = False
+    remove_attributes = ['height', 'width', 'style']
+    ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True
 
     keep_only_tags = [
         dict(
@@ -37,20 +37,23 @@ class ScienceNewsIssue(BasicNewsRecipe):
         )
     ]
     remove_tags = [
+        dict(name=['svg', 'button']),
         dict(
             attrs={'class': lambda x: x and ('newsletter-signup__wrapper___' in x)}
         )
     ]
 
     def parse_index(self):
 
-        # Get URL of latest mag page
-        ld = self._get_mag_date()
-        url = f"https://www.sciencenews.org/sn-magazine/{ld:%B}-{ld.day}-{ld.year}"
-        url = url.lower()
+        index = self.index_to_soup('https://www.sciencenews.org/sn-magazine')
+        a = index.find(**prefixed_classes('magazine-archive__issue-thumbnail___'))
+        url = a['href']
+        self.timefmt = ' [' + url.split('/')[-1] + ']'
+        self.cover_url = a.img['src']
 
         # Get articles
         soup = self.index_to_soup(url)
+        soup = soup.find('main', attrs={'id':'content'})
         re_article = re.compile("https://www.sciencenews.org/article/")
         stories = []
         past_urls = set()
@@ -68,6 +71,7 @@ class ScienceNewsIssue(BasicNewsRecipe):
                 continue
 
             past_urls.add(article_url)
+            self.log('\t', article_title, ' ', article_url)
             article_info = {
                 "url": article_url,
                 "title": article_title,
@@ -78,22 +82,3 @@ class ScienceNewsIssue(BasicNewsRecipe):
             ("Articles", stories),
         ]
         return index
-
-    def _get_mag_date(self):
-        """Return date of latest magazine issue.
-        It is published every 2 weeks."""
-
-        d = datetime.date(2022, 6, 18)
-        t = datetime.date.today()
-        ld = None
-        while d <= t:
-            ld = d
-            d += datetime.timedelta(days=14)
-        return ld
-
-    def get_cover_url(self):
-        ld = self._get_mag_date()
-        url = ld.strftime(
-            "https://www.sciencenews.org/wp-content/uploads/%Y/%m/%m%d%y_cover.jpg"
-        )
-        return url