In [1]:
import os,requests,re,json,time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from mako.template import Template
# Point both proxy environment variables at the local proxy on port 7890.
# NOTE(review): requests reads these variables by default, so every request
# made below is expected to go through that proxy - confirm it is running.
os.environ.update({
    'https_proxy': 'http://127.0.0.1:7890',
    'http_proxy': 'http://127.0.0.1:7890',
})

def fetchCover(link,folderName):
    """Download an issue cover ("Issue Package") page and save a local copy.

    The page at ``link`` is fetched and reduced to its title block and its
    article container. Every anchor inside the container is rewritten to point
    at a sibling file named after the last path segment of its original target,
    and the result is written to
    ``folderName/html/<last path segment of link>.html``.

    Args:
        link: URL of the cover page.
        folderName: Output folder; its ``html`` sub-folder must already exist.

    Returns:
        None. Prints ``<link>: No Content`` and writes nothing when the page
        has no title block or no article container.
    """
    r = requests.get(link)
    doc = BeautifulSoup(r.content)
    title = doc.find(class_="row article-header--metadata-title")
    post = doc.find(class_="container article-container")
    # Same guard as fetchArticle: pages without the expected markup are
    # reported and skipped instead of crashing on None.
    if not title or not post:
        print(link+': No Content')
        return
    heading = title.find(class_="f-serif ls-0 article-title pt-2")
    titlename = heading.text if heading else ''
    for anchor in post.findAll("a"):
        # Anchors without an href (named anchors, script-driven links) have
        # nothing to rewrite; indexing attrs['href'] on them would raise KeyError.
        href = anchor.attrs.get('href')
        if href:
            anchor.attrs['href'] = './'+href.split('/')[-1]+'.html'
        anchor.attrs['target'] = ''
    html = '<html lang="en"><meta name="viewport" content="width=device-width, initial-scale=1" /><head><link rel="stylesheet" href="../../init.css"><title>'+titlename+'</title></head><body>'+str(title)+str(post)+'</body></html>'
    with open(folderName+'/html/'+link.split('/')[-1]+'.html','w',encoding='utf8') as f:
        f.write(html)
    return

def fetchArticle(link,folderName):
    """Download one article page and save a local copy with local images.

    The page at ``link`` is fetched and reduced to its cover-image block, its
    title block and its body. Images referenced through a ``data-src``
    attribute are downloaded into ``folderName/image`` and their tags are
    rewritten to point at the local copies. The result is written to
    ``folderName/html/<last path segment of link>.html``.

    Args:
        link: URL of the article page.
        folderName: Output folder; its ``html`` and ``image`` sub-folders must
            already exist.

    Returns:
        None. Prints ``<link>: No Content`` and writes nothing when the page
        has no title block or no article body.
    """
    from urllib.parse import urljoin

    def localName(url):
        # Last path segment without the query string; '%' is removed so that
        # percent-escapes do not end up in file names or relative links.
        return url.split('/')[-1].split('?')[0].replace('%','')

    r = requests.get(link)
    doc = BeautifulSoup(r.content)
    coverImage = doc.find(class_="article-inline-img-block")
    title = doc.find(class_="row article-header--metadata-title")
    post = doc.find(class_="article-content-offset")
    if not title or not post:
        print(link+': No Content')
        return
    heading = title.find(class_="f-serif ls-0 article-title pt-2")
    titlename = heading.text if heading else ''

    # Fallback image that is stored when the article has no usable cover.
    coverImageURL = 'https://www.foreignaffairs.com/themes/fa/assets/images/council-on-FA-lockup.png'
    coverTag = coverImage.find("img") if coverImage else None
    # `'data-src' in tag` would test the tag's children, not its attributes,
    # so the attribute has to be checked with has_attr().
    if coverTag is not None and coverTag.has_attr('data-src'):
        coverImageURL = coverTag.attrs['data-src']
        localCover = '../image/' + localName(coverImageURL)
        coverTag.attrs['data-src'] = localCover
        coverTag.attrs['src'] = localCover
    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the article URL.
    img = requests.get(urljoin(link, coverImageURL)).content
    with open(folderName+'/image/' + localName(coverImageURL),'wb') as f:
        f.write(img)

    for tag in post.findAll("img"):
        if not tag.has_attr('data-src'):
            continue
        url = tag.attrs['data-src']
        # Skip tags that already point at the local image folder (e.g. the
        # cover image rewritten above, if it is part of the body).
        if './image' in url:
            continue
        img = requests.get(urljoin(link, url)).content
        with open(folderName+'/image/'+localName(url),'wb') as f:
            f.write(img)
        tag.attrs['src'] = '../image/'+localName(url)
        tag.attrs['data-src'] = '../image/'+localName(url)

    # Without a cover block, emit nothing rather than the literal text "None".
    coverHTML = str(coverImage) if coverImage else ''
    html = '<html lang="en"><meta name="viewport" content="width=device-width, initial-scale=1" /><head><link rel="stylesheet" href="../../init.css"><title>'+titlename+'</title></head><body>'+coverHTML+str(title)+str(post)+'</body></html>'
    with open(folderName+'/html/'+link.split('/')[-1]+'.html','w',encoding='utf8') as f:
        f.write(html)
    return

def fetchDF(year,id):
    """Return the list of entries of one issue as a DataFrame.

    The issue page is fetched to read the issue's node id out of the JSON
    settings embedded in the page; that id is then used to query the site's
    search endpoint for every entry belonging to the issue (at most 100).

    Args:
        year: Publication year of the issue (int).
        id: Number of the issue within that year, as used in the issue URL.

    Returns:
        pandas.DataFrame with one row per entry and the columns ``nid``,
        ``fa_node_type_or_subtype``, ``title``, ``field_subtitle``,
        ``field_display_authors``, ``path`` and
        ``fa_node_primary_image_url__mobile_2x``. Fields missing from an entry
        are NaN.
    """
    columns = ['nid','fa_node_type_or_subtype','title','field_subtitle','field_display_authors','path','fa_node_primary_image_url__mobile_2x']

    def flatten(value):
        # Field values may arrive wrapped in lists (presumably one element
        # each - TODO confirm). Unwrap them structurally instead of deleting
        # '[' and ']' from the serialised JSON, which also mangled text that
        # contains brackets and broke on empty or multi-element lists.
        if not isinstance(value, list):
            return value
        parts = [p for p in (flatten(v) for v in value) if p is not None]
        if not parts:
            return None
        if len(parts) == 1:
            return parts[0]
        return ', '.join(str(p) for p in parts)

    # The second URL segment is year-1921 (presumably the volume number - TODO confirm).
    url = 'https://www.foreignaffairs.com/issues/{}/{}/{}'.format(year,year-1921,id)
    r = requests.get(url)
    doc = BeautifulSoup(r.content)
    issue = json.loads(doc.find('script', type='application/json').string)
    node = issue['path']['currentPath'].split('/')[-1]

    url = 'https://www.foreignaffairs.com/fa-search.php'
    query = {"query":{"match_all":{}},"from":0,"size":100,"_source":{"includes":columns},"post_filter":{"bool":{"must":[{"term":{"field_issue__nid":node}}]}},"sort":[{"field_sequence":"asc"},{"fa_normalized_date":"desc"}]}
    r = requests.post(url,data=json.dumps(query))
    art = json.loads(r.content.decode('utf8'))['hits']['hits']

    # Build the frame once from a list of records: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [{key: flatten(value) for key, value in hit['_source'].items()} for hit in art]
    df_fa = pd.DataFrame(records, columns=columns)
    return df_fa

def fetchFA(year,id,delay=5):
    """Mirror one issue locally and write its index page.

    Every entry of the issue except those of type ``Capsule Review`` is
    downloaded into ``<year>-<id>/html`` (via fetchCover for ``Issue Package``
    entries, fetchArticle for everything else), its listing image is stored in
    ``<year>-<id>/image``, and an index page linking all of them is written to
    ``<year>-<id>.html`` in the current working directory.

    Args:
        year: Publication year of the issue (int).
        id: Number of the issue within that year.
        delay: Seconds to wait after each entry before requesting the next.

    Returns:
        None.
    """
    def blankIfMissing(value):
        # Missing fields are NaN/None in the frame; render them as empty text
        # rather than the string "nan".
        return '' if pd.isna(value) else value

    df_fa = fetchDF(year,id)

    folderName = '{}-{}'.format(year,id)
    # exist_ok also completes a folder tree left half-built by an interrupted run.
    for sub in ('html','image'):
        os.makedirs(folderName+'/'+sub, exist_ok=True)

    comment_li = []
    df_comment = df_fa.loc[df_fa['fa_node_type_or_subtype']!='Capsule Review']
    for row in df_comment.to_dict('records'):
        kind = blankIfMissing(row['fa_node_type_or_subtype'])
        title = blankIfMissing(row['title'])
        desc = blankIfMissing(row['field_subtitle'])
        author = blankIfMissing(row['field_display_authors'])
        path = folderName+'/html/'+row['path'].split('/')[-1]+'.html'
        if kind != 'Issue Package':
            fetchArticle('https://www.foreignaffairs.com/'+row['path'],folderName)
        else:
            fetchCover('https://www.foreignaffairs.com/'+row['path'],folderName)
        imgURL = row['fa_node_primary_image_url__mobile_2x']
        if pd.isna(imgURL):
            imgPath = ''
        else:
            img = requests.get(imgURL).content
            imgPath = folderName+'/image/'+imgURL.split('/')[-1].split('?')[0].replace('%','')
            with open(imgPath,'wb') as f:
                f.write(img)
        comment_li.append((kind,title,desc,author,path,imgPath))
        print('Fetching: {}'.format(title))
        time.sleep(delay)
    cover_title = '{}/{}'.format(year,id)

    # Attribute values are quoted so that paths containing spaces stay intact,
    # and the <img> is left out entirely when an entry has no listing image.
    HTML = Template("""<!DOCTYPE html>
    <html>
    <head>
      <meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no" name=viewport>
      <meta charset=utf-8>
      <link rel="stylesheet" href="../index.css">
      <title>${cover_title}</title>
    </head>

    <body>
    <div class="magazine-list">
      <h2 class="cover-story">${cover_title}</h2>
      <div class="magazine-article-container">

      %for kind,title,desc,author,path,imgPath in comment_li:
        <a class="article-card" href="${path}">
          <div class="article-image">
          %if imgPath:
            <img class="col-4" src="${imgPath}">
          %endif
          </div>
          <div class="article">
            <h3 class="article-title">${kind}: ${title}</h3>
            <p class="article-desc">${desc}</p>
            <h4 class="article-author">${author}</h4>
          </div>
        </a>
      %endfor
      </div>
    </div>
    </body>

    </html>
    """)

    with open(folderName+'.html','w',encoding='utf8') as f:
        f.write(HTML.render(cover_title=cover_title,comment_li=comment_li))

In [100]:
# Batch download: fetch issue 5 of every year from 2010 through 2020,
# pausing between issues.
ISSUE_ID = 5
PAUSE_BETWEEN_ISSUES = 10  # seconds
for year in range(2010, 2021):
    fetchFA(year, ISSUE_ID)
    time.sleep(PAUSE_BETWEEN_ISSUES)

Fetching: Out of Order
Fetching: Smaller and Safer
Fetching: Beyond Moderates and Militants
Fetching: Bringing Israel's Bomb Out of the Basement 
Fetching: How to Handle Hamas
Fetching: Staying Power
Fetching: Russia's New Nobility
Fetching: Defending a New Domain
Fetching: Not Ready for Prime Time
Fetching: An Unlikely Trio
Fetching: Hydraulic Pressures
Fetching: Interdependency Theory
Fetching: Send in the Civilians
Fetching: Law for the Global Poor
Fetching: A NATO Red Carpet for Moscow
Fetching: Islamism, Unveiled
Fetching: Will Oil Drown the Arab Spring?
Fetching: Europe's Palestine Problem
Fetching: A New Kind of Korea
Fetching: Al Qaeda’s Challenge
Fetching: September 11 in Retrospect
Fetching: Leaving Afghanistan to the Afghans
Fetching: Afghanistan's Ethnic Puzzle
Fetching: The Inevitable Superpower
Fetching: The Middling Kingdom
Fetching: Surgical Strikes in the Drug Wars
Fetching: Palestine Goes to the UN
Fetching: The Unbreakable Muslim Brotherhood
Fetching: Commanding Demo

In [64]:
# Load the entry list of issue 2 of 2009 for interactive inspection.
df_fa = fetchDF(year=2009, id=2)

In [58]:
# Re-fetch a single article into the 2012-6 folder.
fetchArticle(
    link='https://www.foreignaffairs.com/articles/libya/2012-09-16/after-qaddafi',
    folderName='2012-6',
)

https://www.foreignaffairs.com/articles/libya/2012-09-16/after-qaddafiNo Content


In [6]:
# Build the top-level archive page: one block per year with a link to each of
# the year's six issue index pages (archive/<year>-<issue>.html).
# Each pair is (issue number used in the file name, link text shown to the reader).
ISSUE_LABELS = [(1,'1-2'),(2,'3-4'),(3,'5-6'),(4,'7-8'),(5,'9-10'),(6,'11-12')]

HTML = Template("""<!DOCTYPE html>
<html>
<head>
<meta content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no" name=viewport>
<meta charset=utf-8>
<link rel="stylesheet" href="./index.css">
<title>Archive</title>
</head>

<body>
    <div class="magazine-list">
        <h2 class="cover-story">Archive</h2>
        <div class="magazine-article-container">
            %for year in year_list:
            <div class="article">
                <h3 class="article-title">${year}</h3>
                %for issue, label in issue_labels:
                <a class="article-author" href="archive/${year}-${issue}.html">${label}</a>
                %endfor
            </div>
            %endfor
        </div>
    </div>
</body>

</html>
""")
year_list = list(range(2000,2021))
with open('../archive.html','w',encoding='utf8') as f:
    f.write(HTML.render(year_list=year_list,issue_labels=ISSUE_LABELS))

[2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]