In [1]:
import json
import pickle
import PyPDF2
import html2text
import random
import requests
import wikipediaapi
import pandas as pd
from io import BytesIO
from tqdm import tqdm
import idna
from bs4 import BeautifulSoup, Comment
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional, List, Dict, Tuple, Union, Any, Callable
from urllib.parse import urlparse, quote
import socket
import concurrent

from IPython.display import display, HTML, Markdown

pd.set_option('display.precision', 5)

In [2]:
# load all questions
path = "../benchmark/data/autocast/autocast_questions.json"
df = pd.read_json(path)
print(df.shape)

# filter out non-true/false questions
df = df[df["qtype"] == "t/f"].reset_index(drop=True)
print(df.shape)

# make sure answers is not None
df = df[df["answer"].notnull()].reset_index(drop=True)
print(df.shape)

# make sure source_links is not []
df = df[df["source_links"].map(len) > 0].reset_index(drop=True)
print(df.shape)

(6532, 14)
(3225, 14)
(2003, 14)
(1403, 14)


In [3]:
# number of links per question
df['num_links'] = df['source_links'].apply(lambda x: len(x))

In [4]:
all_questions = [q for questions in df["source_links"] for q in questions if q not in ["", None]]
all_questions = list(set(all_questions))
len(all_questions)

47357

### Analyse the urls

In [None]:
# count number of pdfs in all questions
pdf_count = 0
for q in all_questions:
    if q.lower().endswith(".pdf"):
        pdf_count += 1

print(f"Number of PDFs: {pdf_count}")

In [None]:
# count number of wikipedia in all questions
wiki_count = 0
for q in all_questions:
    if "wikipedia" in q.lower():
        wiki_count += 1

print(f"Number of Wikipedia: {wiki_count}")

In [None]:
from collections import Counter

domains = Counter()
for q in all_questions:
    parts = q.split("/")
    # Ensure there are enough parts in the URL to extract the domain
    if len(parts) > 2:
        domain = parts[2]
        if domain not in domains:
            domains[domain] = 0
        domains[domain] += 1

In [None]:
# print top 30 domains with counts
domains.most_common(30)

### Extractions

In [5]:
def get_hostname(url: str) -> str:
    """Extracts and validates the hostname from a URL."""
    if not url:
        return False

    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            return False

        hostname = parsed_url.hostname
        if not hostname:
            return False

        # Check for consecutive periods in the hostname
        if '..' in hostname:
            return False

        # IDNA encoding for internationalized domain names
        hostname = idna.encode(hostname).decode('ascii')

        if len(hostname) > 255:
            return False

        return True

    except Exception as e:
        return False

    

def extract_text(html: str, num_words: Optional[int] = None) -> str:
    """Extract text from a single HTML document using html2text, removing scripts, nav, header, and footer."""
    soup = BeautifulSoup(html, "html.parser")

    # Remove unwanted elements
    for element in soup([
        "script", "style", "nav", 
        "header", "footer", "form", 
        "iframe", ".navbar", ".menu", 
        ".breadcrumb", ".pagination", ".nav",
        ".ad", ".sidebar", ".popup", ".modal",
        ".social-icons", ".hamburger-menu",
    ]):
        element.decompose()

    # Remove comments
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Convert to text using html2text
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.ignore_emphasis = True
    text = h.handle(str(soup))

    # Trim to a specific number of words if required
    if num_words:
        words = text.split()[:num_words]
        return ' '.join(words)

    return text


def extract_from_wikipedia(url: str, num_words: Optional[int] = None) -> Dict[str, Any]:
    """Extract full text from a Wikipedia page using the wikipedia-api wrapper."""

    wiki = wikipediaapi.Wikipedia(user_agent='MyWikiExtractor/1.0 (example@gmail.com)')

    try:
        # Extract the page title from the URL
        title = url.split("/")[-1]

        page = wiki.page(title)
        if page.exists():
            # Use 'text' property to get full content of the page
            text = page.text

            # Optionally, trim the text
            if num_words:
                words = text.split()[:num_words]
                text = ' '.join(words)

            return {"url": url, "error": False, "error_message": None, "text": text}
        else:
            return {"url": url, "error": True, "error_message": "Wikipedia page does not exist", "text": None}
    except Exception as e:
        return {"url": url, "error": True, "error_message": str(e), "text": None}


def extract_text_from_pdf(url: str, num_words: Optional[int] = None) -> Dict[str, Any]:
    """Extract text from a PDF document at the given URL."""
    try:
        response = requests.get(url, timeout=HTTP_TIMEOUT)
        response.raise_for_status()
        
        if 'application/pdf' not in response.headers.get('Content-Type', ''):
            return ValueError("URL does not point to a PDF document")
        
        with BytesIO(response.content) as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            # Optionally, trim the text to the specified number of words
            if num_words:
                words = text.split()[:num_words]
                text = ' '.join(words)

            print(f"Extracted {len(text)} words from {url}")
            return {"url": url, "error": False, "error_message": None, "text": text}
    
    except Exception as e:
        print(f"Error extracting text from {url}: {e}")
        return {"url": url, "error": True, "error_message": str(e), "text": None}


def get_text(url: str, num_words: Optional[int] = None) -> Dict[str, Any]:
    """Get the content of a URL and extract text. Handles both HTML and PDF."""
    print(f"Extracting text from {url}")
    hostname = get_hostname(url)
    if not hostname:
        return {"url": url, "error": True, "error_message": "Invalid hostname", "text": None}

    filter_words = [
        "facebook", "twitter", "youtube",
        "instagram", "pinterest", "linkedin", "bloomberg",
    ]
    if any(word in url.lower() for word in filter_words):
        print(f"URL filtered: {url}")
        return {"url": url, "error": True, "error_message": "URL filtered", "text": None}
    
    # Extract text from pdf
    if url.lower().endswith(".pdf"):
        return extract_text_from_pdf(url, num_words)
    
    # Extract text from wikipedia
    if "wikipedia" in url.lower():
        return extract_from_wikipedia(url, num_words)
    
    # Extract text from HTML
    try:
        response = requests.get(url, timeout=HTTP_TIMEOUT)
        response.raise_for_status()
    except Exception as e:
        print(f"Error extracting text from {url}: {e}")
        return {"url": url, "error": True, "error_message": str(e), "text": None}

    text = extract_text(response.text, num_words=num_words)
    print(f"Extracted {len(text)} words from {url}")
    return {"url": url, "error": False, "error_message": None, "text": text}

##### PDFs

In [None]:
pdfs = [q for q in all_questions if q.lower().endswith(".pdf")]
len(pdfs)

In [None]:
# # Extract text from PDFs for 10 
# pdf_texts = []
# with tqdm(total=10) as pbar:
#     for url in pdfs:
#         try:
#             if pbar.n >= 10:
#                 break
#             result = extract_text_from_pdf(url)
#             if not result["error"]:
#                 pdf_texts.append(result["text"])
#                 pbar.update(1)

#         except Exception as e:
#             print(f"Error: {e}")

In [None]:
# display(HTML(pdf_texts[9]))

##### Wikis

In [None]:
wikis = [q for q in all_questions if "wikipedia" in q.lower()]

In [None]:
# Extract text from Wikipedia for 100
n =5
wiki_texts = []
with tqdm(total=n) as pbar:
    for url in wikis:
        if pbar.n >= n:
            break
        result = extract_from_wikipedia(url)
        if not result["error"]:
            wiki_texts.append(result["text"])
            pbar.update(1)

In [None]:
# display(HTML(wiki_texts[2]))

##### Reuters

In [None]:
# get all reuters links
reuters_links = []
for q in all_questions:
    if "reuters.c" in q.lower():
        reuters_links.append(q)

In [None]:
def check_links(links: List[str]) -> List[str]:
    """Check if links are valid."""
    links_200 = []
    links_non200 = []

    with tqdm(total=len(links)) as pbar:
        for url in links:
            try:
                response = requests.get(url, timeout=5)
                if response.status_code == 200:
                    links_200.append(url)
                else:
                    links_non200.append(url)
            except requests.RequestException as e:
                links_non200.append(url)
            pbar.update(1)

    return links_200, links_non200

In [None]:
n_urls = 100
filter = ['facebook', 'twitter', 'youtube', 'wikipedia', 'bit.ly']
results = {}

for domain, count in tqdm(domains.most_common(30), total=30, desc="Domains"):
    if any(word in domain.lower() for word in filter):
        continue

    links = [q for q in all_questions if domain in q.lower()]
    links_200, links_non200 = check_links(links[:n_urls])
    results[domain] = {"links_200": links_200, "links_non200": links_non200}

### Extractions

In [7]:
def process_urls_chunk(urls_chunk):
    """Process a chunk of URLs and return their processed data."""
    processed_data = []
    for url in urls_chunk:
        processed_result = get_text(url)
        processed_data.append(processed_result)
    return processed_data

def save_progress(data, filename):
    """Save the processed data to a file."""
    with open(filename, 'ab') as file:
        pickle.dump(data, file)

def process_urls_concurrently(urls, chunk_size=1000, max_workers=10):
    for i in range(0, len(urls), chunk_size):
        urls_chunk = urls[i:i + chunk_size]
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future = executor.submit(process_urls_chunk, urls_chunk)
            processed_data = future.result()
            save_progress(processed_data, f'processed_pdfs_{i // chunk_size}.pickle')


In [None]:
HTTP_TIMEOUT = 60

pdfs = [q for q in all_questions if q.lower().endswith(".pdf")]
len(pdfs)

In [8]:
process_urls_concurrently(pdfs)

Extracting text from https://www.nrdc.org/sites/default/files/euro.pdf
Extracted 258690 words from https://www.nrdc.org/sites/default/files/euro.pdf
Extracting text from https://www.supremecourt.gov/oral_arguments/argument_transcripts/2020/20-512_g314.pdf
Extracted 208712 words from https://www.supremecourt.gov/oral_arguments/argument_transcripts/2020/20-512_g314.pdf
Extracting text from http://www.oni.navy.mil/Portals/12/Intel%20agencies/russia/Russia%202015print.pdf
Error extracting text from http://www.oni.navy.mil/Portals/12/Intel%20agencies/russia/Russia%202015print.pdf: 403 Client Error: Forbidden for url: http://www.oni.navy.mil/Portals/12/Intel%20agencies/russia/Russia%202015print.pdf
Extracting text from http://arxiv.org/pdf/1509.03622v1.pdf
Extracted 77881 words from http://arxiv.org/pdf/1509.03622v1.pdf
Extracting text from http://www.worldairops.com/ASI/docs/ASI_MAP_ATSRoutesUpper_atWorldAirOps.com.pdf
Error extracting text from http://www.worldairops.com/ASI/docs/ASI_MAP_A

FloatObject (b'0.00-6051436') invalid; use 0.0 instead
FloatObject (b'0.00-8503398') invalid; use 0.0 instead


Extracted 63856 words from https://arxiv.org/pdf/1605.07685.pdf
Extracting text from https://issafrica.s3.amazonaws.com/site/uploads/ear8.pdf
Extracted 66142 words from https://issafrica.s3.amazonaws.com/site/uploads/ear8.pdf
Extracting text from http://reliefweb.int/sites/reliefweb.int/files/resources/ukraine_-_issue_11_eng.pdf
Extracted 19275 words from http://reliefweb.int/sites/reliefweb.int/files/resources/ukraine_-_issue_11_eng.pdf
Extracting text from https://scholarsbank.uoregon.edu/xmlui/bitstream/handle/1794/22105/833.pdf
Extracted 85673 words from https://scholarsbank.uoregon.edu/xmlui/bitstream/handle/1794/22105/833.pdf
Extracting text from https://governor.hawaii.gov/wp-content/uploads/2021/09/2109007-ATG_Executive-Order-No.-21-06-distribution-signed.pdf
Error extracting text from https://governor.hawaii.gov/wp-content/uploads/2021/09/2109007-ATG_Executive-Order-No.-21-06-distribution-signed.pdf: 404 Client Error: Not Found for url: https://governor.hawaii.gov/wp-content/u

In [10]:
all_questions_non_pdf = [q for q in all_questions if not q.lower().endswith(".pdf")]

In [12]:
process_urls_concurrently(all_questions_non_pdf)

Extracting text from https://en.wikipedia.org/wiki/List_of_Prime_Ministers_of_South_Korea
Extracting text from http://www.thedetroitbureau.com/2016/10/second-daimler-plant-to-yield-400-increase-in-battery-production/


  for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):


Extracted 4350 words from http://www.thedetroitbureau.com/2016/10/second-daimler-plant-to-yield-400-increase-in-battery-production/
Extracting text from https://twitter.com/ThomasErdbrink/status/640886110181134339
URL filtered: https://twitter.com/ThomasErdbrink/status/640886110181134339
Extracting text from http://www.marketwatch.com/story/november-jobs-report-likely-to-give-fed-go-ahead-to-raise-interest-rates-2015-11-29
Extracted 6045 words from http://www.marketwatch.com/story/november-jobs-report-likely-to-give-fed-go-ahead-to-raise-interest-rates-2015-11-29
Extracting text from http://russia-insider.com
Error extracting text from http://russia-insider.com: 503 Server Error: Service Unavailable for url: https://russia-insider.com/
Extracting text from https://www.wsj.com/articles/irs-bank-reporting-democrats-11634658560
Error extracting text from https://www.wsj.com/articles/irs-bank-reporting-democrats-11634658560: 403 Client Error: Forbidden for url: https://www.wsj.com/articles

  soup = BeautifulSoup(html, "html.parser")


Extracted 63 words from http://hosted.ap.org/dynamic/stories/M/ML_IRAQ_MOSUL?SITE=AP&amp;SECTION=HOME&amp;TEMPLATE=DEFAULT&amp;CTIME=2016-03-06-03-23-16
Extracting text from http://arxiv.org/abs/1507.04383
Extracted 4423 words from http://arxiv.org/abs/1507.04383
Extracting text from http://www.investopedia.com/news/inverted-yield-curve-guide-recession/
Error extracting text from http://www.investopedia.com/news/inverted-yield-curve-guide-recession/: 405 Client Error: Signal - Not Acceptable for url: http://www.investopedia.com/news/inverted-yield-curve-guide-recession/
Extracting text from http://www.nbcnews.com/news/latino/venezuela-could-go-way-cuba-say-u-s-congressional-members-n784566
Error extracting text from http://www.nbcnews.com/news/latino/venezuela-could-go-way-cuba-say-u-s-congressional-members-n784566: 403 Client Error: Forbidden for url: http://www.nbcnews.com/news/latino/venezuela-could-go-way-cuba-say-u-s-congressional-members-n784566
Extracting text from https://www.c

FloatObject (b'0.00-6051436') invalid; use 0.0 instead
FloatObject (b'0.00-8503398') invalid; use 0.0 instead


Extracted 63856 words from https://arxiv.org/pdf/1605.07685.pdf
Extracting text from http://www.springer.com/us/book/9783540570745
Extracted 5880 words from http://www.springer.com/us/book/9783540570745
Extracting text from https://www.cms.gov/files/document/cms-omnibus-covid-19-health-care-staff-vaccination-requirements-2021.pdf
Extracting text from https://www.cms.gov/files/document/cms-omnibus-covid-19-health-care-staff-vaccination-requirements-2021.pdf
Error extracting text from https://www.cms.gov/files/document/cms-omnibus-covid-19-health-care-staff-vaccination-requirements-2021.pdf: 403 Client Error: Forbidden for url: https://www.cms.gov/files/document/cms-omnibus-covid-19-health-care-staff-vaccination-requirements-2021.pdf
Extracting text from http://www.scmp.com/news/china/policies-politics/article/2116411/seasoned-duo-tipped-take-key-jobs-communist-party
Extracted 6326 words from http://www.scmp.com/news/china/policies-politics/article/2116411/seasoned-duo-tipped-take-key-jo

In [5]:
with open("retrieved_docs_v2.pkl", "rb") as f:
    retrieved_docs = pickle.load(f)

In [6]:
docs_df = pd.DataFrame(retrieved_docs).reset_index(drop=True)
docs_df.shape

(47297, 4)

In [7]:
docs_df_error = docs_df[docs_df["error"] == True].reset_index(drop=True)
docs_df_error.shape

(19399, 4)

In [8]:
docs_df_no_error = docs_df[docs_df["error"] == False].reset_index(drop=True)
docs_df_no_error.shape

(27898, 4)

In [9]:
def count_no_error_links(row):
    count = 0
    for link in row["source_links"]:
        if link in docs_df_no_error["url"].values:
            count += 1

    return count

df["num_no_error_links"] = df.apply(count_no_error_links, axis=1)

In [10]:
df["num_no_error_links"].value_counts().sort_index()[3:].sum()

880

In [11]:
df["num_no_error_links"].value_counts().sort_index()[3:20]

num_no_error_links
3     150
4     130
5      83
6      49
7      33
8      23
9      13
10      9
11      4
12      8
13      4
14      8
15      5
16     12
17      7
18      8
19     10
Name: count, dtype: int64

In [12]:
df["num_no_error_links"].value_counts().sort_index()[5:20].sum()

276

In [13]:
docs_df_no_error['num_words'] = docs_df_no_error['text'].apply(lambda x: len(x.split()))

In [14]:
docs_df_no_error['num_words'].quantile([0.25, 0.50, .75, .9, .95, .99])

0.25      500.00
0.50      912.00
0.75     1521.00
0.90     3017.90
0.95     5755.15
0.99    16928.57
Name: num_words, dtype: float64

In [15]:
# count number of source links that are more than n words
def count_num_words(row, n):
    count = 0
    for link in row["source_links"]:
        if link in docs_df_no_error["url"].values:
            num_words = docs_df_no_error[docs_df_no_error["url"] == link]["num_words"].values[0]
            if num_words > n:
                count += 1

    return count

df["num_words_1000"] = df.apply(lambda x: count_num_words(x, 1000), axis=1)

In [16]:
df["num_words_1000"].value_counts().sort_index()[3:].sum()

613

In [17]:
df["num_words_1000"].value_counts().sort_index()[5:].sum()

421

### Making final dataset

we filter out the following
- non 200 codes
- urls with certain keywords that were identified during manual checking
- use of html2text
- use pdf extractor for pdfs
- use wikipedia api for wikis
- and finally, filter links that had less than 1000 words
- the final dataset will have a minimum of 5 and maximum of 20 source_links 

In [18]:
final_df = df.copy()

In [19]:
# keep only questions with at least 5 links and more than 1000 words
final_df = final_df[final_df["num_words_1000"] >= 5].reset_index(drop=True)

In [20]:
# source_links remove links that are not in docs_df_no_error and keep only links that have more than 1000 words
def filter_source_links(row):
    links = []
    for link in row["source_links"]:
        if link in docs_df_no_error["url"].values:
            num_words = docs_df_no_error[docs_df_no_error["url"] == link]["num_words"].values[0]
            if num_words > 1000:
                links.append(link)

    return links

final_df["source_links"] = final_df.apply(filter_source_links, axis=1)

In [21]:
# verify that all links are in docs_df_no_error and have more than 1000 words
for links in final_df["source_links"]:
    for link in links:
        assert link in docs_df_no_error["url"].values
        assert docs_df_no_error[docs_df_no_error["url"] == link]["num_words"].values[0] > 1000

In [22]:
# from final_df remove added columns
final_df = final_df.drop(columns=["num_links", "num_no_error_links", "num_words_1000"])

# convert to json
final_df.to_json("olas_benchmark_v2.json", orient="records")

In [23]:
# from docs_df_no_error keep only urls that are in final_df
docs_df_no_error = docs_df_no_error[docs_df_no_error["url"].isin(final_df["source_links"].explode().unique())].reset_index(drop=True)

# drop error, error_message, num_words
docs_df_no_error = docs_df_no_error.drop(columns=["error", "error_message", "num_words"])


In [24]:
# save as pickle
with open("olas_docs_v2.pkl", "wb") as f:
    pickle.dump(docs_df_no_error, f)

### Verify saved files

In [25]:
import json

with open("olas_benchmark_v2.json", "r") as f:
    data = json.load(f)

In [26]:
data[1]

{'question': 'Will the Export-Import Bank of the United States be re-authorized before 1 January 2016?',
 'id': 'G5',
 'background': "The Export-Import Bank's authorization expired on 1 July, but proponents of the bank are working to get it re-authorized (http://www.nytimes.com/2015/07/01/business/international/though-charter-is-expiring-export-import-bank-will-keep-its-doors-open.html , http://www.nytimes.com/2015/07/06/us/politics/us-export-import-bank-teetering-on-edge.html , http://thehill.com/policy/finance/247953-house-gop-draws-first-in-ex-im-showdown ). Legislation re-authorizing the bank must be signed into law by the President before taking effect.",
 'publish_time': 1441116141242,
 'close_time': '2015-12-04 14:00:25+00:00',
 'tags': ['Economic Policy', 'US Politics', 'US Policy'],
 'source_links': ['http://www.hartfordbusiness.com/article/20151005/NEWS01/310029963',
  'http://auburnpub.com/blogs/eye_on_ny/schumer-highway-bill-will-include-export-import-bank-reauthorization/a

In [27]:
with open("olas_docs_v2.pkl", "rb") as f:
    docs = pickle.load(f)

In [28]:
docs

Unnamed: 0,url,text
0,http://edition.cnn.com/2017/03/03/opinions/tru...,Ad Feedback\n\n# Why Trump won’t tear up Iran...
1,https://www.washingtonpost.com/national-securi...,clockThis article was published more than 2 ye...
2,http://www.cnn.com/2017/02/15/politics/trump-r...,Ad Feedback\n\nVideo Ad Feedback\n\nTrump: Ses...
3,https://www.state.gov/t/avc/trty/102360.htm#text,Skip to Main Content\n\nJump to In This Sectio...
4,http://www.bild.de/politik/inland/jamaika-koal...,Weiter zum Hauptinhalt↵\n\n## ++ Alle aktuelle...
...,...,...
11479,https://www.washingtonpost.com/outlook/2021/04...,clockThis article was published more than 2 ye...
11480,http://www.the-american-interest.com/2015/12/0...,NATO Moves Forward with Montenegro - The Ameri...
11481,https://www.cbsnews.com/news/fbi-agents-visit-...,Watch CBS News\n\nUpdated 3:19 p.m. ET\n\nFBI ...
11482,http://www.dw.com/de/live-ticker-vom-sonderpar...,You need to enable JavaScript to run this app....


In [32]:
display(Markdown(docs.loc[11037, "text"]))

Skip to main contentSkip to navigationSkip to navigation

  * World
  * Europe
  * US
  * Americas
  * Asia
  * Australia
  * Middle East
  * Africa
  * Inequality
  * Global development

US and Russia announce agreement on Syria ceasefire Guardian

Syria

This article is more than 7 years old

Analysis

# US-Russia agreement full of snags but best hope for peace in Syria

This article is more than 7 years old

Patrick Wintour Diplomatic editor

Staged ceasefire and reopening of aid are good starting points but
complexities over forces opposed to Assad’s regime remain

Sat 10 Sep 2016 14.13 BSTLast modified on Tue 31 Aug 2021 15.20 BST

  *   *   * 

The Syrian counter-terror agreement is ambitious, full of pitfalls and the
best hope for a resumption of the ceasefire and peace talks in Syria.

The kernel of the agreement, reached after 13 hours of talks in Geneva on
Friday, is a staged ceasefire, a reopening of humanitarian aid, followed by a
grounding of the Syrian air force in those areas dominated by opposition
fighters recognised by the west. A specific plan has been set out on how to
deliver aid to the 250,000 citizens of Aleppo who are running out of water and
fuel.

In return the west will coordinate with Russia not just attacks on the forces
of Islamic State in north-west Syria, but also the al-Qaida-linked and
recently rebranded Jabhat al-Nusra.

US and Russia reach tentative agreement for Syria ceasefire

Read more

For Russia’s foreign minister, Sergei Lavrov, the responsibility in the
agreement lies in requiring the Syrian air force to ground itself over its
sovereign territory, and in ensuring humanitarian aid – too often callously
blocked by the Syrian army checkpoints – is allowed to flow again.

President Bashar al-Assad sees no distinction between the opposition forces,
regarding them all as terrorists opposed to his regime. This agreement
requires him to change that mindset.

For the United States, there is a responsibility to require the Washington-
backed Syrian opposition to disentangle themselves militarily, politically and
even physically from Jabhat al-Nusra.

In practice, there has been a marbling between al-Nusra and Washington-backed
fighting forces as they unite against the military advance of Assad.

The Pentagon, and some in the state department, are doubtful that Russia has
the means or the determination to control the Syrian air force. The new
ceasefire is due to start 12 September, and they fear the US Secretary of
State, John Kerry, has miscalculated.

The Washington-based Syria Institute said: “While Lavrov mentioned that they
agreed on procedures on responding to any breaches or violations of the
cessation of hostilities, no details were given. The lack of enforcement or
compliance measures in previous agreements has been a key contributor to their
failure.”

“Plans do not implement themselves,” Kerry pointed out. Details of how the
agreement will be enforced, currently being kept private in five separate
chapters, include the exchanges of intelligence, the maps delineating the
precise distribution of opposition forces, and the sanctions for ceasefire
breaches.

Lavrov insists he won the agreement of Assad in what they will see as the
first east-west agreement to defeat not just Isis in Syria, but other jihadi
groups. Russia has long sought US cooperation in this anti-terror battle.

Apart from the sincerity of Syrian cooperation, the second biggest question is
whether western-backed Syrian fighters will disassociate themselves from al-
Nusra.

Al-Nusra Front cuts ties with al-Qaida and renames itself

Read more

Kerry was unambiguous. “Going on al-Nusra is not a concession to anybody” but
“is profoundly in the interests of the US,” he said

Bassma Kodmani, a senior figure in the main opposition body the High
Negotiation Committee (HNC), insisted the marbling will end, and the cessation
of hostilities will allow for the extremists’ influence to be marginalised.
She said: “When the cessation of hostilities was installed in February, the
opposition – 100 groups – respected it. It was violated by the regime. So a
return to a cessation of hostilities has been our demand. We are absolutely in
favour of it.”

Asked if opposition fighters will separate themselves from extremists, Kodmani
said: “In the cessation in February, when our groups committed to it, the
extremists were marginalised. They did not dare to challenge it. Since then
opposition forces and extremists have been forced together under siege.

“So the key is ending Assad’s strategy of surrounding whole areas and
besieging them. The moderate groups will reorganise and distance themselves
from the radical groups. We will do our part.”

But Charles Lister, a senior fellow at the Middle East Institute and expert on
the Syrian jihadis, was more dubious. He wrote: “Having spoken with leadership
figures from several dozen armed factions in recent weeks, I can say that not
a single one has suggested any willingness to withdraw from frontlines on
which JFS is present. To them, doing so means effectively ceding territory to
the regime, as they have little faith in a long-term cessation of hostilities
holding.”

But he added: “The armed opposition in Syria now faces what is perhaps its
biggest and most momentous decision since they chose to take up arms against
the Assad regime in 2011. There is no hiding the fact that mainstream
opposition forces are extensively “marbled” or “coupled” with JFS forces on
frontlines from Deraa in the south, to Damascus and throughout the north-west
of the country.”

One best option is that al-Nusra forces will withdraw knowing that to do
otherwise would hinder the peace process, and lose fragile popular support.
But the next few days will be a big test of the HNC influence on the ground.

The man with the toughest job in the world | Janine di Giovanni

Read more

There is much that is absent from the agreement, including any commitments on
the release of political detainees or any promise by either side to change
their stance on the legitimate place for Assad in a future Syria. The previous
peace talks designed to map out a transition to a new government did not
really travel much past first base, mainly due to the Syrian opposition demand
that Assad leave within six months.

The two sides did not even meet face to face, preferring to trade insults at
sporadic press conferences, as the genteel UN special envoy Staffan de Mistura
manfully looked for chinks of light amid the uniform darkness.

Since then, and only this week, the HNC set out a coherent road map to a new
Syria that is recognisably democratic and does not assume anyone connected
with the Assad regime will have to stand aside. It is a plausible alternative
for the future of Syria, and one in which Russian influence is not eradicated.

But in the entrails of the Kerry Lavrov press conference, Lavrov highlighted
the problems ahead in the peace talks, pointing out that the HNC cannot be
seen as the sole negotiating body. He pointed to other groups – the Moscow and
Cairo group – as requiring equal status in any peace talks. The HNC is seen as
a creature of Saudi Arabia by the Russians, and representative mainly of
Riyadh. So if talks do resume, the basics of the attendees may have to be
revisited.

There are at least two further difficulties. The Kurds have a nominal role in
the HNC, but its chief representatives are excluded. There was also no sign of
Iran in these peace talks, yet they have militia fighting on the ground in
Aleppo and elsewhere. Iran and Moscow interests do not fully elide in Syria.

So no one is pretending after so many setbacks that a corner has been turned,
but at least it is possible to see if it can be reached.

Explore more on these topics

  * Syria
  * Middle East and north Africa
  * Bashar al-Assad
  * Islamic State
  * John Kerry
  * Russian presidential election 2012
  * Russia
  * analysis

  *   *   *   *   *   * 

Reuse this content

## Most viewed

## Most viewed

  * World
  * Europe
  * US
  * Americas
  * Asia
  * Australia
  * Middle East
  * Africa
  * Inequality
  * Global development

