In [2]:
import re
from pathlib import Path
import pandas as pd

# Load the FA-KES dataset (articles covering the war in Syria) from the
# local "data" directory; the file is decoded with the "latin" codec.
data = pd.read_csv(
    Path(".").joinpath("data", "FA-KES-Dataset.csv"),
    encoding="latin",
)

# Regular expressions used to strip publication metadata from the very
# beginning of each article body. Every pattern is anchored with "^", so
# only a leading occurrence can be removed.
#
# pattern1 -- publication date, optionally followed by a time ("at 10:33").
#   * First alternative: optional capitalised word, a 1-2 digit number, a
#     separator (whitespace or a capitalised word), then a 2-4 digit
#     number, e.g. "Wed 05 Apr 2017", "5 April 2017" or "05 2017".
#   * Second alternative: numeric date separated by "-" or ".",
#     e.g. "05-04-2017" or "5.4.17".
#   The separator between the two numbers is mandatory. When it was
#   optional, a plain leading number such as "112" or "2016" was split
#   into "1" + "12" / "20" + "16" and deleted as if it were a date.
#   "[A-Z]" only means "capital letter" when the pattern is applied
#   without re.IGNORECASE.
pattern1 = (
    r"^("
    r"\s*([A-Z]\w+)?\s*\d{1,2}(?:\s+|\s*[A-Z]\w+\s*)\d{2,4}\s*"
    r"|"
    r"\d{1,2}[-\.]\d{1,2}[-\.]\d{2,4}\s*"
    r")"
    r"(\s*at\s*\d+:?\d+)?"
)

# pattern2 -- a leading "(updated <word>)" marker, where <word> is at most
# one run of word characters (so a value containing ":" is not matched).
pattern2 = r"^\s*(\(updated\s*[\w\d]*\s*\))\s*"

# pattern3 -- a leading "get short url" banner plus every following run of
# word characters and whitespace.
# NOTE(review): "[\d\w]+" also matches ordinary words, so everything up to
# the first character that is neither a word character nor whitespace is
# removed -- presumably only the tokens right after the banner were meant;
# confirm against the data before tightening.
pattern3 = r"^\s*get\s*short\s*url\s*([\d\w]+\s*)*"

# Clean the article bodies in place: remove a leading date (matched
# case-sensitively), then a leading "(updated ...)" marker and a leading
# "get short url" banner (both matched case-insensitively), and finally
# trim surrounding whitespace.
data["article_content"] = (
    data["article_content"]
    .str.replace(pattern1, "", regex=True)
    .str.replace(pattern2, "", regex=True, flags=re.IGNORECASE)
    .str.replace(pattern3, "", regex=True, flags=re.IGNORECASE)
    .str.strip()
)

In [3]:
# Show the frame; the rendering below is abbreviated to the first and last rows.
data

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,1914947530,Syria attack symptoms consistent with nerve ag...,Syria attack symptoms consistent with nerve ag...,nna,4/5/2017,idlib,0,,,,
1,1914947532,Homs governor says U.S. attack caused deaths b...,Homs governor says U.S. attack caused deaths b...,nna,4/7/2017,homs,0,,,,
2,1914947533,Death toll from Aleppo bomb attack at least 112,Death toll from Aleppo bomb attack at least 11...,nna,4/16/2017,aleppo,0,,,,
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Aleppo bomb blast kills six Syrian state TV. A...,nna,4/19/2017,aleppo,0,,,,
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,29 Syria Rebels Dead in Fighting for Key Alepp...,nna,7/10/2016,aleppo,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...
799,1965511221,Turkish Bombardment Kills 20 Civilians in Syria,Turkish Bombardment Kills 20 Civilians in Syri...,manar,8/28/2016,aleppo,1,,,,
800,1965511222,Martyrs as Terrorists Shell Aleppos Salah Eddin,Martyrs as Terrorists Shell Aleppos Salah Eddi...,manar,8/1/2016,aleppo,1,,,,
801,1965511224,Chemical Attack Kills Five Syrians in Aleppo SANA,Chemical Attack Kills Five Syrians in Aleppo S...,manar,8/3/2016,aleppo,0,,,,
802,1965511226,5 Killed as Russian Military Chopper Shot down...,5 Killed as Russian Military Chopper Shot down...,manar,8/1/2016,idlib,1,,,,
