In [44]:

import re
import unicodedata
from collections import Counter

import pandas as pd


In [2]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read and
# written by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### Merging UNHCR and Guardian data

Pre-requisite files:

For Syria:
- syria_guardian_final.csv
- syria_unhcr_file.csv

For Ukraine:
- ukraine_guardian_final.csv
- ukraine_unhcr_file.csv

The same code is run once per country (Syria, then Ukraine); only the input and output file names change.

Size after the whole cleaning and corpus creation process:

Syria: 20894  
Ukraine: 13554

In [27]:
# Corpus to build. The same cell is run once per country ('syria' or 'ukraine').
# NOTE: the export path in the last cell of the notebook is hard-coded separately.
COUNTRY = 'syria'
DATA_DIR = '/content/drive/MyDrive/code_file/preprocessing'

# Load files.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded run emitted a warning on the UNHCR
# read -- presumably the mixed-dtype DtypeWarning; TODO confirm.
df1 = pd.read_csv('{}/{}_guardian_final.csv'.format(DATA_DIR, COUNTRY))
df2 = pd.read_csv('{}/{}_unhcr_file.csv'.format(DATA_DIR, COUNTRY), low_memory=False)

# Rename columns in df1 (Guardian) to match columns in df2 (UNHCR)
df1 = df1.rename(columns={'publication_date_fixed': 'Date', 'webTitle': 'Headline', 'body': 'merged_text'})

# Keep only the columns needed downstream ('id' exists in the Guardian file only)
df1 = df1[['Date', 'Headline', 'id', 'merged_text']]
df2 = df2[['Date', 'Headline', 'merged_text']]

# Stack the two sources. The Guardian-only 'id' column is dropped because it is
# replaced by 'id_new' below. ignore_index=True gives every row a unique label;
# without it the two sources' 0..n indexes overlap.
output_df = pd.concat([df1.drop(columns=['id']), df2], ignore_index=True)

# Convert the 'Date' column to datetime format; unparseable dates become NaT
output_df['Date'] = pd.to_datetime(output_df['Date'], errors='coerce')

# Build 'id_new' as '<YYYY-MM-DD><Headline>' with all spaces removed
output_df['id_new'] = (
    output_df['Date'].dt.strftime('%Y-%m-%d') + output_df['Headline']
).str.replace(' ', '', regex=False)

# Print the original number of rows and columns for df1 and df2
print("Original number of rows and columns for df1:")
print(df1.shape)
print("Original number of rows and columns for df2:")
print(df2.shape)

# Print the number of rows and columns for the output dataframe
print("Number of rows and columns for the output dataframe:")
print(output_df.shape)

# Fill missing values with an empty string in the text columns only.
# 'Date' is deliberately left alone: filling NaT with '' would turn the column
# into a mix of timestamps and strings and break the later date comparison.
text_columns = ['Headline', 'merged_text', 'id_new']
output_df[text_columns] = output_df[text_columns].fillna('')

  df2 = pd.read_csv("/content/drive/MyDrive/code_file/preprocessing/syria_unhcr_file.csv")


Original number of rows and columns for df1:
(22607, 4)
Original number of rows and columns for df2:
(712404, 3)
Number of rows and columns for the output dataframe:
(735011, 4)


In [28]:
# Preview the first rows of the merged frame
output_df.head()

Unnamed: 0,Date,Headline,merged_text,id_new
0,2022-08-23,Turkey’s rapprochement with Syria leaves regio...,<p>Syrians across the opposition and in the Ku...,2022-08-23Turkey’srapprochementwithSyrialeaves...
1,2023-01-20,US state department announces new refugee prog...,<p>The US state department has announced a new...,2023-01-20USstatedepartmentannouncesnewrefugee...
2,2023-01-26,Children go hungry at Kenya refugee camp as ma...,<p>Malnutrition among children in one of the w...,2023-01-26ChildrengohungryatKenyarefugeecampas...
3,2022-03-07,"Syria, Afghanistan, Ukraine: how many refugee ...","<p>The province of Van, in the east of Turkey,...","2022-03-07Syria,Afghanistan,Ukraine:howmanyref..."
4,2022-11-17,Firefighter arrested in Germany over refugee s...,<p>A fire that tore through a shelter for Ukra...,2022-11-17FirefighterarrestedinGermanyoverrefu...


In [29]:
# Non-null count per column. Missing values were already replaced with '' in
# the merge cell, so these counts cannot reveal missing headlines or texts.
output_df.count()

Date           735011
Headline       735011
merged_text    735011
id_new         735011
dtype: int64

# Text Cleaning

In [30]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so the in-place operations in the following cells also modify output_df.
df = output_df

In [31]:
# De-duplicate in place: first on the article body, then on the headline.
# The first occurrence of each value is kept (pandas default).
for dedup_column in ['merged_text', 'Headline']:
    df.drop_duplicates(subset=[dedup_column], inplace=True)


In [34]:
# Remove rows whose text or headline is missing.
# Missing values were replaced with '' when the two sources were merged, so
# dropna() alone would never match anything here. Blank / whitespace-only
# strings and the literal string 'NaN' are therefore treated as missing too.
for column in ['merged_text', 'Headline']:
    stripped = df[column].astype(str).str.strip()
    df = df[df[column].notna() & ~stripped.isin(['', 'NaN'])]

# Remove rows with errors in 'headline' column (failed scrapes / error pages)
error_headlines = ['403 ERROR', '\n\n', 'Error 403 not available now', 'Page not found', '404', 'PAGE NOT FOUND', '\n']
df = df[~df['Headline'].isin(error_headlines)]

# Print the cleaned dataset
print('The cleaned dataframe has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
df.head(10)


The cleaned dataframe has 26544 rows and 4 columns


Unnamed: 0,Date,Headline,merged_text,id_new
0,2022-08-23,Turkey’s rapprochement with Syria leaves regio...,<p>Syrians across the opposition and in the Ku...,2022-08-23Turkey’srapprochementwithSyrialeaves...
1,2023-01-20,US state department announces new refugee prog...,<p>The US state department has announced a new...,2023-01-20USstatedepartmentannouncesnewrefugee...
2,2023-01-26,Children go hungry at Kenya refugee camp as ma...,<p>Malnutrition among children in one of the w...,2023-01-26ChildrengohungryatKenyarefugeecampas...
3,2022-03-07,"Syria, Afghanistan, Ukraine: how many refugee ...","<p>The province of Van, in the east of Turkey,...","2022-03-07Syria,Afghanistan,Ukraine:howmanyref..."
4,2022-11-17,Firefighter arrested in Germany over refugee s...,<p>A fire that tore through a shelter for Ukra...,2022-11-17FirefighterarrestedinGermanyoverrefu...
5,2023-01-11,Shamima Begum says she understands public ange...,"<p>Shamima Begum, who left Britain to join Isl...",2023-01-11ShamimaBegumsayssheunderstandspublic...
6,2023-02-05,Afghan refugee in London told to give up docto...,<p>A Chevening academic will be forced to give...,2023-02-05AfghanrefugeeinLondontoldtogiveupdoc...
7,2022-11-27,Concern for health of Ukrainians aboard Scotla...,<p>The physical and mental health of Ukrainian...,2022-11-27ConcernforhealthofUkrainiansaboardSc...
8,2022-12-10,Afghan refugee freed in Greece after two years...,<p>An imprisoned Afghan refugee wrongfully acc...,2022-12-10AfghanrefugeefreedinGreeceaftertwoye...
9,2022-12-05,UN refugee body criticises ‘errors’ in asylum ...,<p>A report partially endorsed by the UK home ...,2022-12-05UNrefugeebodycriticises‘errors’inasy...


In [36]:
# Keep only articles dated on or after MIN_DATE.
# The dates are re-parsed first so the comparison also works if 'Date' holds a
# mix of timestamps and strings; values that cannot be parsed become NaT and
# are dropped by the comparison. .copy() makes the result an independent frame
# so that later column assignments do not write into a slice of the old one.
MIN_DATE = '2010-01-01'
article_dates = pd.to_datetime(df['Date'], errors='coerce')
df = df.loc[article_dates >= MIN_DATE].copy()



### Parsing Corpus

Build an overview of the most frequent words in the raw text so that leftover scraping artefacts (HTML tags, attribute names, boilerplate) can be identified and removed.

In [39]:
# Number of most frequent whitespace-separated tokens to inspect
TOP_N_WORDS = 10000

# Count tokens article by article. This yields the same counts (and the same
# first-seen order for ties) as joining every article into one string and
# splitting it, but avoids holding the whole corpus and its full token list in
# memory at once.
word_counts = Counter()
for article_text in df['merged_text']:
    word_counts.update(str(article_text).split())

# Get the most common words and their counts
most_common = word_counts.most_common(TOP_N_WORDS)

# Convert the results to a Pandas DataFrame for easier manipulation and printing
most_common_df = pd.DataFrame(most_common, columns=['Word', 'Count'])

# Print the DataFrame with the most common words and their counts in descending order
print(most_common_df.sort_values('Count', ascending=False))

             Word    Count
0             the  1587020
1              to   882290
2              of   778210
3             and   666153
4              in   623796
...           ...      ...
9975       today?      227
9974    outspoken      227
9973          owe      227
9972  demographic      227
9999     favoured      227

[10000 rows x 2 columns]


In [40]:
#most_common_df.to_csv('/content/drive/MyDrive/code_file/preprocessing/syria__most_common_10000.csv')

In [41]:
# Markup / scrape residue that is removed as whole tokens by clean_text.
_SCRAPE_TOKENS = (
    'p', 'dataatomidbddcbffdabfcfcfbe', 'href', 'blockquote', 'span', 'class',
    'element', 'element-image_caption', 'element-image_credit', 'figcaption',
    'liststrong', 'Photo/Rodrigo', 'ADVERTISEMENT', 'element--supporting',
    'Photo/Evgeniy', 'element-rich-link', 'copyright', 'element--showcase',
    'AD-FREE,', 'meterActive/meterExpired', 'Photo/Vadim', 'VideoWhy',
    '/PRNewswire/', 'itemV1', 'data-interactive', 'element-interactive',
    'allowfullscreen="true"', 'data-atom-type="media"', 'element-atom',
    'gu-atom', 'class="twitter-tweet"p', 'auto-generated',
    'class="timezone"BST/span/time', 'class="quoted', 'class="gu-image"',
    'class="/figcaption"', 'class="timezone"GMT/span/time', 'class="block"',
    'data-block-contributor=""', 'class="block-elements"', 'class="element',
)

# Longest alternatives come first so a short token such as 'element' cannot
# pre-empt a longer one such as 'element-rich-link'. Look-arounds are used
# instead of \b so that tokens which begin or end with punctuation
# (e.g. '/PRNewswire/', 'AD-FREE,') can match next to whitespace.
_SCRAPE_TOKEN_RE = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(token) for token in sorted(_SCRAPE_TOKENS, key=len, reverse=True))
    + r')(?!\w)'
)

# Typographic characters with an obvious ASCII equivalent. Mapping them before
# the ASCII filter keeps "Syria's" from collapsing to "Syrias" and keeps words
# joined by a non-breaking space from being fused together.
_TYPOGRAPHY_MAP = str.maketrans({
    '\u2018': "'", '\u2019': "'",   # curly single quotes / apostrophe
    '\u201c': '"', '\u201d': '"',   # curly double quotes
    '\u2013': '-', '\u2014': '-',   # en dash, em dash
    '\u2026': '...',                # horizontal ellipsis
    '\u00a0': ' ',                  # non-breaking space
})


def clean_text(text):
    """
    Strip HTML markup and scrape residue from one raw article text.

    Steps, in order: remove <iframe> elements, remove the known residue tokens
    listed in _SCRAPE_TOKENS, remove paragraph-tag leftovers, remove any
    remaining HTML tags, remove '@' characters, remove http(s) links, and
    finally reduce the text to ASCII.

    Input:
    - text (str): raw article text. None and NaN are accepted.

    Output:
    - Cleaned ASCII string ('' for None / NaN input).
    """
    # Missing values would otherwise make re.sub raise a TypeError
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Remove inline frame elements, including their content
    text = re.sub(r'<iframe.*?</iframe>', '', text, flags=re.DOTALL)

    # Remove known markup / scrape residue tokens
    text = _SCRAPE_TOKEN_RE.sub('', text)

    # Remove paragraph-tag leftovers
    for residue in (r'<p>', r'</p>', r'<p/>', r'/p'):
        text = re.sub(residue, '', text)

    # Remove HTML tags; [^>] also matches newlines, so multi-line tags are caught
    text = re.sub(r'<[^>]*>', '', text)

    # Remove '@' symbols
    text = text.replace('@', '')

    # Remove links
    text = re.sub(r'http\S+', '', text)

    # Reduce to ASCII: map typographic punctuation to its ASCII form, split
    # accented letters into base letter + combining mark (NFD), then drop
    # whatever is still non-ASCII (combining marks, emojis, other symbols).
    text = unicodedata.normalize('NFD', text.translate(_TYPOGRAPHY_MAP))
    text = text.encode('ascii', 'ignore').decode('ascii')

    return text

In [45]:
# Clean every raw article text and store the result in a new column, 'parsing_corpus'
df['parsing_corpus'] = df['merged_text'].map(clean_text)

In [46]:
# Preview the first five cleaned texts
df['parsing_corpus'].head(5)


0    Syrians across the opposition and in the Kurdi...
1    The US state department has announced a new re...
2    Malnutrition among children in one of the worl...
3    The province of Van, in the east of Turkey, is...
4    A fire that tore through a shelter for Ukraini...
Name: parsing_corpus, dtype: object

### Frame Identification Corpus

In [47]:
import nltk
# Download the NLTK resources used below: 'punkt' (tokenizer models for
# word_tokenize), 'stopwords' (stopword lists) and 'wordnet' (lemmatizer data).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenization(df_col):
    """
    Split every text of the column into tokens with NLTK's word_tokenize.

    Input:
    - df_col: column of strings.

    Output:
    - Column holding one list of tokens per text.
    """
    print("Tokenizing articles...\n")
    tokenized = df_col.apply(word_tokenize)
    return tokenized


def lowercase(df_col):
    """
    Lower-case every token of every token list in the column.

    Input:
    - df_col: column holding one list of tokens per row.

    Output:
    - Column of token lists with all tokens in lowercase.
    """
    print("Making all words lowercase...\n")

    def _lower_tokens(tokens):
        return [tok.lower() for tok in tokens]

    return df_col.apply(_lower_tokens)


def only_alphabetic(df_col):
    """
    Keeps only tokens that consist entirely of ASCII letters and/or underscores.

    Tokens containing digits or punctuation, and empty tokens, are dropped.

    Input:
    - df_col: column holding one list of tokens per row.

    Output:
    - Column of token lists containing only the alphabetic tokens.
    """
    print("Removing all non-alphabetic words...\n")
    # fullmatch with '+' rejects the empty string and tokens ending in a
    # newline. The character class holds letters and '_' only (no digits).
    return df_col.apply(lambda x: [token for token in x if re.fullmatch(r"[a-zA-Z_]+", token)])


# English stopword list shipped with NLTK
stop_words = set(stopwords.words('english'))
# Optional domain-specific additions (currently disabled):
# stop_words.update(["refugee","refugees","migrant","migrants","immigrant","immigrants",
#                    "like", "would","want","take","must","well","could","even","since",
#                    "also","know"])

def stopword_removal(df_col):
    """
    Drop stopwords and short tokens from every token list in the column.

    A token is kept only if it is not in stop_words and is longer than three
    characters.
    """
    print("Removing Stopwords...\n")

    def _filter_tokens(tokens):
        kept = []
        for tok in tokens:
            if tok not in stop_words and len(tok) > 3:
                kept.append(tok)
        return kept

    return df_col.apply(_filter_tokens)


def lemmatization(df_col):
    """
    Lemmatize every token of every token list with NLTK's WordNetLemmatizer.

    Output:
    - Column of token lists holding the lemmatized tokens.
    """
    print("Lemmatizing words...\n")
    # One lemmatizer instance is shared by all rows of the column
    wordnet = WordNetLemmatizer()

    def _lemmatize_tokens(tokens):
        return [wordnet.lemmatize(tok) for tok in tokens]

    return df_col.apply(_lemmatize_tokens)

def preprocessing(df_col, *steps):
    """
    Run a text column through a sequence of preprocessing steps and join the
    resulting tokens of each row back into one space-separated string.

    Input:
    - df_col: the column with the text to process. It is copied, not modified.
    - steps (functions): preprocessing functions, applied in the given order.
      Each one takes a column and returns a column.

    Output:
    - Column with one space-joined string per row.
    """
    # Work on a copy so the caller's column stays untouched
    processed = df_col.copy()
    for step in steps:
        processed = step(processed)
    return processed.apply(lambda tokens: " ".join(tokens))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [48]:
# Build the frame-identification corpus from the cleaned text.
# The steps run in exactly this order.
frame_steps = (
    tokenization,
    lowercase,
    only_alphabetic,
    stopword_removal,
    lemmatization,
)
df["frame_identification_corpus"] = preprocessing(df["parsing_corpus"], *frame_steps)

Tokenizing articles...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [56]:
# Spot-check one randomly chosen article: raw text, cleaned text and
# frame-identification text, separated by '---' lines.
preview_columns = ["merged_text", "parsing_corpus", "frame_identification_corpus"]
sample_row = df[preview_columns].sample(1).iloc[0]
print(*(sample_row[column] for column in preview_columns), sep="\n---\n")
print("-----------------------------------")

<p><em>The authors of this article are the current and the three former presidents of the National Coalition of Syrian Revolution and Opposition Forces</em></p> <p>Terror has returned to the streets <a href="http://www.theguardian.com/world/paris-attacks" title="">of Paris</a>. So, too, has the realisation that <a href="http://www.theguardian.com/commentisfree/2015/nov/20/repercussions-syria-collapse-paris-attack-refugee" title="">international efforts</a> to defeat terrorism are failing. <a href="http://www.theguardian.com/world/isis" title="">Islamic State</a> is emboldened. Terror attacks continue. And innocent civilians pay the ultimate price. As the presidents of Syria’s main opposition group — the Syrian National Coalition — we’ve watched the violence in Syria destroy our nation, infest the region, and now, threaten peace and security in Europe. Today we say “enough is enough”. It’s time to face the facts: the haphazard approach to the crisis in Syria has been a disaster. Compreh

### Final Removal of Duplicates 

In [57]:
# Final de-duplication: first on the cleaned text, then on the
# frame-identification text. The first occurrence of each value is kept.
df = (
    df
    .drop_duplicates(subset=['parsing_corpus'])
    .drop_duplicates(subset=['frame_identification_corpus'])
)

In [58]:
# Non-null count per column after the final de-duplication
df.count()

Date                           20894
Headline                       20894
merged_text                    20894
id_new                         20894
parsing_corpus                 20894
frame_identification_corpus    20894
dtype: int64

In [59]:
# Write the final corpus to Drive. With to_csv's default index=True the
# DataFrame index is written as the first, unnamed column of the file.
df.to_csv("/content/drive/MyDrive/code_file/preprocessing/syria_complete.csv")