# Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import bz2
import json
import time
import pandas as pd
from pprint import pprint

In [3]:
LINE_CHECKPOINT = 1e6
ORIGINAL_DATASETS_PATH = '/content/drive/MyDrive/Quotebank/'
SHRANK_DATASETS_PATH = '/content/drive/MyDrive/quotebank_cleaned/'
PARQUET_PATH = '/content/drive/MyDrive/speaker_attributes.parquet'

# Quotebank Data

## Example of Data
Pretty printing a single entry as reference.

In [4]:
with bz2.open('/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2', 'rb') as t_file:
    pprint(json.loads(t_file.readline()))

{'date': '2020-01-28 08:04:05',
 'numOccurrences': 1,
 'phase': 'E',
 'probas': [['None', '0.7272'],
            ['Prime Minister Netanyahu', '0.2445'],
            ['Natan Sharansky', '0.0283']],
 'qids': [],
 'quotation': '[ D ] espite the efforts of the partners to create a '
              'non-political award that unites the Jewish people, some have '
              'incorrectly interpreted the participation of the Office of the '
              'Prime Minister in the Genesis Prize as bringing a political '
              'dimension to this important initiative,',
 'quoteID': '2020-01-28-000082',
 'speaker': 'None',
 'urls': ['http://israelnationalnews.com/News/News.aspx/275210']}


## Making Data Manageable
The provided datasets contain a lot of information that is not relevant to our goal, therefore we have decided to perform a first round of pruning.
We removed all quotations that were not assigned to any speaker, and stripped the remaining ones of `date` and `numOccurrences` fields.
With the shrinked versions of the yearly datasets we are able to perform all following operations more swiftly.

In [5]:
def shrink_archive(source_path, destination_path):
    """Shrink the bz2 archive supplied as source_path by removing data unnecessary for the project.
    Unnecessary data includes:
        - all entries where the speaker has not been identified
        - date fields. The date part can be retrieved from the ID if needed.
        - numOccurrences field"""
    print("Shrinking {} to {}".format(source_path, destination_path))
    with bz2.open(source_path, 'rb') as s_file:
        with bz2.open(destination_path, 'wb') as d_file:
            time_start = time.time()
            for i, s_line in enumerate(s_file):
                quote_dict = json.loads(s_line) # Load current line into py dict
                if quote_dict['speaker'] != 'None':
                    # If a speaker is present, shrink and save the dictionary
                    # Remove date, can be retrieved from id if needed
                    del quote_dict['date']
                    del quote_dict['numOccurrences'] # Remove numOccurrances, not needed
                    # writing in the new file
                    d_file.write((json.dumps(quote_dict)+'\n').encode('utf-8'))
                if i % LINE_CHECKPOINT == 0: print("On line {}...".format(i)) # Visual feedback
    print("--- %s seconds ---" % (time.time() - time_start))
    print("Done, processed {} lines.".format(i))

In [None]:
for i in range(2015, 2020):
    path_to_file_i = '/content/drive/MyDrive/Quotebank/quotes-{}.json.bz2'.format(i)
    path_to_file_shrank = '/content/drive/MyDrive/quotebank_cleaned/quotes-{}-shrank.json.bz2'.format(i)
    shrink_archive(path_to_file_i, path_to_file_shrank)

## Benchmarking
The following snippet shows how much time we are able to save when operating with reduced versions of the datasets.

In [6]:
def stopwatch_dataset(source_path):
    """Utility function created to roughly estimate time needed to cycle through a provided dataset"""
    print("Stop-watching ", source_path)
    with bz2.open(source_path, 'rb') as s_file:
        time_start = time.time()
        for i, s_line in enumerate(s_file):
            if i % LINE_CHECKPOINT == 0: print("On line {}...".format(i)) # Visual feedback
    print("--- %s seconds ---" % (time.time() - time_start))

In [7]:
test_year = 2020
stopwatch_dataset('{}quotes-{}.json.bz2'.format(ORIGINAL_DATASETS_PATH, test_year))
stopwatch_dataset('{}quotes-{}-shrank.json.bz2'.format(SHRANK_DATASETS_PATH, test_year))

Stop-watching  /content/drive/MyDrive/Quotebank/quotes-2020.json.bz2
On line 0...
On line 1000000...
On line 2000000...
On line 3000000...
On line 4000000...
On line 5000000...
--- 209.15723657608032 seconds ---
Stop-watching  /content/drive/MyDrive/quotebank_cleaned/quotes-2020-shrank.json.bz2
On line 0...
On line 1000000...
On line 2000000...
On line 3000000...
--- 134.8744819164276 seconds ---


## Utility Method

We've created an utility method to access the shrank version of datasets.

In [8]:
def load_shrank_dataset(year):
    year = int(year)
    if year < 2015 or year > 2020: raise ValueError("Provide a year between 2015 and 2020.")
    # Remember to close the file stream!
    return bz2.open('{}quotes-{}-shrank.json.bz2'.format(SHRANK_DATASETS_PATH, year), 'rb')

# Wikidata

In [9]:
wiki_df = pd.read_parquet(PARQUET_PATH)

## Data Inspection
Here is a quick exemple on how to search/filter for a specific politician, knowing boxes contain lists, not only strings.

In [10]:
# Searching for a specific politician. Just replace the name here. Note : it is case sensitive by default. 
# Use either what is present in the aliases field or the label field. 

mask = wiki_df["label"].str.contains("Bush", regex = False, na = False)

# Change number of desired results here 
wiki_df[mask][:10]

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
11351,[Vanevar Bush],[+1890-03-11T00:00:00Z],[Q30],[Q6581097],1392187290,,,"[Q82594, Q205375, Q82955, Q1622272, Q81096]",,,Q299595,Vannevar Bush,,item,[Q55004488]
12327,[Prescott Sheldon Bush],[+1895-05-15T00:00:00Z],[Q30],[Q6581097],1393428512,[Q7435494],B001167,"[Q82955, Q806798, Q4416090]",[Q29468],,Q324742,Prescott Bush,,item,[Q682443]
16876,,[+1955-12-24T00:00:00Z],[Q30],[Q6581097],1388311017,[Q49085],,"[Q10800557, Q10798782]",,,Q452552,Grand L. Bush,,item,
17154,,[+1984-06-25T00:00:00Z],[Q30],[Q6581072],1392040371,,,"[Q4610556, Q33999, Q3501317]",,,Q456169,Lauren Bush,,item,
20670,,[+1970-09-20T00:00:00Z],[Q30],[Q6581072],1391193823,[Q49085],,"[Q33999, Q10800557, Q10798782]",,,Q536025,N'Bushe Wright,,item,
44750,"[Marvin Bush, Marvin Pierce Bush]",[+1956-10-22T00:00:00Z],[Q30],[Q6581097],1311310452,,,[Q43845],,,Q1375345,Marvin P. Bush,,item,
44772,[Samuel Prescott Bush],[+1863-10-04T00:00:00Z],[Q30],[Q6581097],1373387625,,,"[Q43845, Q131524]",,,Q1376227,Samuel P. Bush,,item,
45548,,[+1869-01-20T00:00:00Z],[Q30],[Q6581097],1323621663,,,"[Q1622272, Q2504617]",,,Q1394511,Albert Bushnell Johnson,,item,
52576,,[+1921-12-16T00:00:00Z],[Q183],[Q6581097],1329921163,,,"[Q9385011, Q1622272]",,,Q1592275,Karl-August Bushe,,item,


## Summary : available fields and infos 
- Each person is identified by an (unique? check if any duplicate) id (column id), matching the one on wikipedia
- Label contains one specific label for this person, as it appears in wikipedia's url. 
- Aliases contain the most used names to refer to this person. Can be empty if no other than the one indicated in labels. 
- Infos are either none, or the reference of a QID article. Mostly, these references are created when a link to a referenced wikipedia elements is added in a person's wikipedia page. 
- Occupation QIDs can be relevant for us. Sample QIDs are: Q82955 - Politician, Q189290 - Military officier, Q39631 - Physician (medecine), Q30461 - President... 
- Political party is also given, if any. For congresspersons, their corresponding ID is also given. 
- No time distinction is available, i.e. if a person changed its political party (or any other attribute available here) during his life, both political parties will be indicated. 

## Issues : missing data
- Some data are clearly missing above. George W. Bush may not be the smartest person alive, but he still holds a couple of academic degrees. For some people the degree is simply indicated as text (and not a hyperlink) in the wikipedia page, but it's not the case for all. George's biography clearly states a Bachelor of Arts as a hyperlink although we can not find it in the data set. It may be a timing issue.  
- In consequence, let's inspect how the fields of interest for us will be impacted by missing data. 

First, we will start by removing people born before 1900 from our dataset. There are 3 main reasons : 
- They do not truly impact how biased a news source is in reporting current events, even though we could expect conservative newspapers to cite more secessionist generals and left-wing newspapers to cite more Martin Luther King. 
- American political parties used to switch from left to right (fun fact : Lincoln was a Republican, left-wing at that time) and there were a few major older parties that disapeared today. 
- Completeness of the dataset for people born long time ago is expected to be low : there were no "political parties" at the time of Cesar. 

In [11]:
# convert the array of strings to a integer year. We keep only the year, for those born after 1900 
def filter_date_1900(x):
    if x is not None:
        # we don't care about those born in before J.C (- as first character) or before 1900
        if (x[0])[0] == "+" and int((x[0])[1:5]) >= 1900 :
            return int((x[0])[1:5])
        else: 
            return None

wiki_df["date_of_birth"] = wiki_df["date_of_birth"].apply(lambda x: filter_date_1900(x))
# we discard the na value. Note that this could also discard a few people that simply 
# did not had their date of birth in wikidata, but may be born afterwards. 

wiki_df = wiki_df.dropna(subset = ["date_of_birth"])
wiki_df.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",1952.0,[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",1946.0,[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
5,"[Augusto Pinochet Ugarte, Augusto José Ramón P...",1915.0,[Q298],[Q6581097],1392242213,,,"[Q189290, Q82955]",[Q327591],,Q368,Augusto Pinochet,,item,[Q1841]
8,"[Neil Percival Young, Shakey, Godfather of Gru...",1945.0,"[Q16, Q30]",[Q6581097],1395459626,,,"[Q177220, Q488205, Q2526255, Q639669, Q1881462...",,,Q633,Neil Young,,item,
9,,1969.0,[Q183],[Q6581097],1340253739,,,"[Q33231, Q41546637]",,,Q640,Harald Krichel,,item,


In [12]:
# Proportion of no academic degree or missing academic degrees. 
no_academic_found = wiki_df[wiki_df["academic_degree"].isnull()].size 
print("The proportion of people with no academic degree found is {0:.1%}".format(no_academic_found/wiki_df.size))

The proportion of people with no academic degree found is 98.1%


In [13]:
# Proportion of missing occupations. 

no_occupation_found = wiki_df[wiki_df["occupation"].isnull()].size 
print("The proportion of people with no occupation found is {0:.1%}".format(no_occupation_found/wiki_df.size))

The proportion of people with no occupation found is 16.6%


There is clearly some issue with the academic degrees, we may not want to use this field. In terms of occupation, "only" 16% are missing. Here is a small exemple of such cases. 

In [14]:
wiki_df[wiki_df["occupation"].isnull()].head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
72,,1919.0,,[Q6581097],1390158852,,,,,,Q4291,Ante Bilobrk,,item,
545,,1958.0,,[Q6581097],1309589591,[Q127885],,,,,Q29830,Vukašin Šoškoćanin,,item,
604,"[Greg DePalma, Gregory J. DePalma]",1932.0,,[Q6581097],1309589720,,,,,,Q33371,Gregory DePalma,,item,
1030,,1924.0,,[Q6581097],1390159368,,,,,,Q53595,Pranas Brazinskas,,item,
2289,,1922.0,[Q15180],[Q6581097],1327065855,,,,[Q79854],,Q73926,Oleg Goncharenko,,item,


Now, let's check for integrity in terms of political parties

In [15]:
have_political_party = wiki_df[wiki_df["party"].isnull()].size
print("The proportion of people with a political party, out of the total population found is", 1-have_political_party/wiki_df.size)

The proportion of people with a political party, out of the total population found is 0.0794478078133456


Now we want to see how many politicians have a political party. First we have to find a right format for the occupation method. To do so, we will convert the array of strings in the "occupation" column to a list of strings. 

In [16]:
# converts an array of strings to one string 
def array_to_string(s):
    if s is not None:
        concat_string = "" 
        for elem in s: 
            concat_string += elem + ","
        return concat_string 
    else: 
        return None

In [17]:
wiki_df["occupation_str"] = wiki_df["occupation"].apply(lambda x: array_to_string(x))

In [18]:
# we of course don't forget to set na values to a default "False"
# Q82955 designates politicians
are_politicians = wiki_df[wiki_df["occupation_str"].str.contains("Q82955", na = False)].size
print("The proportion of politicians found is {0:.1%}".format(are_politicians/wiki_df.size))

The proportion of politicians found is 9.7%


In [19]:
have_political_party = wiki_df[wiki_df["occupation_str"].str.contains("Q82955", na = False) & wiki_df["party"].isnull()].size 
print("Among the politicians, the part having a political party is {0:.1%}".format(1-have_political_party/wiki_df.size))

Among the politicians, the part having a political party is 96.3%


This proportion is big enough for us to draw valid conclusions on our set. Similar proportions can be computed easily for other occupations. 

## Extraction

We will create a dictionary containing the IDs and political parties for all persons having a political party. Note we only care about persons born after 1900. Some people could have more than one party. 

In [20]:
dic = wiki_df[wiki_df["party"].notna()].set_index("id")["party"].to_dict()

# Further Data Inspection
Since our end goal will rely on which political party people that get quoted adhere to, let's analyze what properties the data has in this regard.

Firstly, let's take a sample dataset (2020 shrank) and see how many people in it have a political party. For now we will analyze only those quotations that only have a single QID associated to them.

In [21]:
# We want to check how many of the single-QID quotations belong to a people with political parties
num_single_qid = 0
num_politicians = 0
with load_shrank_dataset(2020) as input_file:
    for i, input_line in enumerate(input_file):
        qids = json.loads(input_line)['qids']
        if len(qids) == 1:
            num_single_qid += 1
            if qids[0] in dic.keys(): num_politicians += 1
        # if i > 1e4: break
print("Processed {} quotations.".format(i))
print("Out of {} quotations that have only one associated QID, {} are from persons labeled with a political party.".format(num_single_qid, num_politicians))
print("Rate: {0:.1%}".format(num_politicians/num_single_qid))

Processed 3443604 quotations.
Out of 2511905 quotations that have only one associated QID, 789714 are from persons labeled with a political party.
Rate: 31.4%


Quotations containing a list of QIDs won't be analyzed for now, because they might require some form of manual checking. We will dive deeper into how we can utilize them only if single-QIDs quotations don't provide enough information.
One possible way to save some multi-QID quotations would be checking if all the associated QIDs adhere to the same political party. This hypothesis seems very unlikely so it will be developed only in case of need.

# Conclusion

With this initial pipeline in place, we have laid the foundation from which we will address how polarized news sources are. The shrank versions of the datasets allow us to perform large scale operations almost twice as fast, but also act as a starting point for further splitting.

We have yet to decide which news sources will undergo the analysis process, so we have refrained from breaking down further our data sets. In case such operation will be needed, it will be eased thanks to the tools constructed in this pipeline.