# Connections between MKs 
### **MK = Member of the Knesset (חבר כנסת)**
Based on transcripts of the Knesset committees.<br/>
The work was done at the 'public knowledge workshop' hackathon and won the 3rd-place prize.

In [1]:
import pandas as pd
import networkx as nx

# Module-level accumulators filled by the row callbacks defined further down.
rows = []  # meeting rows appended by to_dataframe
protocol_rows = []  # protocol-part rows appended by protocols_to_dataframe

## Collecting the committee texts and analyzing them
We used a library called dataflows since it is the one used by the 'public knowledge workshop' we worked with.<br/>
The data download is done in parts - each committee text is divided into parts and processed separately.<br/>
Downloading each Knesset's data and analyzing it took around 6 hours per Knesset.<br/>
This is why we kept a cache of the downloaded data and saved files of the analyzed data for each Knesset.

## Constants

In [4]:
# Limit processing of protocol parts for development, -1 means no limit.
PROCESS_PARTS_LIMIT = 1

# Enable caching of protocol parts data (not efficient, should only be used for local development with a sensible PROCESS_PARTS_LIMIT)
PROCESS_PARTS_CACHE = True

# Filter the meetings to be processed; these kwargs are passed along to the DataFlows filter_rows processor for the meetings resource
MEETINGS_FILTER_ROWS_KWARGS = {'equals': [{'KnessetNum': 20}]}

# When False, local data is not used - everything is loaded from the knesset data remote storage.
# Setting it to False also enables caching, so nothing is downloaded from remote storage on the 2nd run.
USE_DATA = False

## Load source data

In [37]:
from dataflows import filter_rows, cache
from datapackage_pipelines_knesset.common_flow import load_knesset_data, load_member_names

# Loads a dict containing mapping between knesset member id and the member name
member_names = load_member_names(use_data=USE_DATA)

# Define the flow steps for loading the source committee meetings data and
# keeping only the meetings that match MEETINGS_FILTER_ROWS_KWARGS.
# The actual loading is done later, when the Flow is processed.
load_steps = (
    load_knesset_data('people/committees/meeting-attendees/datapackage.json', USE_DATA),
    filter_rows(**MEETINGS_FILTER_ROWS_KWARGS)
)

if not USE_DATA:
    # when loading from URL - enable caching which will skip loading on 2nd run
    # NOTE(review): the cache path is hard-coded to knesset 20 while the filter above is
    # configurable - presumably the cache must be cleared or renamed when KnessetNum changes; confirm.
    load_steps = (cache(*load_steps, cache_path='.cache/people-committee-meeting-attendees-knesset-20'),)

loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/members/mk_individual/datapackage.json
using cache data from .cache/members-mk-individual-names
loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/meeting-attendees/datapackage.json


## Main processing functions

In [38]:
from collections import defaultdict
from dataflows import Flow

# Named counters shared by the processing callbacks; missing keys start at 0.
stats = defaultdict(int)
# Number of attended meetings, keyed by the ids found in a meeting's attended_mk_individual_ids.
member_attended_meetings = defaultdict(int)

def to_dataframe(row):
    """Append ``row`` to the module-level ``rows`` list and return nothing."""
    rows.append(row)
    
def protocols_to_dataframe(row):
    """Append ``row`` to the module-level ``protocol_rows`` list and return nothing."""
    protocol_rows.append(row)

def process_meeting_protocol_part(row):
    """Count one protocol part in ``stats['processed parts']``; ``row`` itself is not inspected."""
    stats['processed parts'] += 1

def process_meeting(row):
    """Update attendance statistics for one meeting and process its protocol parts.

    Increments ``stats['total meetings']`` and counts the meeting for every id
    in ``row['attended_mk_individual_ids']``. When the meeting has a parsed
    parts file and the parts limit has not been reached, a flow is run over
    that file with ``process_meeting_protocol_part`` as the row callback.

    ``PROCESS_PARTS_LIMIT`` caps the number of processed parts: a negative
    value (e.g. -1) means no limit, and 0 or ``None`` disables part processing.
    The limit is checked once per meeting, before its flow starts.
    """
    stats['total meetings'] += 1
    for mk_id in row['attended_mk_individual_ids'] or ():
        member_attended_meetings[mk_id] += 1

    parts_filename = row['parts_parsed_filename']
    if not parts_filename:
        return

    limit = PROCESS_PARTS_LIMIT
    if not limit:
        return
    # A negative limit is documented as "no limit"; comparing the counter
    # against it directly would never allow any part to be processed.
    if limit > 0 and stats['processed parts'] >= limit:
        return

    steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
    if not USE_DATA and PROCESS_PARTS_CACHE:
        steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
    steps += (process_meeting_protocol_part,)
    Flow(*steps).process()

# Row-processing steps run after the load steps: only to_dataframe, which collects each row.
process_steps = (to_dataframe,)

## Run the flow

In [39]:
from dataflows import Flow, dump_to_path

# Run the load steps and the processing steps, then dump the result
# under data/committee-meeting-attendees-parts.
Flow(*load_steps, *process_steps, dump_to_path('data/committee-meeting-attendees-parts')).process()

using cache data from .cache/people-committee-meeting-attendees-knesset-20


(<datapackage.package.Package at 0x7f7e459e2e80>,
 {'count_of_rows': 10256,
  'bytes': 30129277,
  'hash': '9bb63d3b4c724c88df1416113d0fb80c',
  'dataset_name': None})

## Get committees data

In [None]:
# Download the parsed protocol parts of each collected meeting and hand them
# to process_protocol_rows, stopping after PROCESS_PARTS_LIMIT protocols.
# A negative limit (e.g. -1) means no limit; 0 or None disables processing.
processed_protocols = 0
for row in rows:
    limit = PROCESS_PARTS_LIMIT
    if not limit or (limit > 0 and processed_protocols >= limit):
        break

    parts_filename = row['parts_parsed_filename']
    if not parts_filename:
        continue

    steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
    if not USE_DATA and PROCESS_PARTS_CACHE:
        steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
    # protocols_to_dataframe appends every row of this file to protocol_rows.
    steps += (protocols_to_dataframe,)
    Flow(*steps).process()

    # Batches of 5 rows or fewer are not analyzed yet; their rows stay in
    # protocol_rows and are analyzed together with the next file's rows.
    if len(protocol_rows) > 5:
        process_protocol_rows(protocol_rows)
        protocol_rows = []
        # Count analyzed batches explicitly so the limit check above ends the loop.
        processed_protocols += 1

# Creating a graph of MK connections
An edge from A to B represents that A spoke to B. <br/>
The weight of such an edge is the number of times A spoke to B.

In [28]:
# Directed graph of MK connections; add_edge keeps the interaction count in each edge's 'weight'.
G = nx.DiGraph()

def add_edges(speaker, member_set):
    """Add an edge from ``speaker`` to every member in ``member_set``.

    Each edge is printed and then created or strengthened through
    ``add_edge``. An empty collection does nothing.
    """
    for member in member_set:
        # Log the single edge being added rather than the whole set on every iteration.
        print('%s -> %s' % (speaker, member))
        add_edge(speaker, member)

def add_edge(speaker, member):
    """Record one interaction from ``speaker`` to ``member`` in the graph ``G``.

    Both names are stored with their characters in reverse order. The first
    interaction creates the edge with weight 1; each later one adds 1 to it.
    """
    source, target = speaker[::-1], member[::-1]
    if not G.has_edge(source, target):
        G.add_edge(source, target, weight=1)
        return
    G[source][target]['weight'] += 1
            
def clean(s):
    """Normalize a header or body string.

    ``None`` becomes ``''``. Otherwise titles are removed via
    ``remove_titles``, every character that is neither alphabetic nor
    whitespace is dropped, and surrounding whitespace is trimmed.
    """
    if s is None:
        return ''

    without_titles = remove_titles(s)
    kept_chars = (ch for ch in without_titles if ch.isalpha() or ch.isspace())
    return ''.join(kept_chars).strip()
    
def remove_titles(s):
    """Return ``s`` with every occurrence of the chairperson title removed."""
    chair_title = 'היו"ר'
    return s.replace(chair_title, '')

def first_non_speaker(speaker, curr_index, protocol_rows):
    """Return the closest earlier speaker that differs from ``speaker``.

    Walks backwards from the row just before ``curr_index`` and returns the
    cleaned header of the first row whose header does not contain
    ``speaker``. Returns ``''`` when no such row exists before ``curr_index``.
    """
    # Stop at the start of the list: going further back would wrap around to
    # negative indices (rows from the end) and finally raise IndexError.
    for position in range(curr_index - 1, -1, -1):
        former_speaker = clean(protocol_rows[position]['header'])
        if speaker not in former_speaker:
            return former_speaker
    return ''

def process_protocol_rows(protocol_rows):
    """Add graph edges for the references found in one batch of protocol rows.

    The first 8 rows are skipped. For every remaining row that has both a
    header (the speaker) and a body (the spoken text), edges are added from
    the speaker to:

    * every name in ``member_names`` that appears in the text,
    * the closest earlier different speaker, when the text contains one of
      the second-person words,
    * the speaker itself, when the text contains the first-person word.

    Rows with a missing header or body are printed and skipped; rows whose
    header is empty after cleaning are skipped silently.
    """
    # Debug output: body of the second row of the batch, when there is one.
    if len(protocol_rows) > 1:
        print(protocol_rows[1]['body'])

    indirect_words = ['אתה', 'לך', 'שאתה', ' שאת ']
    self_words = ['אני']
    skipped_rows = 8

    # start=skipped_rows keeps `index` aligned with the full list, which is
    # what first_non_speaker indexes into.
    for index, row in enumerate(protocol_rows[skipped_rows:], start=skipped_rows):
        if row['header'] is None or row['body'] is None:
            print(row)
            continue

        speaker = clean(row['header'])
        if not speaker:
            # Nothing usable to attribute the text to.
            continue
        spoken_text = clean(row['body'])

        direct_references = {member for member in member_names.values() if member in spoken_text}

        # One lookup is enough however many second-person words matched.
        indirect_references = set()
        if any(word in spoken_text for word in indirect_words):
            addressee = first_non_speaker(speaker, index, protocol_rows)
            if addressee:
                indirect_references.add(addressee)

        # Separate name from the word list, so the list is not replaced by
        # the result set after the first row.
        self_references = set()
        if any(word in spoken_text for word in self_words):
            self_references.add(speaker)

        add_edges(speaker, direct_references)
        add_edges(speaker, indirect_references)
        add_edges(speaker, self_references)

In [43]:
# Save the connections graph to disk in GML format.
nx.write_gml(G, './Mk_connections_graph')