# Create a machine learning model that will correctly identify politician names and political parties

In [16]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.language import Language

import json
import re

import preprocessor as p
import string
import langdetect
import nltk

import pandas as pd

import requests
from bs4 import BeautifulSoup as bs
import html5lib

## Get data from pdf
import pdfplumber

In [3]:
uhuru_refs = [
    "Uhuru Muigai Kenyatta",
    "Number one",
    "Uhunye",
    "Jayden"
]
raila_refs = [
    "Raila Amolo Odinga",
    "Baba",
    "Rao",
    "Kitendawili",
    
    ]
ruto_refs =[
    "William Samoei Ruto",
    "Willie",
    "Deputy President",
    "DP",
    "Wheelbarrow",
    "Bwana Mashamba",
    "WSR"
]
other_politicians = [
    "Kalonzo Musyoka",
    "Musalia Mudavadi",
    "William Kabogo",
    "Moses Wetangula",
    "Jamali Gaddafi",
    "Roy Smith Mwatia Rufftone",
    "Felix Odiwour Jalango",
    "Jasper Muthomi",
    "MC Jessy",
    "Tabitha Karanja",
    "Anita Soina",
    "Jackson Makini Prezzo",
    "Alex Mwakideu",
    "Tony Kwalanda",
    "Davidson Ngibuini DNG"
    "Mohammed Ali"
    "Johnson Sakaja",
    "Kimani Wamatangi",
    "Irungu Kangata",
    "James Orengo",
    "Susan Kihika",
    "Ledama Olekina",
    "Charles Kibiru",
    "Cleophas Malala",
    "George Khaniri",
    "Mithika Linturi",
    "Mutula Kilonzo Jnr",
    "Kithure Kindiki",
    "Mohamed Mahamud",
    "Sam Ongeri",
    "Fred Outa",
    "Ochillo Ayacko",
    "Steward Madzayo",
    "Margaret Kamar",
    "Kipchumba Murkomen",
    "Okong’o Omogeni",
    "Johnson Sakaja",
    "Anyang Nyong’o",
    "Alfred Mutua",
    "Evans Kidero"
]

In [4]:
political_parties = [
    "ODM",
    "Azimio la Umoja",
    "UDA",
    "Farmers Party",
    "Orange Democratic Movement",
    "Wiper",
    "NARC Kenya",
    "Chama Cha Kazi",
    "Tujibebe Wakenya",
    "Amani National Congress",
    "Ford Kenya",
    "Azimio",
    "Ford"
]

## Get more names and politcal parties from pdf file

### Senate members and parties

In [5]:
def get_names_from_pdf(file):
    pdf = pdfplumber.open(file)
    no_of_pages = len(pdf.pages)
    data = []
    for i in range(no_of_pages):
        page = pdf.pages[i]
        table = page.extract_table()
        for row in table:
            name = row[1]
            if name != "Name":
                # clean the name 
                name = name.split("Sen.")[1].split(",")[0]
                data.append(name)
    return data

In [6]:
def get_political_parties_from_pdf(file):
    pdf = pdfplumber.open(file)
    no_of_pages = len(pdf.pages)
    data = []
    for i in range(no_of_pages):
        page = pdf.pages[i]
        table = page.extract_table()
        for row in table:
            party = row[-1]
            if party != "Political Party":
                data.append(party)
    return data

### Get from National Government File

In [7]:
def get_names_from_pdf_NA(file):
    pdf = pdfplumber.open(file)
    no_of_pages = len(pdf.pages)
    data = []
    for i in range(14):
        page = pdf.pages[i]
        table = page.extract_table()
        for row in table:
            name = row[1]
            if name != "NAME":
                if name == "" or name == None:
                    name = row[2]
                if name != "" and name != None:
                    data.append(name)
    return data

In [8]:
def get_parties_from_pdf_NA(file):
    pdf = pdfplumber.open(file)
    data = []
    page = pdf.pages[14]
    table = page.extract_table()
    for row in table:
        name = row[0]
        if name != "PARTY NAME":
            data.append(name)
            
        name = row[1]
        if name != "PARTY \nABBREVIATION":
            data.append(name)
    return data

### Get from url containing registered political parties

### Code used to get the table for the political parties

page = requests.get("https://victormatara.com/list-of-all-registered-political-parties-in-kenya-2018/")

df_list = pd.read_html(page.text) # this parses all the tables in webpages to a list

df = df_list[0]

df.to_csv("parties.csv")

In [9]:
def get_registered_parties(file):
    data = []
    df = pd.read_csv(file)
    names = df["PARTY NAME"].tolist()
    abbreviations = df["ABBREV"].tolist()
    
    for name in names:
        data.append(name)
        
    for abb in abbreviations:
        data.append(abb)
    return data

In [10]:
#parties from the senate file
senate_parties = get_political_parties_from_pdf("LIST OF SENATE DELEGATIONS AND POLITICAL PARTIES.pdf")
for party in senate_parties:
    political_parties.append(party)
    
#parties from the national_assembly
parties_NA = get_parties_from_pdf_NA("LIST OF MEMBERS OF NA BY PARTY - 25012022.pdf")
parties_NA
for party in parties_NA:
    political_parties.append(party)
    
#parties from registered parties url
registered_parties = get_registered_parties("parties.csv")
for party in registered_parties:
    political_parties.append(party)

political_parties= list(set(political_parties))

In [11]:
#get names from the senate file
senate_names = get_names_from_pdf("LIST OF SENATE DELEGATIONS AND POLITICAL PARTIES.pdf")
for name in senate_names:
    other_politicians.append(name)
    
#get parties from the national assembly file
national_assembly_names = get_names_from_pdf_NA("LIST OF MEMBERS OF NA BY PARTY - 25012022.pdf")
for name in national_assembly_names:
    other_politicians.append(name)
    
other_politicians = list(set(other_politicians))

## Create more names and references for better detection

In [1]:
stopwords = nltk.corpus.stopwords.words("english")
other_stopwords=["one", "President", "MC", "Kenya", "party", "-", "_", "Democratic", "Chama", "National", "la", "Amani, Umoja,", "These", "Confidant"]
for stop in other_stopwords:
    stopwords.append(stop)

NameError: name 'nltk' is not defined

In [13]:
def generate_better_names(refs):
    new_characters = []
    
    #first add the complete names
    for r in refs:
        new_characters.append(r)
        
    #print(len(new_characters))
    #split the names into indvidual names
    for r in refs:
        names = r.split()
        for name in names:
            name = name.strip()
            if name not in stopwords:
                new_characters.append(name)
    #print(len(new_characters))

    titles = ["Dr.", "Professor", "Mr.", "Mrs.", "Ms.", "Miss", "Aunt", "Uncle", "Mr. and Mrs."]
    final_characters = []
    for char in new_characters:
        if char !="":
            final_characters.append(char)
        for title in titles:
            titled_char = f"{title} {char}"
            final_characters.append(titled_char)
    #print(len(final_characters))            
    return final_characters
    

In [14]:
ruto_fnl_refs = generate_better_names(ruto_refs)
uhuru_fnl_refs= generate_better_names(uhuru_refs)
raila_fnl_refs = generate_better_names(raila_refs)
other_politicians_refs = generate_better_names(other_politicians)


## Create training data

In [15]:
def create_training_data(refs, type):
    data = generate_better_names(refs)
    patterns = []
    
    for item in data:
        pattern = {
            "label": type,
            "pattern": item
        }
        patterns.append(pattern)
        
    return patterns


In [16]:
uhuru_patterns = create_training_data(uhuru_fnl_refs, "POLITICIAN")
ruto_patterns = create_training_data(ruto_fnl_refs, "POLITICIAN")
raila_patterns = create_training_data(raila_fnl_refs, "POLITICIAN")
other_politicians_patterns = create_training_data(other_politicians_refs, "POLITICIAN")
poltical_parties_patters = create_training_data(political_parties, "POLITICAL_PARTY")

In [17]:
patterns = [uhuru_patterns, ruto_patterns, raila_patterns, other_politicians_patterns, poltical_parties_patters]

In [18]:
def generate_rules(patterns):
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler = nlp.add_pipe('entity_ruler')
    for pattern in patterns:
        ruler.add_patterns(pattern)
    nlp.to_disk("political_ner")

In [19]:
generate_rules(patterns)

## Test the model

In [2]:
df1 = pd.read_csv("data/facebook12.csv")
df2 = pd.read_csv("data/facebook_6.csv")
df3= pd.read_csv("data/facebook4.csv")
df4= pd.read_csv("data/facebook2.csv")
df5= pd.read_csv("data/facebook9.csv")
df6= pd.read_csv("data/facebook10.csv")
df7= pd.read_csv("data/facebook16.csv")
df8= pd.read_csv("data/facebook17.csv")
df9= pd.read_csv("data/facebook18.csv")
df10= pd.read_csv("data/facebook19.csv")



In [3]:
df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10], axis=0)
df.reset_index()

Unnamed: 0,index,Username,comment
0,0,Mulei Ule Msee,What will happen to kidero
1,1,Daniel David,If only God could hear my prayer then Magwanga...
2,2,Ngiroh Christopher Kipsang,Then you will hear him pointing fingers at Rut...
3,3,Humfred Kabasa Junior,The so called ODM is doing more harm than good...
4,4,Den Nish,"Chinua Achebe says "" a goat is not left to suf..."
...,...,...,...
2781,332,Brayoo Wiseman,Ruto is confused
2782,333,Njeri Anjeline,Stop that nonsense.
2783,334,William Ouma,So Raila wanted to impeach Uhuru so that Ruto ...
2784,335,Zipsy Ndush,Philemon Liech kumbe unaenda muchene hivo


## Clean text

In [22]:
# regular expression to remove additional punctuation marks and other meta data such as breaks
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
REPLACE_WITH_SPACE = re.compile("(<br\s/><br\s/?)|(-)|(/)|(:).")
def tweet_processing(line):
    # empty list to hold cleaned tweets
        # clean using tweet preprocessor
    clean = p.clean(line)
    # remove punctuations
    text  = "".join([char for char in clean if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    text = REPLACE_NO_SPACE.sub("", text)
    text = REPLACE_WITH_SPACE.sub(" ", text)
    return text


#run the method
df['clean_comments'] = df['comment'].apply(lambda x: tweet_processing(x))


In [24]:
nlp = spacy.load("political_ner")

In [25]:
df["tags"] = df["comment"].apply(lambda x: [(tag.text, tag.label_) for tag in nlp(x).ents])

In [26]:
df[1:50]

Unnamed: 0,Username,comment,clean_comments,tags
1,Daniel David,If only God could hear my prayer then Magwanga...,If only God could hear my prayer then Magwanga...,[]
2,Ngiroh Christopher Kipsang,Then you will hear him pointing fingers at Rut...,Then you will hear him pointing fingers at Rut...,"[(Raila, POLITICIAN)]"
3,Humfred Kabasa Junior,The so called ODM is doing more harm than good...,The so called ODM is doing more harm than good...,"[(ODM, POLITICAL_PARTY), (ODM, POLITICAL_PARTY)]"
4,Den Nish,"Chinua Achebe says "" a goat is not left to suf...",Chinua Achebe says a goat is not left to suff...,"[(OF, POLITICAL_PARTY)]"
5,Philip Chianda Ja Nyamware,Those that are belittling Wanga..that lady is ...,Those that are belittling Wangathat lady is a ...,[]
6,Richard Njuguna,You mean Mr chairman you can't make it to gube...,You mean Mr chairman you cant make it to guber...,[]
7,Fred Okeke,The decision and selection will be made in Kar...,The decision and selection will be made in Kar...,[]
8,Vitz Cherui,Kidero might not be the popular candidate amon...,Kidero might not be the popular candidate amon...,"[(Kidero, POLITICIAN), (Baba, POLITICIAN)]"
9,Omiti Otieno,"As homabain from Kasipul constituency, I'm goi...",As homabain from Kasipul constituency Im going...,"[(William, POLITICIAN), (Ruto, POLITICIAN), (R..."
10,Crispin Nyakweba,"As a resident of Homabay Asumbi, we would rath...",As a resident of Homabay Asumbi we would rathe...,"[(ODM, POLITICAL_PARTY)]"


In [28]:
tags = df["tags"].to_list()
len(tags)

2786

In [29]:
tags = list(filter(None, tags))

In [31]:
len(tags)

916

# CREATE TRAINING DATA

In [15]:
def save_data(file, data):
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent = 4)

In [4]:
def test_model(model, text):
    doc = nlp(text)
    results =[]
    entities = []
    
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
        
    if len(entities) >0:
        results = [text, {"entities": entities}]
        return results

In [5]:
nlp = spacy.load("political_ner")

In [11]:
TRAIN_DATA = []
all_comments = df["comment"].to_list()

for comment in all_comments:
    #clean the comments
    comment = p.clean(comment).strip()
    results = test_model(nlp, comment)
    
    # include only entities that have found a result
    if results != None:
        TRAIN_DATA.append(results)

In [13]:
len(TRAIN_DATA)

915

In [18]:
save_data("data/political_training_data.json", TRAIN_DATA)