In [1]:
import sys  
import os
import pandas as pd
import re

## Helper Functions to Read Text File

code from http://stackabuse.com/read-a-file-line-by-line-in-python/

In [2]:
def order_bag_of_words(bag_of_words, desc=False):  
   words = [(word, cnt) for word, cnt in bag_of_words.items()]
   return sorted(words, key=lambda x: x[1], reverse=desc)


In [3]:
def record_word_cnt(words, bag_of_words):  
   for word in words:
       if word != '':
           if word.lower() in bag_of_words:
               bag_of_words[word.lower()] += 1
           else:
               bag_of_words[word.lower()] = 0

In [4]:
# filepath = sys.argv[1]
def openfile(filepath):
    if not os.path.isfile(filepath):
       print("File path {} does not exist. Exiting...".format(filepath))
       sys.exit()

    return_array = []
        
    with open(filepath) as fp:
       for line in fp:
           return_array.append(line)
    
    return return_array

## Read Text File

In [5]:
lines = openfile('1990-election-results-raw.txt')

First we take a look at the the names of constituencies

In [6]:
for (i,line) in enumerate(lines):
    lines[i] = lines[i].strip()

In [7]:
for (i,line) in enumerate(lines):
    if "cand." in line:
        print("\nline " + str(i-1) + ": " + lines[i-1] + '\n' + ' '.join('{}'.format(k) for k in line.split(";")))


line 0: Kayan-1 (Yangon)
41,226  33,996  31,328  4 cand.

line 4: Kayan-2 (Yangon)
40,018  32,524  28,841  6 cand.

line 8: Wakema-2 (Ayeyarwady)
72,831  49,584  41,528  4 cand.

line 17: Mandalay North East-1 (Mandalay)
37,046  30,833  29,005  8 cand.

line 21: Minbu-1 (Magway)
43,600  31,928  28,506  6 cand.

line 25: Minbu-2 (Magway)
42,102  28,063  24,456  3 cand.

line 30: Mandalay North West-2 (Mandalay)
50,451  38,0943  35,234  6 cand.

line 34: Ngaphe (Magway)
21,337  17,077  15,107  3 cand.

line 38: Mandalay North East-2 (Mandalay)
38,885  30,746  29,401  6 cand.

line 42: Kunyangonn (Yangon)
56,590  47,680  42,287  6 cand.

line 46: Patheingyi-1 (Mandalay)
36,244  27,769  24,662  6 cand.

line 50: Patheingyi-2 (Mandalay)
34,087  27,070  24,014  7 cand.

line 55: Kyaiklat-1 (Ayeyarwady)
45,458  35,056  30,873  4 cand.

line 59: Kyaiklat-2 (Ayeyarwady)
46,027  33,590  29,710  4 cand.

line 63: Pyapon-2 (Ayeyarwady)
52684  34,758  31,864  5 cand.

line 67: Dedaye-2 (Ayeyarwady

## Clean Constituency Names and Total Vote Counts

In [8]:
for (i,line) in enumerate(lines):
    if "cand." in line and "(" not in lines[i-1]:
        print("\nline " + str(i-1) + ": " + lines[i-1] + '\n' + ' '.join('{}'.format(k) for k in line.split(";")))


line 116: Zigon {Bago)
41,957  33,429  28,158  5 cand.

line 125: 35,260; 23,568; [not given, but valid votes tallied equal votes
cast]  5 cand.

line 141: 648 10%
77,887  62,961  57,674  7 cand.

line 404: 3081 13%
42,114  28,353  24,086  4 cand.

line 601: 2547  8%
59,033  36,755  32,815  5 cand.

line 989: 2698 28%
18,651  13,831  13,137  4 cand.

line 1048: 4145 17%
33,672  23,528  19,857  5 cand.

line 1383: 5074 15%
37,458  33,736  31,110  3 cand.

line 1589: 2641 27%
58,576  51,009  45,209  3 cand.

line 1812: 527 10%
33,390  23,765  20,734  4 cand.

line 1848: 2080 10%
56,212  36,124  29,407  7 cand.

line 1955: 3186 24%
48,809  35,172  27,620  7 cand.

line 1997: Mrauk-U-2
44,524  32,026  27,200  6 cand.

line 2024: 2121 11%
1,284  879  806  2 cand.

line 2086: 2221  4%
8,847  5,511  4,678  3 cand.

line 2095: 115  2%
5,572  4,951  4,724  2 cand.


Looks like everything is well formed except lines 116, 125, 141, 404, 601, 989, 1048, 1383, 1589, 1812, 1848, 1955, 1997, 2024, 2086 and 2095

In [9]:
inserted_lines = 0

In [10]:
lines[116] = lines[116].replace("{","()")

In [11]:
print(lines[125])
print(lines[126])

lines[125] = lines[125].strip() + lines[126].strip()
print(lines[125])

new_arr = lines[125].split("; ")

## In this constituency, valid votes equaled to votes cast
new_arr[2] = new_arr[1]
lines[125] = "; ".join(new_arr)
print(lines[125])
print(lines[126])

## Remove the dangling line
lines.pop(126 + inserted_lines)
#inserted_lines -= 1


35,260; 23,568; [not given, but valid votes tallied equal votes
cast]; 5 cand.
35,260; 23,568; [not given, but valid votes tallied equal votescast]; 5 cand.
35,260; 23,568; 23,568; 5 cand.
cast]; 5 cand.


'cast]; 5 cand.'

In [12]:
## For these rows the constituency name is misplaced a few lines above the vote counts instead of right above
lines.insert(141+inserted_lines,"Sagaing-2 (Sagaing)")
inserted_lines += 1
lines.insert(404+inserted_lines,"Thegon-1 (Bago)")
inserted_lines += 1
lines.insert(601+inserted_lines,"Thaton-1 (Mon)")
inserted_lines += 1
lines.insert(989+inserted_lines,"Tiddim-1 (Chin)")
inserted_lines += 1
lines.insert(1048+inserted_lines,"Gyobingauk-2 (Bago)")
inserted_lines += 1
lines.insert(1383+inserted_lines,"Saw (Magway)")
inserted_lines += 1
lines.insert(1589+inserted_lines,"Yekyi-1 (Ayeyarwady)")
inserted_lines += 1
lines.insert(1812+inserted_lines,"Homalin-1 (Sagaing)")
inserted_lines += 1
lines.insert(1848+inserted_lines,"Mogaung (Kachin)")
inserted_lines += 1
lines.insert(1955+inserted_lines,"Kyauktaw-1 (Rakhine)")
inserted_lines += 1
lines[1997+inserted_lines-1] = "Mrauk-U-2 (Rakhine)"
lines.insert(2024+inserted_lines,"Sumprabum (Kachin)")
inserted_lines += 1
lines.insert(2086+inserted_lines,"Lahe (Sagaing)")
inserted_lines += 1
lines.insert(2095+inserted_lines,"Leshi (Sagaing)")

## Put constituency names and vote counts in a dataframe

In [13]:
consti_df = pd.DataFrame(columns=['line','constituency','state','eligible_voters','votes_cast','valid_votes','num_candidates'])

In [14]:
for (i,line) in enumerate(lines):
    line = line.strip()
    if "cand." in line:
        prev_line = lines[i-1].strip()
        [constituency,state] = prev_line.split('(')
        constituency = constituency.strip()
        state = state.replace(')','').strip()
        [eligible_voters,votes_cast,valid_votes,num_candidates] = map(int,line.replace('cand.','') \
                    .replace(',','').replace(' ','').split(";"))
        #print(constituency + ", " + state + ", " + str(eligible_voters) + ", " \
        #      + str(votes_cast) + ", " + str(valid_votes) + ", " + str(num_candidates))
        d = {'line': i, \
             'constituency' : [constituency], \
             'state' : [state], \
             'eligible_voters' : [eligible_voters], \
             'votes_cast' : [votes_cast], \
             'valid_votes' : [valid_votes], \
             'num_candidates' : [num_candidates]}
        consti_df = consti_df.append(pd.DataFrame(data=d))
    
consti_df = consti_df.reset_index()
consti_df.drop(['index'], axis = 1, inplace=True)
consti_df

Unnamed: 0,constituency,eligible_voters,line,num_candidates,state,valid_votes,votes_cast
0,Kayan-1,41226.0,1.0,4.0,Yangon,31328.0,33996.0
1,Kayan-2,40018.0,5.0,6.0,Yangon,28841.0,32524.0
2,Wakema-2,72831.0,9.0,4.0,Ayeyarwady,41528.0,49584.0
3,Mandalay North East-1,37046.0,18.0,8.0,Mandalay,29005.0,30833.0
4,Minbu-1,43600.0,22.0,6.0,Magway,28506.0,31928.0
5,Minbu-2,42102.0,26.0,3.0,Magway,24456.0,28063.0
6,Mandalay North West-2,50451.0,31.0,6.0,Mandalay,35234.0,380943.0
7,Ngaphe,21337.0,35.0,3.0,Magway,15107.0,17077.0
8,Mandalay North East-2,38885.0,39.0,6.0,Mandalay,29401.0,30746.0
9,Kunyangonn,56590.0,43.0,6.0,Yangon,42287.0,47680.0


## Loop through again to add candidate vote counts

In [15]:
def parse_candidates_lines(lines_to_parse):
    return_list = []
    for l in lines_to_parse:
        #l = lines_to_parse[i]
        candidate = ''
        party = ''
        votes = ''
        percent = ''
        
        regex_candidate = r"(.*)\s+\((.*)\)"
        regex_candidate_1 = r"([a-zA-Z- ]+)"
        regex_vote = r"(\d+)\s+(\d+)%"
        
        candidate_capture = re.findall(regex_candidate, l)
        if len(candidate_capture) > 0:
            candidate = candidate_capture[0][0]
            party = candidate_capture[0][1]
        else:
            candidate_capture_1 = re.findall(regex_candidate_1, l)
            if len(candidate_capture_1) > 0 and len(candidate_capture_1[0].strip()):
                candidate = candidate_capture_1[0]
                party = 'Unknown'
            
        
        vote_capture = re.findall(regex_vote, l)
        if len(vote_capture) > 0:
            votes = vote_capture[0][0]
            percent = vote_capture[0][1]
        
        #print('{}, {}, {}, {}'.format(candidate,party,votes,percent))
        return_list.append({'candidate' : candidate.strip(), \
                            'party' : party.strip(), \
                            'votes' : votes.strip(), \
                            'percent' : percent.strip()})
    return return_list
    

In [16]:
def get_structured_candidates_votes(candidates_votes_list,constituency):
    
    candidates = []
    parties = []
    votes = []
    percents = []
    
    rows_to_remove = []
    
    # prioritise parsing complete rows
    for i,l in enumerate(candidates_votes_list):
        if l['candidate'] != '' and l['party'] != '' and l['votes'] != '' and l['percent'] != '':
            candidates.append(l['candidate'])
            parties.append(l['party'])
            votes.append(l['votes'])
            percents.append(l['percent'])
            rows_to_remove.append(i)
    
        
    temp_list = [x for i,x in enumerate(candidates_votes_list) if i not in rows_to_remove] 
    candidates_votes_list = temp_list
    
    # now add the non-complete rows
    for l in candidates_votes_list:
        if l['candidate'] != '':
            candidates.append(l['candidate'])
        if l['party'] != '':
            parties.append(l['party'])
        if l['votes'] != '':
            votes.append(l['votes'])
        if l['percent'] != '':
            percents.append(l['percent'])
            
    maxlength = min([len(candidates),len(parties),len(votes),len(percents)])
    candidates = candidates[0:maxlength]
    parties = parties[0:maxlength]
    votes = votes[0:maxlength]
    percents = percents[0:maxlength]
    constituencies = [constituency]*(maxlength)
    
    to_return = pd.DataFrame(data={'candidate': candidates, 'party' : parties, \
                                   'votes' : votes, 'percent' : percents, 'constituency': constituencies})
    #print(to_return)
    
    return to_return

In [21]:
candidates_df = pd.DataFrame(columns=['constituency','eligible_voters','line', \
                'num_candidates','state','valid_votes','votes_cast','candidate', \
                'party','percent','votes'])

for index, row in consti_df.iterrows():
    #print(index)
    
    # if not at last 2 rows
    if index < consti_df.index.size - 2:
        lines_to_parse = lines[ int(row['line']) + 1 : int(consti_df.iloc[index+1]['line']) - 1 ] 
        #print(lines_to_parse)
        d = parse_candidates_lines(lines_to_parse)
        df_to_append = get_structured_candidates_votes(d,row['constituency'])
        
    # last 2 constituencies results are not well formatted
    elif index == consti_df.index.size - 2:
        lines_to_parse = lines[ int(row['line']) + 1 : int(consti_df.iloc[index+1]['line']) + 4 ]
        #print(lines_to_parse)
        d = parse_candidates_lines(lines_to_parse)
        df_to_append = get_structured_candidates_votes(d,row['constituency'])
    else:
        lines_to_parse = lines[ int(row['line']) + 4 : len(lines) - 1 ]
        #print(lines_to_parse)
        d = parse_candidates_lines(lines_to_parse)
        df_to_append = get_structured_candidates_votes(d,row['constituency'])
     
    temp_df = pd.DataFrame(data = row).transpose()
    temp_df = pd.merge(temp_df,df_to_append,on='constituency',how='inner')
    candidates_df = candidates_df.append(temp_df)
    #print(temp_df)
    
candidates_df

  constituency eligible_voters line num_candidates   state valid_votes  \
0      Kayan-1           41226    1              4  Yangon       31328   
1      Kayan-1           41226    1              4  Yangon       31328   

  votes_cast   candidate party percent  votes  
0      33996  Kyaw Thwin   NLD      68  21278  
1      33996  Than Maung   NUP      27   8408  
  constituency eligible_voters line num_candidates   state valid_votes  \
0      Kayan-2           40018    5              6  Yangon       28841   
1      Kayan-2           40018    5              6  Yangon       28841   

  votes_cast      candidate party percent  votes  
0      32524  Tin Maung Win   NLD      60  17428  
1      32524        Zaw Win   NUP      35  10027  
  constituency eligible_voters line num_candidates       state valid_votes  \
0     Wakema-2           72831    9              4  Ayeyarwady       41528   
1     Wakema-2           72831    9              4  Ayeyarwady       41528   
2     Wakema-2         

Unnamed: 0,constituency,eligible_voters,line,num_candidates,state,valid_votes,votes_cast,candidate,party,percent,votes
0,Kayan-1,41226,1,4,Yangon,31328,33996,Kyaw Thwin,NLD,68,21278
1,Kayan-1,41226,1,4,Yangon,31328,33996,Than Maung,NUP,27,8408
0,Kayan-2,40018,5,6,Yangon,28841,32524,Tin Maung Win,NLD,60,17428
1,Kayan-2,40018,5,6,Yangon,28841,32524,Zaw Win,NUP,35,10027
0,Wakema-2,72831,9,4,Ayeyarwady,41528,49584,Col. Thet Wai,NUP,43,17893
1,Wakema-2,72831,9,4,Ayeyarwady,41528,49584,Myint Swe,LDP,30,12307
2,Wakema-2,72831,9,4,Ayeyarwady,41528,49584,Dr. Saw Bei Htoo,UKL,23,9629
0,Mandalay North East-1,37046,18,8,Mandalay,29005,30833,Maung Maung Aye,NLD,83,24100
1,Mandalay North East-1,37046,18,8,Mandalay,29005,30833,Kyaw Aye,WUO,10,2822
0,Minbu-1,43600,22,6,Magway,28506,31928,Soe Myint,NLD,75,21324


In [22]:
candidates_df.to_csv('1990_election_results.csv')