In [1]:
import os
import pandas as pd
import numpy as np
import re

In [2]:
TARGET_STATE = "Goa"
TARGET_YEARS = [1967, 2012]
block_size = 8

In [None]:
yearwise_dfs = {}

In [32]:
for year in TARGET_YEARS:
    
    csv_files_folder = os.path.join("states", TARGET_STATE, f"{year}", f"tabula-{year}_ConstituencyData")


    csv_files = [i for i in os.listdir(csv_files_folder) if(i.endswith(".csv"))]
    file_df = pd.DataFrame({"file" : csv_files})
    file_df['file_num'] = file_df['file'].str.split('-').str[-1].str.split(".").str[0].astype("int")
    file_df = file_df.sort_values('file_num')
#     display(file_df)

    block_files_labelled = {
        0 : "Constituency_Name",
        1 : "Candidates_Info",
        2 : "Electors_Info",
        3 : "Voters_Info",
        4 : "Votes_Info",
        5 : "Polling_Stations",
        6 : "Dates",
        7 : "Result"
    }

    constituency_wise_data = {}



    for i in range(0, len(csv_files), block_size):
        tar_files = file_df['file'].to_list()[i : i+block_size]

        tmp_dfs = {
            idx : pd.read_csv(os.path.join(csv_files_folder, file))
                       for idx,file in enumerate(tar_files)
        }

        const_found = False
        current_constituency = 0

        n_genders = 3 # including 
        for idx, df in tmp_dfs.items():

            if(idx == 0):
                tar_val = ' '.join(df.columns)
                tar_obj = re.match(".+\s*:+?\s*(\d+)\s*- ([A-Za-z() ]+)", tar_val)
                if(tar_obj):
                    const_found = True
                    const_num = int(tar_obj[1])
                    current_constituency = const_num
                    constituency_wise_data[const_num] = {}
                    constituency_wise_data[const_num]['Constituency_Name'] = tar_obj[2]



            else:
                n_df = df.copy()

                # Polling Dates
                if(idx == 6):
                    n_df.columns = n_df.iloc[0]
                    n_df = n_df.drop([0], axis = 'index')
                    n_df.columns = n_df.columns.fillna("k")

                # Drop empty columns, if any
                empty_cols = [(col,empty) for col,empty in n_df.isna().all().items()]
                for (n_col, n_empty) in empty_cols:
                    if(n_col in ['MALE', 'FEMALE', 'TOTAL']):
                        continue
                    
                    if(n_empty):
                        n_df = n_df.drop(n_col, axis = 'columns')

                    elif(idx == 1):
                        if(n_df.iloc[:,0].value_counts() == 1).all:
                            if(len(n_df.columns)>5):
                                n_df = n_df.drop('Unnamed: 0', axis = 'columns')



                if(idx == 7):
                    n_df['VII. RESULT'] = n_df['VII. RESULT'].fillna(0)
                    n_df.columns = n_df.iloc[0]
                    n_df = n_df.drop([0], axis = 'rows')
                    test = n_df.set_index(0).T.to_dict()
                    constituency_wise_data[current_constituency][block_files_labelled[idx]] = test

                elif(idx == 5):
                    n_df = n_df.iloc[0].to_string()
                    num = re.findall(r'NUMBER\s+(\d+)\s+',n_df)[0]
                    count = re.findall(r'PER POLLING STATION\D+\d?\D*(\d{2,}.?\d?)\s*',n_df)[0]
                    test = {'STATION NUMBER':{'VALUE':num},'ELECTORS PER STATION':{'VALUE':count}}
                    constituency_wise_data[current_constituency][block_files_labelled[idx]] = test


                elif(idx == 6):
                    n_df['Dates'] = ['Dates']
                    test = n_df.set_index('Dates').T.to_dict()
                    constituency_wise_data[current_constituency][block_files_labelled[idx]] = test

                else:

                    variable_name = n_df.columns[0]
                    for col in n_df.columns:
                        if(col == variable_name):
                            continue

                        n_df[col] = n_df[col]


                    n_df['TAR_VARIABLE'] = n_df[variable_name].str.extract("\d+\.\s+([A-Z ]+)")
                    n_df = n_df.drop(variable_name, axis = 'columns')


                    n_df = n_df.fillna(0)
                    if(idx == 1):
                        if((n_df['TAR_VARIABLE'] == 0).all()):
                            n_df = n_df.drop(0, axis='rows')
                        
                        print(idx, n_df.columns)
                        n_genders = sum([i for i in n_df.columns if(i in ['MEN', 'WOMEN', 'Others'])])
                    
                    # Columns with gender-wise breakdown of variables
                    if(idx in [1,2,3]):
                        if(n_genders == 3):
                            n_df.columns = ['MEN', 'WOMEN', 'OTHERS', 'TOTAL', 'TAR_VARIABLE']
                            
                        else:
                            print(n_genders)
                            display(n_df)
                            n_df.columns = ['MEN', 'WOMEN', 'TOTAL', 'TAR_VARIABLE']

                    # Columns with only constituency-wise aggregates
                    else:
                        n_df.columns = ['VALUE', 'TAR_VARIABLE']

                    test = n_df.set_index('TAR_VARIABLE').T.to_dict()
                    constituency_wise_data[current_constituency][block_files_labelled[idx]] = test
                    
                    
        
    # Convert to dataframe
    state_df = pd.DataFrame()
    state1_df = pd.DataFrame()
    for const_num in constituency_wise_data.keys():

        const_df = pd.DataFrame({"Constituency_No" : [const_num]})

        for k,v in constituency_wise_data[const_num]['Candidates_Info'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            const_df = pd.concat([const_df, x_df], axis = 1)

        elect_df = pd.DataFrame({"Constituency_No" : [const_num]})
        for k,v in constituency_wise_data[const_num]['Electors_Info'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            elect_df = pd.concat([elect_df, x_df], axis = 1)

        voter_df = pd.DataFrame({"Constituency_No" : [const_num]})
        for k,v in constituency_wise_data[const_num]['Voters_Info'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            voter_df = pd.concat([voter_df, x_df], axis = 1)

        votes_df = pd.DataFrame({"Constituency_No" : [const_num]})
        for k,v in constituency_wise_data[const_num]['Votes_Info'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            votes_df = pd.concat([votes_df, x_df], axis = 1)

        polling_df = pd.DataFrame({"Constituency_No" : [const_num]})
        for k,v in constituency_wise_data[const_num]['Polling_Stations'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            polling_df = pd.concat([polling_df, x_df], axis = 1)

        dates_df = pd.DataFrame({"Constituency_No" : [const_num]})
        for k,v in constituency_wise_data[const_num]['Dates'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            dates_df = pd.concat([dates_df, x_df], axis = 1)

        result_df = pd.DataFrame({"Constituency_No" : [const_num]})
        for k,v in constituency_wise_data[const_num]['Result'].items():
            cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
            x_df = pd.DataFrame(cols, index = [0])
            result_df = pd.concat([result_df, x_df], axis = 1)

        const_df = pd.merge(const_df, elect_df, on = "Constituency_No")
        const_df = pd.merge(const_df, voter_df, on = "Constituency_No")
        const_df = pd.merge(const_df, votes_df, on = "Constituency_No")
        const_df = pd.merge(const_df, polling_df, on = "Constituency_No")
        const_df = pd.merge(const_df, dates_df, on = "Constituency_No")
        const_df = pd.merge(const_df, result_df, on = "Constituency_No")
        #display(state_df)
        state_df = pd.concat([state_df, const_df])
        
    yearwise_dfs[year] = state_df.copy()

1 Index(['Unnamed: 1', 'MALE', 'FEMALE', 'TOTAL', 'TAR_VARIABLE'], dtype='object')
0


Unnamed: 0,Unnamed: 1,MALE,FEMALE,TOTAL,TAR_VARIABLE
1,1. NOMINATED,0.0,0.0,0.0,0
2,2. REJECTED,0.0,0.0,0.0,0
3,3. WITHDRAWN,0.0,0.0,0.0,0
4,4. CONTESTED,0.0,0.0,0.0,0
5,5. FORFEITED DEPOSIT,0.0,0.0,0.0,0


ValueError: Length mismatch: Expected axis has 5 elements, new values have 4 elements

In [131]:
state_df.iloc[:,53:]

Unnamed: 0,TENDERED VOTES_VALUE,STATION NUMBER_VALUE,ELECTORS PER STATION_VALUE,Dates_POLLING,Dates_COUNTING,Dates_DECLARATION OF RESULT,WINNER_PARTY,WINNER_CANDIDATE,WINNER_VOTES,RUNNER-UP_PARTY,RUNNER-UP_CANDIDATE,RUNNER-UP_VOTES,MARGIN_PARTY,MARGIN_CANDIDATE,MARGIN_VOTES
0,1.0,46,638.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Laxmikant Parsekar,11955,INC,Dayanand Raghunath Sopte,8520,3435,( 13.25%of Total Valid Votes),
0,0.0,48,615.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Rajendra Arlekar,16406,INC,Manohar Trimbak Ajgaoankar,8053,8353,( 32.26%of Total Valid Votes),
0,0.0,41,583.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,IND,Naresh Rajaram Sawal,8331,INC,Rajesh T. Patnekar,6532,1799,( 8.57%of Total Valid Votes),
0,0.0,38,621.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Kiran Mohan Kandolkar,10473,NCP,Nilkanth Ramnath Halarnkar,9361,1112,( 5.47%of Total Valid Votes),
0,1.0,42,611.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Francis D'souza,14955,NCP,Ashish Tulshidas Shirodkar,4786,10169,( 50.32%of Total Valid Votes),
0,1.0,44,599.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Dayanand Mandrekar,11430,INC,Uday Dattaram Paliemkar,9259,2171,( 10.03%of Total Valid Votes),
0,1.0,39,610.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Dilip Parulekar,10084,IND,D'souza Tulio,4276,5808,( 29.92%of Total Valid Votes),
0,1.0,33,672.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Michael Vincent Lobo,9891,INC,Agnelo Nicholas Fernandes,8022,1869,( 10.19%of Total Valid Votes),
0,1.0,38,558.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,IND,Rohan Khaunte,7972,BJP,Govind Parvatkar,7071,901,( 5.42%of Total Valid Votes),
0,0.0,41,613.0,03-Mar-2012,6-Mar-2012,6-Mar-2012,BJP,Ticlo Glenn J V A E Souza,11315,INC,Dayanand G. Narvekar,7839,3476,( 17.13%of Total Valid Votes),


In [20]:
# state_df = pd.DataFrame()
# state1_df = pd.DataFrame()
# const_dfs = []
# for const_num in constituency_wise_data.keys():

#     const_df = pd.DataFrame({"Constituency_No" : [const_num]})
    
#     for key,item in block_files_labelled.items():
#         if(key in [0,5]):
#             pass
#         else:
#             values_df = pd.DataFrame({"Constituency_No" : [const_num]})
#             #print(block_files_labelled[key])
#             for k,v in constituency_wise_data[const_num][block_files_labelled[key]].items():
#                 cols = {f"{k}_{subsection}" : value for subsection, value in v.items()}
#                 x_df = pd.DataFrame(cols, index = [0])
#                 const_df = pd.concat([const_df, x_df], axis = 1)
#             const_df = pd.merge(const_df, values_df, on = "Constituency_No")
#             #display(const_df)
#     const_dfs.append(const_df)
#     #state_df = pd.concat([state_df, const_df])
#     #display(const_df)
#     #display(state2_df.drop('index',axis=1))
#     #state2_df = pd.concat([state2_df, const_df], axis=0).reset_index(drop=True)
#     #print('state')
#     #display(state_df)
#     #state_df = const_df