### India LS Data Pooled

In [1]:
import pandas as pd
import os
import json
import logging
import glob

In [4]:
df_list = []
data_dir = "../data/india/ls/"

for filename in os.listdir(data_dir):
    if filename.endswith(".json"):  # ✅ Process only JSON files
        print(f"Processing: {filename}")
        with open(os.path.join(data_dir, filename), "r") as f:
            try:
                data = json.load(f)  # ✅ Load JSON safely
                if 'membersDtoList' in data:  # ✅ Ensure key exists
                    df = pd.DataFrame(data['membersDtoList'])
                    df['ls'] = filename[3:-5]  # ✅ Extracting identifier from filename
                    df_list.append(df)
                else:
                    print(f"⚠️ Warning: Key 'membersDtoList' not found in {filename}")
            except json.JSONDecodeError as e:
                print(f"❌ Error reading {filename}: {e}")

Processing: ls_17.json
Processing: ls_16.json
Processing: ls_15.json
Processing: ls_18.json
Processing: ls_14.json
Processing: ls_13.json
Processing: ls_12.json


In [17]:
df_pols = pd.concat(df_list, ignore_index=True)

In [18]:
df_pols.shape

(3740, 38)

### Convert to long form to get each email in a separate row

In [27]:
df_pols['mpsno'].nunique()

2291

In [19]:
df_pols[['firstName', 'lastName']].drop_duplicates().shape[0]

2388

In [25]:
df_pols[['firstName', 'lastName', 'dob']].drop_duplicates().shape[0]

2399

In [28]:
df_pols = df_pols.explode('email')
df_pols['email'] = df_pols['email'].str.split(r'</br>')
df_pols = df_pols.explode('email')
df_pols

Unnamed: 0,mpsno,initial,firstName,lastName,gender,partyFname,partySname,stateName,constName,profession,...,qualification,freedom,profession2,categoryCode,ls,mpLastFirstName,mpFirstLastName,maritalStatus,createdAt,updatedAt
0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,Advocate ...,...,Post Graduate,N,...,(SC),17,,,,,
0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,Advocate ...,...,Post Graduate,N,...,(SC),17,,,,,
1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,Businessperson ...,...,Graduate,N,Social Worker ...,(SC),17,,,,,
1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,Businessperson ...,...,Graduate,N,Social Worker ...,(SC),17,,,,,
2,2654,Dr.,Farooq,Abdullah,Male,Jammu and Kashmir National Conference,J&KNC,Jammu and Kashmir ...,Srinagar,Social Worker ...,...,Professional Graduate,N,...,,17,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,3953,Shri,Sita Ram,Yadav,Male,Rashtriya Janata Dal,RJD,Bihar ...,Sitamarhi ...,Agriculturist ...,...,Graduate,N,"Advocate, Social Worker ...",,12,,,,,
3736,3955,Shri,Surendra Prasad,Yadav,Male,Rashtriya Janata Dal,RJD,Bihar ...,Jhanjharpur ...,,...,,,...,,12,,,,,
3737,4002,Dr.,Surendra Prasad,Yadav,Male,Rashtriya Janata Dal,RJD,Bihar ...,Jahanabad ...,,...,,,...,,12,,,,,
3738,533,Shri,Kinjarapu,Yerrannaidu,Male,Telugu Desam Party,TDP,Andhra Pradesh ...,Srikakulam ...,Agriculturist ...,...,Graduate,N,Advocate ...,,12,,,,,


In [29]:
df_pols['email'] = df_pols['email'].str.strip()
df_pols['email_fix'] = df_pols['email'].str.replace('\[at\]', '@', regex=True).str.replace('\[dot\]', '.', regex=True)
df_pols['email_fix'] = df_pols['email_fix'].str.extract(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})')
df_pols['email_fix'].nunique()

2710

### Remove rows with Nan emails (assumptions ~ missing at random or don't use emails)

In [30]:
df_pols = df_pols[~df_pols['email_fix'].isna()]
df_pols.shape

(4683, 39)

In [31]:
df_pols.head()

Unnamed: 0,mpsno,initial,firstName,lastName,gender,partyFname,partySname,stateName,constName,profession,...,freedom,profession2,categoryCode,ls,mpLastFirstName,mpFirstLastName,maritalStatus,createdAt,updatedAt,email_fix
0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,Advocate ...,...,N,...,(SC),17,,,,,,raja.andimuthu@gmail.com
0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,Advocate ...,...,N,...,(SC),17,,,,,,a.raja@sansad.nic.in
1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,Businessperson ...,...,N,Social Worker ...,(SC),17,,,,,,anarayanaswamyanekal5@gmail.com
1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,Businessperson ...,...,N,Social Worker ...,(SC),17,,,,,,a.narayanswamy@sansad.nic.in
2,2654,Dr.,Farooq,Abdullah,Male,Jammu and Kashmir National Conference,J&KNC,Jammu and Kashmir ...,Srinagar,Social Worker ...,...,N,...,,17,,,,,,iamfarooq80@hotmail.com
