### How Many Indian Politician Accounts Have Been Hacked?

In [1]:
import pandas as pd
import os
import json
# Every relative path used later in the notebook ('pwned', "../daughters/data/")
# is resolved against this working directory.
os.chdir("/home/jupyter/notebooks/pwned_pols/")

### Concat HIBP Data

Skip the empty files; the emails they belong to are assigned a count of 0 after the join.

In [2]:
# Stack the per-address HIBP result files into one long frame.
# Each "<name>.json" file in pwned/ contributes its parsed payload under the
# `data` column, tagged with <name> (the file name minus ".json") as `email`.
json_files = os.listdir('pwned')
df_list = []

for json_file in json_files:
    if not json_file.endswith('.json'):
        continue
    with open(f'pwned/{json_file}') as f:
        # Only json.load can raise JSONDecodeError, so only it sits in the try.
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Empty (or otherwise unparsable) file: skip it, so this address
            # contributes no rows here.
            continue
    df_list.append(pd.DataFrame({'email': json_file[:-5], 'data': data}))

df_hibp = pd.concat(df_list)

In [3]:
# Count the rows of df_hibp per address (presumably one row per breach record --
# TODO confirm against the HIBP payload shape).
# reset_index(name=...) names the size column directly; the previous
# rename(..., inplace=True) is documented to return None, so chaining on it
# only worked by accident of the implementation.
df_hibp_sum = df_hibp.groupby('email').size().reset_index(name='count')
df_hibp_sum

Unnamed: 0,email,count
0,38ashokroad@gmail.com,5
1,97gambhirgautam@gmail.com,1
2,J123uk@gmail.com,2
3,KULDEEPSINGHBISHNOI@gmail.com,7
4,,8
...,...,...
936,yashvirsansad@gmail.com,1
937,yashwant.singh19@sansad.nic.in,1
938,ys.avinash@gmail.com,2
939,ysinha2005@hotmail.com,5


### Get all pols

In [4]:
# Load every member file (named like "ls_17.json") and stack them into one frame.
df_list = []
pols_dir = "../daughters/data/"

for filename in os.listdir(pols_dir):
    # Same guard as the HIBP loader: ignore anything that is not a JSON file
    # (e.g. checkpoint directories or editor droppings) instead of crashing on it.
    if not filename.endswith('.json'):
        continue
    print(filename)
    with open(os.path.join(pols_dir, filename), "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data['membersDtoList'])
    # Strip the 3-character prefix and the ".json" suffix from the file name.
    df['ls'] = filename[3:-5]
    df_list.append(df)

df_pols = pd.concat(df_list)
# reset_index() keeps each row's position within its source file as an `index`
# column; together with `ls` it is used later as the grouping key.
df_pols = df_pols.reset_index()
df_pols.shape

ls_17.json
ls_16.json
ls_13.json
ls_14.json
ls_15.json
ls_12.json


(3196, 34)

## Convert to long form to get each email in a separate row

In [5]:
# Two-stage unnesting of `email`: first one row per list element, then one row
# per address inside elements that pack several addresses separated by "</br>".
df_pols = (
    df_pols
    .explode('email')
    .assign(email=lambda frame: frame['email'].str.split(r'</br>'))
    .explode('email')
)
df_pols

Unnamed: 0,index,mpsno,initial,firstName,lastName,gender,partyFname,partySname,stateName,constName,...,imageUrl,profileUrl,dob,numberOfSons,numberOfDaughters,qualification,freedom,profession2,categoryCode,ls
0,0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,...,https://sansad.in/getFile/mpimage/photo/344.jp...,,10/05/1963,0.0,1.0,Post Graduate,N,...,(SC),17
0,0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,...,https://sansad.in/getFile/mpimage/photo/344.jp...,,10/05/1963,0.0,1.0,Post Graduate,N,...,(SC),17
1,1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,...,https://sansad.in/getFile/mpimage/photo/5175.j...,,16/05/1957,0.0,3.0,Graduate,N,Social Worker ...,(SC),17
1,1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,...,https://sansad.in/getFile/mpimage/photo/5175.j...,,16/05/1957,0.0,3.0,Graduate,N,Social Worker ...,(SC),17
2,2,2654,Dr.,Farooq,Abdullah,Male,Jammu and Kashmir National Conference,J&KNC,Jammu and Kashmir ...,Srinagar,...,https://sansad.in/getFile/mpimage/photo/2654.j...,,21/10/1937,1.0,3.0,Professional Graduate,N,...,,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3191,524,3953,Shri,Sita Ram,Yadav,Male,Rashtriya Janata Dal,RJD,Bihar ...,Sitamarhi ...,...,https://sansad.in/getFile/mpimage/photo/3953.j...,,05/01/1946,3.0,2.0,Graduate,N,"Advocate, Social Worker ...",,12
3192,525,3955,Shri,Surendra Prasad,Yadav,Male,Rashtriya Janata Dal,RJD,Bihar ...,Jhanjharpur ...,...,https://sansad.in/getFile/writereaddata/biodat...,https://lssapi.nic.in/MemberProfile/biodata_1_...,,,,,,...,,12
3193,526,4002,Dr.,Surendra Prasad,Yadav,Male,Rashtriya Janata Dal,RJD,Bihar ...,Jahanabad ...,...,https://sansad.in/getFile/writereaddata/biodat...,https://lssapi.nic.in/MemberProfile/biodata_1_...,,,,,,...,,12
3194,527,533,Shri,Kinjarapu,Yerrannaidu,Male,Telugu Desam Party,TDP,Andhra Pradesh ...,Srikakulam ...,...,https://sansad.in/getFile/mpimage/photo/533.jp...,,23/02/1957,1.0,1.0,Graduate,N,Advocate ...,,12


In [6]:
# Trim surrounding whitespace left over from the split.
df_pols['email'] = df_pols['email'].str.strip()
# Undo the "[at]" / "[dot]" obfuscation. Raw strings keep the regex escapes
# intact; in a plain string "\[" is an invalid escape sequence that newer
# Python versions warn about.
df_pols['email_fix'] = df_pols['email'].str.replace(r'\[at\]', '@', regex=True).str.replace(r'\[dot\]', '.', regex=True)
# Keep the first well-formed address found in each cell (NaN when there is none).
# expand=False returns a Series, so a Series rather than a one-column DataFrame
# is assigned to the column.
df_pols['email_fix'] = df_pols['email_fix'].str.extract(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', expand=False)

## Remove rows with NaN emails (assumption: emails are missing at random, or the member doesn't use email)

In [7]:
# Keep only the rows for which a well-formed address was extracted.
df_pols = df_pols.dropna(subset=['email_fix'])

### Join to pol data

In [8]:
# Left join: every member/address row is kept; `count` stays NaN where the
# address has no entry in the HIBP summary.
df_joined = pd.merge(
    df_pols,
    df_hibp_sum,
    how='left',
    left_on='email_fix',
    right_on='email',
)
df_joined

Unnamed: 0,index,mpsno,initial,firstName,lastName,gender,partyFname,partySname,stateName,constName,...,numberOfSons,numberOfDaughters,qualification,freedom,profession2,categoryCode,ls,email_fix,email_y,count
0,0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,...,0.0,1.0,Post Graduate,N,...,(SC),17,raja.andimuthu@gmail.com,,
1,0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,...,0.0,1.0,Post Graduate,N,...,(SC),17,a.raja@sansad.nic.in,,
2,1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,...,0.0,3.0,Graduate,N,Social Worker ...,(SC),17,anarayanaswamyanekal5@gmail.com,,
3,1,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,...,0.0,3.0,Graduate,N,Social Worker ...,(SC),17,a.narayanswamy@sansad.nic.in,,
4,2,2654,Dr.,Farooq,Abdullah,Male,Jammu and Kashmir National Conference,J&KNC,Jammu and Kashmir ...,Srinagar,...,1.0,3.0,Professional Graduate,N,...,,17,iamfarooq80@hotmail.com,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,504,511,Shri,Ravi Prakash,Verma,Male,Samajwadi Party,SP,Uttar Pradesh ...,Kheri ...,...,1.0,1.0,Graduate,N,"Agriculturist, Teacher, Educationist ...",,12,rpverma@sansad.nic.in,,
3550,508,3824,Shri,Bhanu Pratap Singh,Verma,Male,Bharatiya Janata Party,BJP,Uttar Pradesh ...,Jalaun,...,5.0,0.0,Post Graduate,N,Farmer ...,(SC),12,bhanus@sansad.nic.in,bhanus@sansad.nic.in,1.0
3551,516,523,Shri,Balram Singh,Yadav,Male,Samajwadi Party,SP,Uttar Pradesh ...,Mainpuri ...,...,2.0,1.0,Graduate,N,"Advocate, Social Worker ...",,12,balramsy@sansad.nic.in,balramsy@sansad.nic.in,3.0
3552,527,533,Shri,Kinjarapu,Yerrannaidu,Male,Telugu Desam Party,TDP,Andhra Pradesh ...,Srikakulam ...,...,1.0,1.0,Graduate,N,Advocate ...,,12,yerran@sansad.nic.in,,


In [9]:
# Addresses that found no match in the HIBP summary (their count is still NaN here).
df_joined.loc[df_joined['count'].isna(), 'email_fix']

0              raja.andimuthu@gmail.com
1                  a.raja@sansad.nic.in
2       anarayanaswamyanekal5@gmail.com
3          a.narayanswamy@sansad.nic.in
4               iamfarooq80@hotmail.com
                     ...               
3545              varmarl@sansad.nic.in
3547                    m.vasava@nic.in
3549              rpverma@sansad.nic.in
3552               yerran@sansad.nic.in
3553               zahedi@sansad.nic.in
Name: email_fix, Length: 1699, dtype: object

In [10]:
# Rows without an HIBP match came out of the left join with a missing count;
# set those to 0.
df_joined.loc[df_joined['count'].isna(), 'count'] = 0

In [11]:
# Mean count per row of df_joined (one row per member address), with
# unmatched addresses counted as 0.
df_joined['count'].mean()

1.1738885762521103

In [12]:
# Collapse the per-address rows back to one row per member: `index` is the
# member's row position within its source file and `ls` identifies that file.
# numeric_only=True makes the column selection explicit: pandas >= 2.0 no
# longer drops non-numeric columns silently and would try to sum the strings.
# NOTE: every numeric column is summed over a member's address rows, so only
# `count` is meaningful here; the other columns (age, mpsno, ...) are inflated
# for members with more than one address.
gdf = df_joined.groupby(['ls', 'index']).sum(numeric_only=True).reset_index()
gdf['count'].describe()

count    2315.00000
mean        1.80216
std         2.71296
min         0.00000
25%         0.00000
50%         1.00000
75%         2.00000
max        23.00000
Name: count, dtype: float64

In [13]:
# Show the per-(ls, index) sums computed in the previous cell.
gdf

Unnamed: 0,ls,index,mpsno,lastLoksabha,age,noOfTerms,numberOfSons,numberOfDaughters,count
0,12,0,2,14,53.0,3.0,2.0,0.0,1.0
1,12,1,4,15,81.0,9.0,1.0,2.0,5.0
2,12,2,5,14,74.0,3.0,0.0,2.0,0.0
3,12,3,18,32,192.0,14.0,2.0,2.0,2.0
4,12,6,151,13,65.0,2.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2310,17,534,10126,34,94.0,2.0,2.0,2.0,2.0
2311,17,535,10294,34,138.0,2.0,2.0,2.0,2.0
2312,17,536,10434,34,106.0,2.0,6.0,0.0,0.0
2313,17,537,5539,17,0.0,1.0,0.0,0.0,2.0
