In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
df = pd.read_csv('data/cleaned_all_transactions.csv').iloc[:, 1:]
df.head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd,state,amount_cleaned
0,2021,2021-10-04,2021-09-27,joint,BP,BP plc,purchase,"$1,001 - $15,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,8000.5
1,2021,2021-10-04,2021-09-13,joint,XOM,Exxon Mobil Corporation,purchase,"$1,001 - $15,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,8000.5
2,2021,2021-10-04,2021-09-10,joint,ILPT,Industrial Logistics Properties Trust - Common...,purchase,"$15,001 - $50,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,32500.5
3,2021,2021-10-04,2021-09-28,joint,PM,Phillip Morris International Inc,purchase,"$15,001 - $50,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,32500.5
4,2021,2021-10-04,2021-09-17,self,BLK,BlackRock Inc,sale_partial,"$1,001 - $15,000",Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False,CA,8000.5


### **All the nulls:**

In [3]:
df[df['asset_description'].isnull()].shape[0]

4

In [4]:
df[df['owner'].isnull()].shape[0]

5333

In [5]:
df[df['ticker'].isnull()].shape[0]

1141

In [6]:
df[df['transaction_date'].isnull()].shape[0]

5

## Let's begin permutation testing!

In [103]:
df_copy = df.copy()
df_copy['owner_missing'] = df_copy['owner'].isna()
df_copy.head()

Unnamed: 0,disclosure_year,disclosure_date,transaction_date,owner,ticker,asset_description,type,amount,representative,district,ptr_link,cap_gains_over_200_usd,state,amount_cleaned,owner_missing
0,2021,2021-10-04,2021-09-27,joint,BP,BP plc,purchase,"$1,001 - $15,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,8000.5,False
1,2021,2021-10-04,2021-09-13,joint,XOM,Exxon Mobil Corporation,purchase,"$1,001 - $15,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,8000.5,False
2,2021,2021-10-04,2021-09-10,joint,ILPT,Industrial Logistics Properties Trust - Common...,purchase,"$15,001 - $50,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,32500.5,False
3,2021,2021-10-04,2021-09-28,joint,PM,Phillip Morris International Inc,purchase,"$15,001 - $50,000",Virginia Foxx,NC05,https://disclosures-clerk.house.gov/public_dis...,False,NC,32500.5,False
4,2021,2021-10-04,2021-09-17,self,BLK,BlackRock Inc,sale_partial,"$1,001 - $15,000",Alan S. Lowenthal,CA47,https://disclosures-clerk.house.gov/public_dis...,False,CA,8000.5,False


We will use ks_2samp on columns that are numerical

In [104]:
cols_to_choose = 'disclosure_year amount_cleaned'.split(" ")
cols_to_choose

['disclosure_year', 'amount_cleaned']

In [105]:
new_dict = {}
for col in cols_to_choose:
    # when 'owner' is missing 
    col_owner_mis = df_copy.loc[df_copy['owner_missing'], col]

    # when 'owner' is not missing
    col_owner_not_mis = df_copy.loc[~df_copy['owner_missing'], col]

    val = stats.ks_2samp(col_owner_mis, col_owner_not_mis)

    new_dict[col] = val
new_dict

{'disclosure_year': KstestResult(statistic=0.10445941940037393, pvalue=4.3112539767251837e-32),
 'amount_cleaned': KstestResult(statistic=0.02360406573781591, pvalue=0.0479210776360135)}

The p-val for `disclosure_year` and `amount_cleaned` is extremely low. This means that the distribution of `disclosure_year`, for instance, when `owner` is missing and the distribution of `disclosure_year` when `owner` is not missing are likely different, which means that the missingness of `owner` likely depends on `disclosure_year`. Same goes for `amount_cleaned`.

Let's find one more column where the depended-on column for missingness of `owner` is categorical. The test stat we need to use if the depended-on column is categorical is TVD.

In [108]:
shuffled = df.copy()
shuffled['owner_missing'] = shuffled['owner'].isna()
tvds = []
for _ in range(500):
    shuffled['district'] = np.random.permutation(shuffled['district'])    
    pivoted = (
        shuffled
        .pivot_table(index='owner_missing', columns='district', aggfunc='size')
        .apply(lambda x: x / x.sum(), axis=1)
    )
    tvd = pivoted.diff().iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)

In [109]:
df_copy = df.copy()
dist = (
    df_copy
    .assign(owner_missing=df_copy['owner'].isna())
    .pivot_table(index='district', columns='owner_missing', aggfunc='size')
)
dist = dist / dist.sum()
dist

owner_missing,False,True
district,Unnamed: 1_level_1,Unnamed: 2_level_1
AL02,,0.001125
AL05,0.004397,0.000938
AR02,0.000789,0.000375
AZ01,0.000226,
AZ03,0.000451,
...,...,...
WA04,,0.000188
WA08,0.004285,
WI08,0.004172,0.000188
WV01,0.008344,0.013876


In [110]:
obs_tvd = dist.diff(axis=1).iloc[:, -1].abs().sum() / 2
obs_tvd

0.5957373230798173

In [111]:
pval = np.mean(tvds >= obs_tvd)
pval

0.0

Here, we see that the p-val is 0.0, implying that the distribution of `district` when `owner` is missing and the distribution of `district` when `owner` is not missing are likely different, so the missingness of `owner` likely depends on `district`.