# Data exploration

In [1]:
# Preamble
%run Preamble.ipynb

# Load data
bills_df = pd.read_csv("processed_data.csv")
#bills_df

  bills_df = pd.read_csv("processed_data.csv")


Will a bill become a law?  In this Congress it happens especially rarely.  How much rarer than when it passes in the House or the Senate?

In [2]:
# Some probabilities

# bills
print("Number of observations:")
num_of_bills = len(bills_df)
print(num_of_bills)

# Laws
print("\nNumber of bills that became law:")
num_of_laws = bills_df["became_law.signed"].value_counts()[1]
print(num_of_laws)
print("Probability that a bill in the 118th Congress becomes a law:")
prob_law = num_of_laws/num_of_bills
print(prob_law)

# House
print("\nNumber of bills that passed in the House:")
num_of_house_pass = bills_df["pass_house.passed"].value_counts()[1]
print(num_of_house_pass)
print("Probability that a bill passes in the House:")
prob_house = num_of_house_pass/num_of_bills
print(prob_house)
print("Probability that a bill that passes in the House becomes a law:")
prob_law_given_house = num_of_laws/num_of_house_pass
print(prob_law_given_house)

# Senate
print("\nNumber of bills that passed in the Senate:")
num_of_senate_pass = bills_df["pass_senate.passed"].value_counts()[1]
print(num_of_senate_pass)
print("Probability that a bill passes in the Senate:")
prob_senate = num_of_senate_pass/num_of_bills
print(prob_senate)
print("Probability that a bill that passes in the Senate becomes a law:")
prob_law_given_senate = num_of_laws/num_of_senate_pass
print(prob_law_given_senate)

# both
print("\nNumber of bills that passed in both chambers:")
both_list = []
for i in range(len(bills_df)):
    if bills_df.iloc[i,bills_df.columns.get_loc("pass_house.passed")] == 1 and \
    bills_df.iloc[i,bills_df.columns.get_loc("pass_senate.passed")] == 1:
        both_list.append(1)
    else:
        both_list.append(0)
num_of_both_pass = both_list.count(1)
print(num_of_both_pass)
print("Probability that a bill that passed in both chambers will become law:")
prob_law_given_both = num_of_laws/num_of_both_pass
print(prob_law_given_both)

Number of observations:
15366

Number of bills that became law:
64
Probability that a bill in the 118th Congress becomes a law:
0.004165039698034622

Number of bills that passed in the House:
539
Probability that a bill passes in the House:
0.03507744370688533
Probability that a bill that passes in the House becomes a law:
0.11873840445269017

Number of bills that passed in the Senate:
189
Probability that a bill passes in the Senate:
0.012299882858258494
Probability that a bill that passes in the Senate becomes a law:
0.3386243386243386

Number of bills that passed in both chambers:
81
Probability that a bill that passed in both chambers will become law:
0.7901234567901234


If we are just looking at whether any bill becomes law, it is a classification problem with *very* imbalanced classes.  We can reduce this problem by restricting to bills that pass in the House, Senate, or both -- but at the expense of a lot less data.

In [3]:
# Reduce the data sets

bills_df_house = bills_df.loc[bills_df["pass_house.passed"] == 1]
print("House:")
print(len(bills_df_house))

bills_df_senate = bills_df.loc[bills_df["pass_senate.passed"] == 1]
print("Senate:")
print(len(bills_df_senate))

bills_df_both = bills_df[(bills_df["pass_house.passed"] == 1) & (bills_df["pass_senate.passed"] == 1)]
print("Both:")
print(len(bills_df_both))

House:
539
Senate:
189
Both:
81


## Feature selection

In [4]:
# How many features?

print("Initial number of features:")
print(len(bills_df.axes[1]))

Initial number of features:
2382


2376 is too many!  One immediate technique to reduce dimensions is PCA.  It's not recommended for data that's mostly binary, but we can try it.

### PCA

Let's just see if any features are even correlated with a bill becoming a law.

In [5]:
# Make the PCA objects
scaler = StandardScaler()
pca = PCA()

# Scale, fit
scaled_law = scaler.fit_transform(bills_df)
pca.fit(scaled_law)
evr_law = pca.explained_variance_ratio_
comp_vects_law = pd.DataFrame(pca.components_.transpose(), index = bills_df.columns)
#print("With became_law.signed:")
#comp_vects_law.sort_values(by = comp_vects_law.columns[0], ascending = False)

ValueError: could not convert string to float: 'Voice vote'

In [None]:
# First n components with contributions
cor_fea_all = cor_feat_pca(bills_df, 5, comp_vects_law)
pprint.pprint(cor_fea_all, sort_dicts = False)

Seems to show that parties tend to vote and sponsor along party lines, and whether a bill passes is somewhat correlated to the Republicans who vote "yea", which makes sense, since Republicans have the majority in the House.  No features seem obviously correlated to whether a bill becomes law, though.

We can be conservative about cutting out some features and still cut out a bit.  The top contribution to the first component is 0.047, and pass_house.votes.R has that value, so we can just delete all the "yea" Republicans with that score.  We'll do similar reductions with the other components

In [None]:
to_remove = []
for feature in cor_fea_all["Component1"]:
    if feature[1] == 0.047 and feature[0] != "pass_house.votes.Yea.R":
        to_remove.append(feature[0])
for feature in cor_fea_all["Component2"]:
    if feature[1] >= 0.054 and feature[0] != "pass_house.votes.Nay.D":
        to_remove.append(feature[0])  
for feature in cor_fea_all["Component3"]:
    if feature[1] >= 0.078 and feature[0] != "pass_house.votes.Nay.R":
        to_remove.append(feature[0])        
bills_df = bills_df.drop(columns = to_remove)        

Now lets trim all the data sets:

In [None]:
bills_df_house = bills_df.loc[bills_df["pass_house.passed"] == 1]
bills_df_senate = bills_df.loc[bills_df["pass_senate.passed"] == 1]
bills_df_both = bills_df[(bills_df["pass_house.passed"] == 1) & (bills_df["pass_senate.passed"] == 1)]

print("Number of remaining features:")
print(len(bills_df.axes[1]))

In [None]:
# PCA
#
# Drop the dependent variable and the redundant variables from the data frame
#bills_df_PCA = bills_df.drop(columns=["became_law.signed"])
#bills_df_house_PCA = bills_df_house.drop(columns=["became_law.signed", "pass_house.passed"])
#bills_df_senate_PCA = bills_df_senate.drop(columns=["became_law.signed", "pass_senate.passed"])
#bills_df_both_PCA = bills_df_both.drop(columns=["became_law.signed", 
#    "pass_house.passed", "pass_senate.passed"])

# Scale
#scaled = scaler.fit_transform(bills_df_PCA)
#scaled_house = scaler.fit_transform(bills_df_house_PCA)
#scaled_senate = scaler.fit_transform(bills_df_senate_PCA)
#scaled_both = scaler.fit_transform(bills_df_both_PCA)
#
# Fit, components, explained variance ratios
#
# all
#pca.fit(scaled)
#evr = pca.explained_variance_ratio_
#comp_vects = pd.DataFrame(pca.components_.transpose(), index = bills_df_PCA.columns)
#
# House
#pca.fit(scaled_house)
#evr_house = pca.explained_variance_ratio_
#comp_vects_house = pd.DataFrame(pca.components_.transpose(), index = bills_df_house_PCA.columns)
#
# Senate
#pca.fit(scaled_senate)
#evr_senate = pca.explained_variance_ratio_
#comp_vects_senate = pd.DataFrame(pca.components_.transpose(), index = bills_df_senate_PCA.columns)
#
# both
#pca.fit(scaled_both)
#evr_both = pca.explained_variance_ratio_
#comp_vects_both = pd.DataFrame(pca.components_.transpose(), index = bills_df_both_PCA.columns)

In [None]:
#print("All bills:")
#comp_vects.sort_values(by = comp_vects.columns[0], ascending = False)

In [None]:
#print("House:")
#comp_vects_house.sort_values(by = comp_vects_house.columns[0], ascending = False)

In [None]:
#print("Senate:")
#comp_vects_senate.sort_values(by = comp_vects_senate.columns[0], ascending = False)

In [None]:
#print("Both:")
#comp_vects_both.sort_values(by = comp_vects_both.columns[0], ascending = False)

In [None]:
# Add an additional component's explained variance ratio 
#for i in range(1,len(bills_df_PCA.columns.values.tolist())+1,1):
#    if round(sum(evr[0:i]),3)*100 <= 80 and round(sum(evr[0:i]),3)*100 > 75:
#        print("First "+str(i)+" component(s) explain "+str(round(sum(evr[0:i]),3)*100)+"% of the variance in the features.\n")

### Prince package and FAMD

The categorical data analog to PCA is MCA, or multiple correspondence analysis.  However, our data frame contains both categorical and numerical data.  The `prince` package can handle different combinations of data, summarized in the flow chart on [this page](https://github.com/MaxHalford/prince/blob/master/README.md).  In our case, we use FAMD (factor analysis of mixed data).

In [None]:
# Drop the dependent variable and the redundant variables from the data frames
bills_df_dr = bills_df.drop(columns=["became_law.signed"])
bills_df_house_dr = bills_df_house.drop(columns=["became_law.signed", "pass_house.passed"])
bills_df_senate_dr = bills_df_senate.drop(columns=["became_law.signed", "pass_senate.passed"])
bills_df_both_dr = bills_df_both.drop(columns=["became_law.signed", 
    "pass_house.passed", "pass_senate.passed"])

In [None]:
# Create the objects (uses defaluts from https://maxhalford.github.io/prince/famd/, except for 
# n_components)
# Have to do it for each data frame to avoid an error
famd_all = prince.FAMD(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="error"  # same parameter as sklearn.preprocessing.OneHotEncoder
)
famd_house = prince.FAMD(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="error"  # same parameter as sklearn.preprocessing.OneHotEncoder
)
famd_senate = prince.FAMD(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="error"  # same parameter as sklearn.preprocessing.OneHotEncoder
)
famd_both = prince.FAMD(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="error"  # same parameter as sklearn.preprocessing.OneHotEncoder
)

# Fit
print("(fit all)") # for debugging
s_fa = time.time() # for debugging
famd_all = famd_all.fit(bills_df_dr)
e_fa = time.time() # for debugging
time_elapsed(s_fa, e_fa) # for debugging

print("(fit House)") # for debugging
s_fh = time.time() # for debugging
famd_house = famd_house.fit(bills_df_house_dr)
e_fh = time.time() # for debugging
time_elapsed(s_fh, e_fh) # for debugging

print("(fit Senate)") # for debugging
s_fs = time.time() # for debugging
famd_senate = famd_senate.fit(bills_df_senate_dr)
e_fs = time.time() # for debugging
time_elapsed(s_fs, e_fs) # for debugging

print("(fit both)") # for debugging
s_fb = time.time() # for debugging
famd_both = famd_both.fit(bills_df_both_dr)
e_fb = time.time() # for debugging
time_elapsed(s_fb, e_fb) # for debugging

In [None]:
# Eigenvalues

print("Components (all):")
print(famd_all.eigenvalues_summary)
print("Components (House):")
print(famd_house.eigenvalues_summary)
print("Components (Senate):")
print(famd_senate.eigenvalues_summary)
print("Components (both):")
print(famd_both.eigenvalues_summary)

In [None]:
# Contributions

print("All:")
(
    famd_all.column_contributions_
    .sort_values(0, ascending=False)
    #.head(10)
    .style.format('{:.8%}')
)

In [None]:
print("House:")
(
    famd_house.column_contributions_
    .sort_values(0, ascending=False)
    #.head(10)
    .style.format('{:.8%}')
)

In [None]:
print("Senate:")
(
    famd_senate.column_contributions_
    .sort_values(0, ascending=False)
    #.head(10)
    .style.format('{:.8%}')
)

In [None]:
print("Both:")
(
    famd_both.column_contributions_
    .sort_values(0, ascending=False)
    #.head(10)
    .style.format('{:.8%}')
)

### Further heuristic dimension reduction

In [None]:
# Don't do this until severe dimension reduction
# Or look for correlation between pairs of features
#
#start_cor = time.time() # for debugging
#cor_mtx = bills_df_PCA.corr()
#min_corr_coef = 0.95 #input("How correlated do you want the variables (should be between -1 and 1)?")
#corr_pairs = []
#for i in range(len(bills_df_PCA.columns.values.tolist())):
#    for j in range(i,len(bills_df_PCA.columns.values.tolist()),1):
#        if (cor_mtx.iloc[i,j] >= min_corr_coef or cor_mtx.iloc[i,j] <= -min_corr_coef) \
#            and cor_mtx.iloc[i,j] != 1 and cor_mtx.iloc[i,j] != -1: 
#            corr_pairs.append([cor_mtx.index.tolist()[i], cor_mtx.columns.tolist()[j], round(cor_mtx.iloc[i,j],3)])
#print("Correlated pairs of features are:")
#sorted(corr_pairs, key = lambda x: abs(float(x[2])), reverse=True)
#end_cor = time.time() # for debugging
#time_elapsed(start_cor, end_corr) # for debugging

In [None]:
# Trim/merge some features, based on correlations

# Combine Senate Democrat and Independent yeas and nays
#bills_df["pass_senate.votes.Yea.DI"] = bills_df["pass_senate.votes.Yea.D"]
#+bills_df["pass_senate.votes.Yea.I"]
#bills_df["pass_senate.votes.Nay.DI"] = bills_df["pass_senate.votes.Nay.D"]
#+bills_df["pass_senate.votes.Nay.I"]

# Remove unwanted features
#remove_features = ["chamber.S", "pass_house.passed", "pass_senate.passed", "pass_senate.votes.Yea.D", 
#    "pass_senate.votes.Yea.I", "pass_senate.votes.Nay.D", "pass_senate.votes.Nay.I"]
#bills_df = bills_df.drop(columns=remove_features)
#bills_df.columns.values.tolist()

## Export to a file

In [None]:
# Export to a file for model tests
bills_df.to_csv("FinalDataSets//final_data.csv", index=False)
bills_df_house.to_csv("FinalDataSets//final_data_house.csv", index=False)
bills_df_senate.to_csv("FinalDataSets//final_data_senate.csv", index=False)
bills_df_both.to_csv("FinalDataSets//final_data_both.csv", index=False)