# Preamble

## Modules

In [None]:
# .json files
import json

# Make data frames
import pandas as pd

# Make plots
import matplotlib.pyplot as plt

# PCA 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# PCA, MCA, FAMD
import prince

# Suppress warnings from prince
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Timer
import time # for debugging

# Print dictionaries prettily
import pprint

# For calculations
import numpy as np

# For plots
import matplotlib.pyplot as plt

# Split data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Logistic regression
from sklearn.linear_model import LogisticRegression

# Measure accuracy, precision, recall
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# roc curve and auc
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

## Functions

In [None]:
# Report how long something took, in the largest sensible unit
def time_elapsed(start, end):
    """Print the time elapsed between ``start`` and ``end``.

    Both arguments are timestamps in seconds (e.g. from ``time.time()``).
    The difference is printed in seconds when it is under a minute, in
    minutes when it is under an hour, and in hours otherwise. Nothing is
    returned.
    """
    seconds = end - start

    # Under a minute: report raw seconds
    if seconds < 60:
        print("Time elapsed =", seconds, "seconds")
        return

    # Under an hour: report minutes
    minutes = seconds / 60
    if minutes < 60:
        print("Time elapsed =", minutes, "minutes")
        return

    # Anything longer is reported in hours
    hours = minutes / 60
    print("Time elapsed =", hours, "hours")

### Data processing

In [None]:
# Turn one cell of a sponsors/voters column into a list of names
def _names_from_entry(entry):
    """Return the list of names encoded by a single cell value.

    * a float (in practice a missing value / NaN) -> ``["none"]``
    * a string                                    -> ``["unanimous"]``
    * a list of person dictionaries               -> one item per person:
      ``"<vote>.<name>"`` when the dictionary has a ``"vote"`` key,
      otherwise just ``"<name>"``
    * anything else                               -> ``[]``
    """
    # isinstance (rather than an exact type comparison) so that subclasses
    # such as numpy.float64 NaNs are also recognised as "missing".
    if isinstance(entry, float):
        return ["none"]
    if isinstance(entry, str):
        return ["unanimous"]
    if isinstance(entry, list):
        return [
            person["vote"] + "." + person["name"] if "vote" in person else person["name"]
            for person in entry
        ]
    return []


# Replace a column of politician dictionaries in bills_df_all with lists of names
def col_of_lists(people):
    """Convert column ``people`` of ``bills_df_all`` into a column of name lists.

    ``people`` is the header of a column whose cells hold lists of
    dictionaries describing politicians (e.g. "sponsor(s).people").
    Each cell is replaced by a list of strings as described in
    ``_names_from_entry``.

    ``bills_df_all`` is a module-level DataFrame that is expected to exist
    before this function is called (built in the DataProcessing notebook,
    per the original comment). The column is overwritten in place and
    nothing is returned.
    """
    # One pass over the column instead of repeated positional lookups per row
    new_col = [_names_from_entry(entry) for entry in bills_df_all[people]]
    bills_df_all[people] = new_col

In [None]:
# One-hot encode a column of lists in bills_df_all
def one_hot_col(column):
    """Return ``bills_df_all`` with one-hot columns appended for ``column``.

    ``column`` is the header of a column of ``bills_df_all`` whose cells are
    lists of strings. For every distinct string ``entry`` found in that
    column, a new 0/1 column named ``column + "." + entry`` is created, with
    1 in the rows whose list contains ``entry``.

    ``bills_df_all`` is a module-level DataFrame that is expected to exist
    before this function is called; it is not modified. The result is a new
    DataFrame: the original columns followed by the indicator columns.
    """
    # Build a set per row once so each membership test is O(1), instead of
    # doing a positional DataFrame lookup for every (entry, row) pair.
    row_sets = [set(row) for row in bills_df_all[column]]

    # Sorted so the order of the new columns is the same on every run
    # (plain set iteration order of strings changes between interpreter runs).
    all_entries = sorted(set().union(*row_sets))

    cols_add_dict = {
        column + "." + entry: [int(entry in row) for row in row_sets]
        for entry in all_entries
    }

    # Reuse the original index: pd.concat(axis=1) aligns on index labels, so
    # a default 0..n-1 index would misalign (and pad with NaN rows) whenever
    # bills_df_all has been filtered or otherwise carries a different index.
    indicators = pd.DataFrame(cols_add_dict, index=bills_df_all.index)
    return pd.concat([bills_df_all, indicators], axis=1)

### Data exploration

In [None]:
# Rank the features for each of the first n PCA components
def cor_feat_pca(PCA_cols, number_of_components, components_data_frame):
    """List, per component, the features ordered by their component value.

    Parameters
    ----------
    PCA_cols : DataFrame
        Only its number of columns is used: that many rows are reported
        for each component.
    number_of_components : int
        How many leading columns of ``components_data_frame`` to process
        (keep this small; the output grows with it).
    components_data_frame : DataFrame
        One column per component; the row labels are reported as the
        feature names (presumably the features of ``PCA_cols`` - confirm
        against the caller).

    Returns
    -------
    dict
        Keys "Component1", "Component2", ...; each value is a list of
        ``[row_label, value rounded to 3 decimals]`` pairs sorted by that
        component's value in descending order.
    """
    n_features = len(PCA_cols.columns)
    correlations_dict = {}
    for i in range(number_of_components):
        ordered = components_data_frame.sort_values(
            by=components_data_frame.columns[i], ascending=False)
        # Extract labels and the component column once per component rather
        # than rebuilding the full label list for every feature.
        labels = ordered.index.tolist()
        values = ordered.iloc[:, i]
        correlations_dict["Component" + str(i + 1)] = [
            [labels[j], round(values.iloc[j], 3)] for j in range(n_features)
        ]
    return correlations_dict