# Advanced XGBoost Model with NLP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import tensorflow as tf
import tensorflow_hub as hub
import xgboost as xgb

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score

from imblearn.over_sampling import SMOTE

%matplotlib inline

Using TensorFlow backend.


In [29]:
def confusion_reporting(true_values, pred_values):
    '''
    Print classification diagnostics for a set of predictions and plot the
    row-normalised confusion matrix as a heatmap.

    Printed, in order: the raw confusion matrix, the classification report,
    the accuracy score and the F1 score (both rounded to 4 decimals).

    Parameters
    ----------
    true_values : array-like
        Ground-truth labels.
    pred_values : array-like
        Predicted labels, aligned element-wise with ``true_values``.

    Returns
    -------
    None

    Notes
    -----
    The heatmap axes are labelled 'F'/'T', so the function expects exactly two
    classes; any other number of classes makes the DataFrame construction fail.
    '''
    # Compute the matrix once and reuse it for both the printout and the plot.
    cm = confusion_matrix(true_values, pred_values)

    print(cm)
    print(classification_report(true_values, pred_values))
    print('Accuracy score:', round(accuracy_score(true_values, pred_values), 4))
    print('F1 score:', round(f1_score(true_values, pred_values), 4))

    # Normalise each row by the number of true samples of that class.
    # A class that never occurs in true_values has a row total of 0; leave that
    # row at 0 instead of dividing by zero (which would give NaN and a warning).
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalised = np.divide(cm.astype('float'), row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0)

    df_cm = pd.DataFrame(cm_normalised,
                         index=['F', 'T'],
                         columns=['F', 'T'])
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, cmap='Greens', vmin=0, vmax=1)
    plt.xlabel('Pred Val')
    plt.ylabel('True Val')
    plt.show()

In [30]:
# Load the pre-built modelling frame.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
main_df = pd.read_pickle('main_df.pkl')

display(main_df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() also renders a stray "None".
main_df.info()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,state_abbrev,bioname,bioguide_id,born,nominate_dim1,...,cosponsor_my_party,cosponsors^2,cosponsors_D^2,cosponsors_R^2,cosponsors_ID^2,cosponsor_my_party^2,party_D,sponsor_party_D,party_R,sponsor_party_R
5,113,Senate,1,14009,1,MS,"COCHRAN, William Thad",C000567,1937,0.287,...,1,4.0,1,1,0,1,0,1,1,0
12,113,Senate,1,14203,1,MT,"BAUCUS, Max Sieben",B000243,1941,-0.212,...,1,4.0,1,1,0,1,1,1,0,0
18,113,Senate,1,14226,1,IA,"GRASSLEY, Charles Ernest",G000386,1933,0.346,...,1,4.0,1,1,0,1,0,1,1,0
24,113,Senate,1,14230,1,IA,"HARKIN, Thomas Richard (Tom)",H000206,1939,-0.351,...,1,4.0,1,1,0,1,1,1,0,0
30,113,Senate,1,14307,1,VT,"LEAHY, Patrick Joseph",L000174,1940,-0.361,...,1,4.0,1,1,0,1,1,1,0,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 102692 entries, 5 to 306260
Data columns (total 78 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   congress                       102692 non-null  int64  
 1   chamber                        102692 non-null  object 
 2   rollnumber                     102692 non-null  int64  
 3   icpsr                          102692 non-null  int64  
 4   cast_code                      102692 non-null  int64  
 5   state_abbrev                   102692 non-null  object 
 6   bioname                        102692 non-null  object 
 7   bioguide_id                    102692 non-null  object 
 8   born                           102692 non-null  int64  
 9   nominate_dim1                  102692 non-null  float64
 10  nominate_dim2                  102692 non-null  float64
 11  party                          102692 non-null  object 
 12  last_name                     

None

#### Drop all roll numbers except the last for each bill

In [31]:
# Highest roll call number recorded for each (congress, bill_number) pair.
last_roll_per_bill = (
    main_df.groupby(['congress', 'bill_number'])['rollnumber']
    .max()
    .reset_index()
    .rename(columns={'rollnumber': 'last_rollcall'})
)

# Drop any 'last_rollcall' column left by a previous run of this cell. Without
# this, re-running would make merge() emit suffixed duplicates
# ('last_rollcall_x' / 'last_rollcall_y') and the filter below would raise KeyError.
main_df_last_roll = (
    main_df.drop(columns='last_rollcall', errors='ignore')
    .merge(last_roll_per_bill, how='left', on=['congress', 'bill_number'])
)

# Keep only the votes cast on each bill's last roll call, then renumber rows from 0.
is_last_roll = main_df_last_roll['rollnumber'] == main_df_last_roll['last_rollcall']
main_df = main_df_last_roll[is_last_roll].reset_index(drop=True)

display(main_df.head())
display(main_df.tail())
# info() prints its report and returns None; display() around it would show "None".
main_df.info()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,state_abbrev,bioname,bioguide_id,born,nominate_dim1,...,cosponsors^2,cosponsors_D^2,cosponsors_R^2,cosponsors_ID^2,cosponsor_my_party^2,party_D,sponsor_party_D,party_R,sponsor_party_R,last_rollcall
0,113,Senate,1,14009,1,MS,"COCHRAN, William Thad",C000567,1937,0.287,...,4.0,1,1,0,1,0,1,1,0,1
1,113,Senate,1,14203,1,MT,"BAUCUS, Max Sieben",B000243,1941,-0.212,...,4.0,1,1,0,1,1,1,0,0,1
2,113,Senate,1,14226,1,IA,"GRASSLEY, Charles Ernest",G000386,1933,0.346,...,4.0,1,1,0,1,0,1,1,0,1
3,113,Senate,1,14230,1,IA,"HARKIN, Thomas Richard (Tom)",H000206,1939,-0.351,...,4.0,1,1,0,1,1,1,0,0,1
4,113,Senate,1,14307,1,VT,"LEAHY, Patrick Joseph",L000174,1940,-0.361,...,4.0,1,1,0,1,1,1,0,0,1


Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,state_abbrev,bioname,bioguide_id,born,nominate_dim1,...,cosponsors^2,cosponsors_D^2,cosponsors_R^2,cosponsors_ID^2,cosponsor_my_party^2,party_D,sponsor_party_D,party_R,sponsor_party_R,last_rollcall
23221,116,Senate,508,49300,1,CA,"FEINSTEIN, Dianne",F000062,1933,-0.268,...,136161.0,40401,28224,0,40401,1,1,0,0,508
23222,116,Senate,508,49308,1,WA,"MURRAY, Patty",M001111,1950,-0.35,...,136161.0,40401,28224,0,40401,1,1,0,0,508
23223,116,Senate,508,49703,1,ME,"COLLINS, Susan Margaret",C001035,1952,0.112,...,136161.0,40401,28224,0,28224,0,1,1,0,508
23224,116,Senate,508,49706,1,WY,"ENZI, Michael B.",E000285,1944,0.544,...,136161.0,40401,28224,0,28224,0,1,1,0,508
23225,116,Senate,508,94659,1,AL,"SHELBY, Richard C.",S000320,1934,0.428,...,136161.0,40401,28224,0,28224,0,1,1,0,508


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23226 entries, 0 to 23225
Data columns (total 79 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   congress                       23226 non-null  int64  
 1   chamber                        23226 non-null  object 
 2   rollnumber                     23226 non-null  int64  
 3   icpsr                          23226 non-null  int64  
 4   cast_code                      23226 non-null  int64  
 5   state_abbrev                   23226 non-null  object 
 6   bioname                        23226 non-null  object 
 7   bioguide_id                    23226 non-null  object 
 8   born                           23226 non-null  int64  
 9   nominate_dim1                  23226 non-null  float64
 10  nominate_dim2                  23226 non-null  float64
 11  party                          23226 non-null  object 
 12  last_name                      23226 non-null 

None

### Define cols to use in model

In [32]:
# Target column for the classifier.
dep_col = 'cast_code'

# Features listed under the senator group.
senator_info = [
    'nominate_dim1',
    'nominate_dim2',
    'percent_campaign_vote',
    'election_year',
    'tenure',
    'age',
    'is_sponsor',
    'sponsor_is_same_party',
    'party_D',
    'party_R',
    'cosponsor_my_party',
    'cosponsor_my_party^2',
]

# Features listed under the bill group.
bill_info = [
    'sponsor_party_is_lead',
    'cosponsor_party_D_%',
    'cosponsor_party_R_%',
    'percent_cosponsors_lead_party',
    'lead_party_D',
    'sponsor_party_D',
    'sponsor_party_R',
    'cosponsors',
    'cosponsors_D',
    'cosponsors_R',
    'cosponsors_ID',
    'cosponsors^2',
    'cosponsors_D^2',
    'cosponsors_R^2',
    'cosponsors_ID^2',
]

# Free-text columns; these are kept out of the numeric scaling step.
text_cols = ['summary']

# Full list of independent columns, in group order: senator, bill, text.
indep_cols = [*senator_info, *bill_info, *text_cols]

In [33]:
# Target vector and candidate feature matrix.
y = main_df[dep_col]
X = main_df[indep_cols]

# Columns that are not free text; only these are passed to the scaler.
non_text_cols = [col for col in indep_cols if col not in text_cols]

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scalar = StandardScaler()
X_train_sc = scalar.fit_transform(X_train[non_text_cols])
X_test_sc = scalar.transform(X_test[non_text_cols])

# Wrap the scaled arrays back into labelled frames (original row index kept)
# and re-attach the unscaled text column(s).
X_train_df = pd.DataFrame(
    X_train_sc, index=X_train.index, columns=non_text_cols
).join(X_train[text_cols])
X_test_df = pd.DataFrame(
    X_test_sc, index=X_test.index, columns=non_text_cols
).join(X_test[text_cols])

In [34]:
# Distinct summary texts in each split, in order of first appearance, so that
# each text only has to be embedded once.
unique_sum_train_df = pd.Series(pd.unique(X_train_df['summary']))
unique_sum_test_df = pd.Series(pd.unique(X_test_df['summary']))

In [60]:
# Preview the scaled training features with the raw summary text re-attached.
X_train_df

Unnamed: 0,nominate_dim1,nominate_dim2,percent_campaign_vote,election_year,tenure,age,is_sponsor,sponsor_is_same_party,party_D,party_R,...,sponsor_party_R,cosponsors,cosponsors_D,cosponsors_R,cosponsors_ID,cosponsors^2,cosponsors_D^2,cosponsors_R^2,cosponsors_ID^2,summary
16596,0.779598,0.953630,0.500759,-0.411874,-0.276504,0.133386,-0.069432,1.022268,-0.980507,1.019127,...,0.705216,-0.524426,-0.366198,-0.494335,-0.375620,-0.230616,-0.183841,-0.268546,-0.324676,Establishes the congressional budget for the f...
7167,-1.011125,-0.240604,0.501934,-0.411874,2.051458,1.587519,-0.069432,-0.978217,1.019880,-0.981232,...,0.705216,-0.060287,-0.272634,0.209397,-0.375620,-0.191652,-0.182422,-0.161669,-0.324676,(This measure has not been amended since it wa...
9014,1.145519,-0.765343,-1.060854,-0.411874,-0.609070,-1.805457,-0.069432,1.022268,-0.980507,1.019127,...,0.705216,-0.524426,-0.366198,-0.494335,-0.375620,-0.230616,-0.183841,-0.268546,-0.324676,"TITLE I--HEALTH, EDUCATION, LABOR, AND PENSION..."
9184,-0.853322,-1.228562,0.297479,-0.411874,0.499483,-0.545209,-0.069432,-0.978217,1.019880,-0.981232,...,0.705216,-0.524426,-0.366198,-0.494335,-0.375620,-0.230616,-0.183841,-0.268546,-0.324676,Every Student Succeeds Act TITLE I--IMPROVING ...
22256,-0.942515,0.200901,-0.664870,-0.411874,1.053760,0.715039,-0.069432,-0.978217,1.019880,-0.981232,...,0.705216,-0.524426,-0.366198,-0.494335,-0.375620,-0.230616,-0.183841,-0.268546,-0.324676,This resolution sets forth the rules governing...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,0.813903,2.082724,0.709914,-0.411874,2.273168,1.975288,-0.069432,1.022268,-0.980507,1.019127,...,0.705216,-0.181367,-0.366198,0.103837,-0.375620,-0.209329,-0.183841,-0.191327,-0.324676,(This measure has not been amended since it wa...
19648,0.754441,0.182806,-0.368763,-0.411874,0.499483,1.199750,-0.069432,1.022268,-0.980507,1.019127,...,0.705216,1.170692,1.037266,0.807568,3.521528,0.289106,0.135455,0.097240,3.996719,This resolution celebrates the 100th anniversa...
9845,-0.290719,1.829402,0.311579,-0.411874,-0.498215,0.618097,-0.069432,-0.978217,1.019880,-0.981232,...,0.705216,1.473391,1.006078,1.370553,3.521528,0.491294,0.121421,0.481997,3.996719,(This measure has not been amended since it wa...
10799,-0.969959,-0.971620,-0.513291,-0.411874,1.053760,0.908924,-0.069432,-0.978217,1.019880,-0.981232,...,0.705216,-0.463886,-0.303822,-0.459148,-0.375620,-0.229953,-0.183210,-0.268278,-0.324676,(This measure has not been amended since the S...


### Text Vectorization

In [35]:
# Load the sentence-encoder module from `module_path`. The path is relative, so
# this presumably expects a locally cached copy of Universal Sentence Encoder v4
# in the working directory — confirm it exists before running.
module_path = 'universal-sentence-encoder_4'
embed = hub.load(module_path)

In [36]:
# Create embeddings: one vector per unique summary text, computed separately
# for the training and test splits.
X_train_embeddings = embed(unique_sum_train_df.values)
X_test_embeddings = embed(unique_sum_test_df.values)

In [37]:
# Inspect the training embeddings tensor (one row per unique training summary).
X_train_embeddings

<tf.Tensor: shape=(245, 512), dtype=float32, numpy=
array([[ 0.03055005, -0.05715709,  0.00957497, ...,  0.01627482,
         0.04436757,  0.055716  ],
       [-0.00920579, -0.04992998, -0.04987555, ..., -0.04683334,
        -0.04994818, -0.04940972],
       [-0.05184007, -0.05204987, -0.02980429, ...,  0.05189972,
        -0.05062379,  0.04335627],
       ...,
       [-0.01197152, -0.07224006,  0.05763957, ...,  0.0503172 ,
         0.03344295,  0.00411961],
       [ 0.03292277, -0.01363321, -0.00461916, ...,  0.03332947,
        -0.04796211,  0.05001941],
       [-0.05092446, -0.05869472, -0.05279854, ..., -0.05145788,
        -0.05134519,  0.05795386]], dtype=float32)>

In [64]:
# Convert the embedding tensors into DataFrames and pair every row with the
# summary text it was computed from (both sides share the same 0..n-1 index).

# One column name per embedding dimension: sum_0, sum_1, ...
embedding_dim = np.shape(X_train_embeddings)[1]
sum_cols = [f'sum_{i}' for i in range(embedding_dim)]

train_embedding_df = pd.DataFrame(np.asarray(X_train_embeddings), columns=sum_cols)
test_embedding_df = pd.DataFrame(np.asarray(X_test_embeddings), columns=sum_cols)

vec_sum_train_df = unique_sum_train_df.to_frame(name='summary').join(train_embedding_df)
vec_sum_test_df = unique_sum_test_df.to_frame(name='summary').join(test_embedding_df)

In [65]:
# merge() returns a fresh 0..n-1 index, so stash each row's original index in a
# regular column first and restore it from that column after the merge.
for split_df in (X_train_df, X_test_df):
    split_df['copy_index'] = split_df.index

# Left-join the embedding columns onto every row via its summary text.
train_merged = X_train_df.merge(vec_sum_train_df, on='summary', how='left')
test_merged = X_test_df.merge(vec_sum_test_df, on='summary', how='left')

X_train_vec = train_merged.set_index('copy_index')
X_test_vec = test_merged.set_index('copy_index')

### Run Model

In [67]:
# Configure the gradient-boosted tree classifier (a single estimator, not a pipeline).
# `n_jobs` and `random_state` are the scikit-learn-style parameter names; they
# replace the deprecated `nthread` and `seed` aliases used previously.
clf_xgb = xgb.XGBClassifier(
    n_jobs=-1,              # use all available threads
    random_state=1234,      # fixed seed for reproducible row/column subsampling
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,          # each tree is grown on 80% of the rows
    colsample_bytree=0.8,   # each tree is grown on 80% of the columns
    objective='binary:logistic',
    scale_pos_weight=1,     # no re-weighting of the positive class
)

In [76]:
# Columns fed to the model: only the summary-embedding columns (sum_*).
# NOTE(review): the scaled senator/bill features prepared in earlier cells are not
# included here — confirm that a text-only model is what is intended.
# NOTE: this binds `model_cols` to the same list object as `sum_cols` (no copy).
model_cols = sum_cols

In [77]:
# List the feature names that will be passed to the classifier.
model_cols

['sum_0',
 'sum_1',
 'sum_2',
 'sum_3',
 'sum_4',
 'sum_5',
 'sum_6',
 'sum_7',
 'sum_8',
 'sum_9',
 'sum_10',
 'sum_11',
 'sum_12',
 'sum_13',
 'sum_14',
 'sum_15',
 'sum_16',
 'sum_17',
 'sum_18',
 'sum_19',
 'sum_20',
 'sum_21',
 'sum_22',
 'sum_23',
 'sum_24',
 'sum_25',
 'sum_26',
 'sum_27',
 'sum_28',
 'sum_29',
 'sum_30',
 'sum_31',
 'sum_32',
 'sum_33',
 'sum_34',
 'sum_35',
 'sum_36',
 'sum_37',
 'sum_38',
 'sum_39',
 'sum_40',
 'sum_41',
 'sum_42',
 'sum_43',
 'sum_44',
 'sum_45',
 'sum_46',
 'sum_47',
 'sum_48',
 'sum_49',
 'sum_50',
 'sum_51',
 'sum_52',
 'sum_53',
 'sum_54',
 'sum_55',
 'sum_56',
 'sum_57',
 'sum_58',
 'sum_59',
 'sum_60',
 'sum_61',
 'sum_62',
 'sum_63',
 'sum_64',
 'sum_65',
 'sum_66',
 'sum_67',
 'sum_68',
 'sum_69',
 'sum_70',
 'sum_71',
 'sum_72',
 'sum_73',
 'sum_74',
 'sum_75',
 'sum_76',
 'sum_77',
 'sum_78',
 'sum_79',
 'sum_80',
 'sum_81',
 'sum_82',
 'sum_83',
 'sum_84',
 'sum_85',
 'sum_86',
 'sum_87',
 'sum_88',
 'sum_89',
 'sum_90',
 'sum_91'

In [74]:
# Fit model on the embedding columns of the training split.
# X_train_vec keeps the row order of X_train_df (left merge), so its rows line
# up with y_train.
clf_xgb.fit(X_train_vec[model_cols], y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=1234, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=1234, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [79]:
# Score the held-out test rows. predict_proba returns one column per class;
# keep the second column (class index 1) as a plain list of probabilities.
test_class_probabilities = clf_xgb.predict_proba(X_test_vec[model_cols])
predictions_xgb = list(test_class_probabilities[:, 1])

In [80]:
# Area under the ROC curve on the test split, reported as a percentage.
ROC = roc_auc_score(y_test, predictions_xgb)
roc_percent = ROC * 100.0
print("ROC_Test: %.2f%%" % roc_percent)

ROC_Test: 75.12%


# Validation spot-check (TODO: clarify the purpose of this section)

In [None]:
# Spot-check results on validation sample
# NOTE(review): `data_validation` and `X_validation_embeddings` are not defined in
# any cell visible in this notebook, and this cell has no execution count. On a
# fresh kernel it will raise NameError unless they are created elsewhere.
# NOTE(review): `spot_check` is a second name for the same object as
# `data_validation` (no copy is made), so the column assignment below also
# modifies `data_validation`.
spot_check = data_validation
# Keep the second column of predict_proba (class index 1) as the predicted probability.
validation_probability_predictions_XGB = clf_xgb.predict_proba(X_validation_embeddings)[:, 1]
spot_check['prediction_probabilities_XGB'] = validation_probability_predictions_XGB
