In [1]:
 %matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#Load the House membership records covering congressional sessions 109 through 118
congdf = pd.read_csv(filepath_or_buffer='Resources/housesessions109_118.csv')
#Preview the first rows of the loaded table
congdf.head()

Unnamed: 0,uniqueid,fullname,gender,dob,party,stateabr,district,CongressSession,AgeWhenSessionStarted,sessionstartedon,sessionendedon,twitterhandle,joinedon,AgeWhenTwitterJoined,yearjoinedontwitter
0,McP-NC10,"McHenry, Patrick T.",M,1975-11-22,Republican,NC,10,109,29,2005-01-03,2007-01-03,PatrickMcHenry,2009-03-01,33.0,2009.0
1,PuA-FL12,"Putnam, Adam H.",M,1974-07-31,Republican,FL,12,109,30,2005-01-03,2007-01-03,adamputnam,2009-06-01,34.0,2009.0
2,RyT-OH13,"Ryan, Tim",M,1973-07-16,Democratic,OH,13,109,31,2005-01-03,2007-01-03,RepTimRyan,2008-02-01,34.0,2008.0
3,BoD-OK02,"Boren, Dan",M,1973-08-02,Democratic,OK,2,109,31,2005-01-03,2007-01-03,DanBorenOK,2020-04-01,46.0,2020.0
4,NuD-CA22,"Nunes, Devin",M,1973-10-01,Republican,CA,22,109,31,2005-01-03,2007-01-03,DevinNunes,2009-07-01,35.0,2009.0


In [3]:
#Data cleaning, step 1:
#tally the raw gender codes so inconsistent spellings become visible
congdf.gender.value_counts()

M    3519
F     927
f       4
Name: gender, dtype: int64

In [4]:
#Encode gender numerically: 'M' -> 0, 'F' -> 1
#(the lowercase 'f' variant is folded into the same code as 'F')
congdf['gender'] = congdf['gender'].replace({'M': 0, 'F': 1, 'f': 1})


In [5]:
#Tally the distinct party labels before encoding them
congdf.party.value_counts()

Republican     2237
Democratic     2202
Independent       6
Libertarian       5
Name: party, dtype: int64

In [6]:
#Encode party numerically: Republican -> 0, Democratic -> 1, Independent -> 2, Libertarian -> 3
congdf['party'] = congdf['party'].replace(
    {'Republican': 0, 'Democratic': 1, 'Independent': 2, 'Libertarian': 3}
)

In [7]:
#Cast the two encoded columns (gender, party) to integer dtype
congdf = congdf.astype(dict.fromkeys(['gender', 'party'], 'int'))

In [8]:
#Split the cleaned table into one raw DataFrame per CongressSession (109 through 118).
#The generator yields the sessions in ascending order, matching the names on the left.
(
    c109rawdf, c110rawdf, c111rawdf, c112rawdf, c113rawdf,
    c114rawdf, c115rawdf, c116rawdf, c117rawdf, c118rawdf,
) = (
    congdf.loc[congdf['CongressSession'] == session_number]
    for session_number in range(109, 119)
)


In [9]:
#Session 109 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c109twitdf = (
    pd.to_datetime(c109rawdf['joinedon']) < pd.to_datetime(c109rawdf['sessionendedon'])
).astype(int).tolist()

In [10]:
#Count how many session-109 members are flagged 1 (account existed) versus 0
pd.Series(data=c109twitdf).value_counts()

0    444
dtype: int64

In [11]:
#Session 110 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c110twitdf = (
    pd.to_datetime(c110rawdf['joinedon']) < pd.to_datetime(c110rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c110twitdf).value_counts()

0    390
1     55
dtype: int64

In [12]:
#Session 111 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c111twitdf = (
    pd.to_datetime(c111rawdf['joinedon']) < pd.to_datetime(c111rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c111twitdf).value_counts()

1    245
0    202
dtype: int64

In [34]:
#Cleaning Training Data for 111
#Combine the two earlier sessions (109 and 110) into a single training frame.
#pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
c111traindf = pd.concat([c109rawdf, c110rawdf], ignore_index=True)
#Drop duplicates, keeping most recent entries: session-110 rows are concatenated last,
#so keep='last' retains them for members present in both sessions.
#.copy() yields an independent frame that is safe to modify below.
c111traindf = c111traindf.drop_duplicates(subset=['uniqueid'], keep='last').copy()
#Rows whose Twitter join date falls before the end of the retained session.
#A missing 'joinedon' parses to NaT and NaT < date is False, so it counts as "no account".
c111joinedmask = (
    pd.to_datetime(c111traindf['joinedon']) < pd.to_datetime(c111traindf['sessionendedon'])
)
#Create corresponding success series (1 = account existed, 0 = otherwise)
c111traintwitdf = c111joinedmask.astype(int).tolist()
#Correct Age in training set: for successes, replace the age at session start
#with the age at which the member joined Twitter
c111traindf.loc[c111joinedmask, 'AgeWhenSessionStarted'] = (
    c111traindf.loc[c111joinedmask, 'AgeWhenTwitterJoined']
)
#Check Distribution of Twitter Accounts in the training labels
pd.Series(c111traintwitdf).value_counts()

0    457
1     55
dtype: int64

In [41]:
#Select the three model features from the training frame (sessions 109-110)
#and from the session-111 frame used for testing
c111train = c111traindf.loc[:, ['gender','party','AgeWhenSessionStarted']]
c111test = c111rawdf.loc[:, ['gender','party','AgeWhenSessionStarted']]
#Build the logistic regression classifier and fit it on the training labels
#(fit returns the estimator itself, so construction and training are chained)
classifier111 = LogisticRegression(max_iter=10000).fit(c111train, c111traintwitdf)
#Report mean accuracy on the training data and on the session-111 data
print(f"Training Data Score: {classifier111.score(c111train,c111traintwitdf)}")
print(f"Testing Data Score: {classifier111.score(c111test,c111twitdf)}")

Training Data Score: 0.892578125
Testing Data Score: 0.4519015659955257


In [43]:
#Create Confusion Matrix for Session 111
#Actual flags for session 111 versus the classifier's predictions on the same rows
true111 = c111twitdf
pred111 = classifier111.predict(c111test)
#labels=[0, 1] pins the matrix to 2x2 even if only one class shows up in the actual and
#predicted values; without it the result could be 1x1 and would not fit the labels below.
cm111 = confusion_matrix(true111, pred111, labels=[0, 1])
#Turn into DataFrame (rows = actual class, columns = predicted class)
cm111df = pd.DataFrame(cm111,index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
cm111df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,202,0
Actual 1,245,0


In [13]:
#Session 112 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c112twitdf = (
    pd.to_datetime(c112rawdf['joinedon']) < pd.to_datetime(c112rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c112twitdf).value_counts()

1    354
0     92
dtype: int64

In [None]:
#Cleaning Training Data for 112
#TODO: build the session-112 training frame. The original statement was left unfinished
#("c112train = " with no right-hand side), which is a syntax error and stops Run All.
#Until it is implemented the name is bound to None as an explicit placeholder.
c112train = None

In [14]:
#Session 113 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c113twitdf = (
    pd.to_datetime(c113rawdf['joinedon']) < pd.to_datetime(c113rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c113twitdf).value_counts()

1    397
0     48
dtype: int64

In [15]:
#Session 114 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c114twitdf = (
    pd.to_datetime(c114rawdf['joinedon']) < pd.to_datetime(c114rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c114twitdf).value_counts()

1    410
0     34
dtype: int64

In [16]:
#Session 115 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c115twitdf = (
    pd.to_datetime(c115rawdf['joinedon']) < pd.to_datetime(c115rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c115twitdf).value_counts()

1    425
0     22
dtype: int64

In [17]:
#Session 116 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c116twitdf = (
    pd.to_datetime(c116rawdf['joinedon']) < pd.to_datetime(c116rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c116twitdf).value_counts()

1    432
0     11
dtype: int64

In [18]:
#Session 117 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c117twitdf = (
    pd.to_datetime(c117rawdf['joinedon']) < pd.to_datetime(c117rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c117twitdf).value_counts()

1    443
0      6
dtype: int64

In [19]:
#Session 118 Twitter existence flags, one per member row:
#1 = the account was created before the session ended, 0 = created later or no join date.
#Columns are addressed by name instead of by position so a change in column order
#cannot silently compare the wrong fields.
#Parsing to datetimes turns a missing 'joinedon' into NaT; NaT < date is False, i.e. flag 0.
c118twitdf = (
    pd.to_datetime(c118rawdf['joinedon']) < pd.to_datetime(c118rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c118twitdf).value_counts()

1    437
0      3
dtype: int64