In [1]:
 %matplotlib inline
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#Load the House membership data for Congressional sessions 109-118
#(the path is relative to the notebook's working directory)
congdf = pd.read_csv('Resources/housesessions109_118.csv')
#Preview the first rows to confirm the columns loaded as expected
congdf.head()

Unnamed: 0,uniqueid,fullname,gender,dob,party,stateabr,district,CongressSession,AgeWhenSessionStarted,sessionstartedon,sessionendedon,twitterhandle,joinedon,AgeWhenTwitterJoined,yearjoinedontwitter
0,McP-NC10,"McHenry, Patrick T.",M,1975-11-22,Republican,NC,10,109,29,2005-01-03,2007-01-03,PatrickMcHenry,2009-03-01,33.0,2009.0
1,PuA-FL12,"Putnam, Adam H.",M,1974-07-31,Republican,FL,12,109,30,2005-01-03,2007-01-03,adamputnam,2009-06-01,34.0,2009.0
2,RyT-OH13,"Ryan, Tim",M,1973-07-16,Democratic,OH,13,109,31,2005-01-03,2007-01-03,RepTimRyan,2008-02-01,34.0,2008.0
3,BoD-OK02,"Boren, Dan",M,1973-08-02,Democratic,OK,2,109,31,2005-01-03,2007-01-03,DanBorenOK,2020-04-01,46.0,2020.0
4,NuD-CA22,"Nunes, Devin",M,1973-10-01,Republican,CA,22,109,31,2005-01-03,2007-01-03,DevinNunes,2009-07-01,35.0,2009.0


In [3]:
#Clean Values
#Tally each distinct gender code so inconsistent spellings show up before encoding
gender_counts = congdf['gender'].value_counts()
gender_counts

M    3519
F     927
f       4
Name: gender, dtype: int64

In [4]:
#Encode gender numerically: Male = 0, Female = 1 (the lowercase 'f' variant is folded into Female)
congdf['gender'] = congdf['gender'].replace({'M': 0, 'F': 1, 'f': 1})

In [5]:
#Tally each distinct party label before encoding it
party_counts = congdf['party'].value_counts()
party_counts

Republican     2237
Democratic     2202
Independent       6
Libertarian       5
Name: party, dtype: int64

In [6]:
#Encode party numerically: Republican = 0, Democratic = 1, Independent = 2, Libertarian = 3
congdf['party'] = congdf['party'].replace({
    'Republican': 0,
    'Democratic': 1,
    'Independent': 2,
    'Libertarian': 3,
})

In [7]:
#Cast the two encoded columns to integer dtype
congdf = congdf.astype({encoded_col: 'int' for encoded_col in ('gender', 'party')})

In [8]:
#Split the full table into one raw DataFrame per CongressSession (109 through 118)
session_col = congdf['CongressSession']
c109rawdf = congdf.loc[session_col.eq(109)]
c110rawdf = congdf.loc[session_col.eq(110)]
c111rawdf = congdf.loc[session_col.eq(111)]
c112rawdf = congdf.loc[session_col.eq(112)]
c113rawdf = congdf.loc[session_col.eq(113)]
c114rawdf = congdf.loc[session_col.eq(114)]
c115rawdf = congdf.loc[session_col.eq(115)]
c116rawdf = congdf.loc[session_col.eq(116)]
c117rawdf = congdf.loc[session_col.eq(117)]
c118rawdf = congdf.loc[session_col.eq(118)]


In [9]:
#Session 109 Twitter existence labels (list of 0/1, one per row of c109rawdf)
#A member is labelled 1 only when a join date is recorded AND it falls before the
#session's end date; a missing join date or a later join date is labelled 0.
#Columns are addressed by name rather than position, and the check is vectorized.
#The dates shown by congdf.head() are ISO 'YYYY-MM-DD' strings, so '<' on the raw
#strings orders them chronologically.
c109twitdf = (
    c109rawdf['joinedon'].notna()
    & (c109rawdf['joinedon'] < c109rawdf['sessionendedon'])
).astype(int).tolist()

In [10]:
#Count how many Session 109 members fall into each label (0 = no account yet, 1 = account existed)
c109twit_counts = pd.Series(c109twitdf).value_counts()
c109twit_counts

0    444
dtype: int64

In [11]:
#Same Steps for Session 110
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c110twitdf = (
    c110rawdf['joinedon'].notna()
    & (c110rawdf['joinedon'] < c110rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c110twitdf).value_counts()

0    390
1     55
dtype: int64

In [12]:
#Session 111
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c111twitdf = (
    c111rawdf['joinedon'].notna()
    & (c111rawdf['joinedon'] < c111rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c111twitdf).value_counts()

1    245
0    202
dtype: int64

In [13]:
#Cleaning Training Data for 111
#Combine the two previous sessions (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c111traindf = pd.concat([c109rawdf, c110rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c111traindf = c111traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c111trainjoined = (
    c111traindf['joinedon'].notna()
    & (c111traindf['joinedon'] < c111traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c111traintwitdf = c111trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c111traindf.loc[c111trainjoined, 'AgeWhenSessionStarted'] = c111traindf.loc[c111trainjoined, 'AgeWhenTwitterJoined']

In [14]:
#Select the three predictor columns for the training and testing frames
c111train = c111traindf.loc[:, ['gender','party','AgeWhenSessionStarted']]
c111test = c111rawdf.loc[:, ['gender','party','AgeWhenSessionStarted']]
#Build the logistic regression and fit it on the training data (fit returns the estimator)
classifier111 = LogisticRegression(max_iter=10000).fit(c111train, c111traintwitdf)
#Mean accuracy on each split
train_score111 = classifier111.score(c111train, c111traintwitdf)
test_score111 = classifier111.score(c111test, c111twitdf)
print(f"Training Data Score: {train_score111}")
print(f"Testing Data Score: {test_score111}")
#Class balance of the training labels
pd.Series(c111traintwitdf).value_counts()

Training Data Score: 0.892578125
Testing Data Score: 0.4519015659955257


0    457
1     55
dtype: int64

In [15]:
#Confusion matrix for Session 111: rows are the actual labels, columns the predicted labels
pred111 = classifier111.predict(c111test)
true111 = c111twitdf
cm111 = confusion_matrix(true111, pred111)
#Wrap the raw array in a labelled DataFrame for display
cm111df = pd.DataFrame(
    data=cm111,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm111df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,202,0
Actual 1,245,0


In [16]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm111df.values.ravel()
#Precision = TP / (TP + FP). It is undefined when the model predicts no positives,
#so report nan explicitly instead of dividing 0 by 0 (which raises a RuntimeWarning)
precision111 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision111}")
#Sensitivity = TP / (TP + FN); undefined when there are no actual positives
sensitivity111 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity111}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true111,pred111,zero_division=0))

Precision: nan
Sensitvity:0.0
              precision    recall  f1-score   support

           0       0.45      1.00      0.62       202
           1       0.00      0.00      0.00       245

    accuracy                           0.45       447
   macro avg       0.23      0.50      0.31       447
weighted avg       0.20      0.45      0.28       447



  This is separate from the ipykernel package so we can avoid doing imports until
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
#Tossing out Session 109: train on Session 110 alone
#Select the predictor columns for the new training frame
c111train2 = c110rawdf.loc[:, ['gender','party','AgeWhenSessionStarted']]
#The test frame (c111test) is reused from the earlier Session 111 model
#Build a fresh logistic regression and fit it against the Session 110 labels
classifier111v2 = LogisticRegression(max_iter=10000).fit(c111train2, c110twitdf)
#Mean accuracy on each split
train_score111v2 = classifier111v2.score(c111train2, c110twitdf)
test_score111v2 = classifier111v2.score(c111test, c111twitdf)
print(f"Training Data Score: {train_score111v2}")
print(f"Testing Data Score: {test_score111v2}")

Training Data Score: 0.8764044943820225
Testing Data Score: 0.4519015659955257


In [18]:
#Create Confusion Matrix for Session 111 using the second model (trained on Session 110 only)
true111v2 = c111twitdf
#Predict with classifier111v2 -- the original cell reused classifier111, so the v2 model was never evaluated
pred111v2 = classifier111v2.predict(c111test)
cm111v2 = confusion_matrix(true111v2,pred111v2)
#Turn into DataFrame (built from cm111v2, not the first model's cm111)
cm111v2df = pd.DataFrame(cm111v2,index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
cm111v2df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,202,0
Actual 1,245,0


In [19]:
#Session 112
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c112twitdf = (
    c112rawdf['joinedon'].notna()
    & (c112rawdf['joinedon'] < c112rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c112twitdf).value_counts()

1    354
0     92
dtype: int64

In [20]:
#Cleaning Training Data for 112
#Add Session 111 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c112traindf = pd.concat([c111traindf, c111rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c112traindf = c112traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c112trainjoined = (
    c112traindf['joinedon'].notna()
    & (c112traindf['joinedon'] < c112traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c112traintwitdf = c112trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c112traindf.loc[c112trainjoined, 'AgeWhenSessionStarted'] = c112traindf.loc[c112trainjoined, 'AgeWhenTwitterJoined']

In [21]:
#Select the three predictor columns for the training and testing frames
c112train = c112traindf.loc[:, ['gender','party','AgeWhenSessionStarted']]
c112test = c112rawdf.loc[:, ['gender','party','AgeWhenSessionStarted']]
#Build the logistic regression and fit it on the training data (fit returns the estimator)
classifier112 = LogisticRegression(max_iter=10000).fit(c112train, c112traintwitdf)
#Mean accuracy on each split
train_score112 = classifier112.score(c112train, c112traintwitdf)
test_score112 = classifier112.score(c112test, c112twitdf)
print(f"Training Data Score: {train_score112}")
print(f"Testing Data Score: {test_score112}")

Training Data Score: 0.5716753022452504
Testing Data Score: 0.21973094170403587


In [22]:
#Confusion matrix for Session 112: rows are the actual labels, columns the predicted labels
pred112 = classifier112.predict(c112test)
true112 = c112twitdf
cm112 = confusion_matrix(true112, pred112)
#Wrap the raw array in a labelled DataFrame for display
cm112df = pd.DataFrame(
    data=cm112,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm112df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,91,1
Actual 1,347,7


In [23]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm112df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision112 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision112}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity112 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity112}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true112,pred112,zero_division=0))

Precision: 0.875
Sensitvity:0.01977401129943503
              precision    recall  f1-score   support

           0       0.21      0.99      0.34        92
           1       0.88      0.02      0.04       354

    accuracy                           0.22       446
   macro avg       0.54      0.50      0.19       446
weighted avg       0.74      0.22      0.10       446



In [24]:
#Session 113
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c113twitdf = (
    c113rawdf['joinedon'].notna()
    & (c113rawdf['joinedon'] < c113rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c113twitdf).value_counts()

1    397
0     48
dtype: int64

In [25]:
#Cleaning Training Data for 113
#Add Session 112 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c113traindf = pd.concat([c112traindf, c112rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c113traindf = c113traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c113trainjoined = (
    c113traindf['joinedon'].notna()
    & (c113traindf['joinedon'] < c113traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c113traintwitdf = c113trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c113traindf.loc[c113trainjoined, 'AgeWhenSessionStarted'] = c113traindf.loc[c113trainjoined, 'AgeWhenTwitterJoined']

In [26]:
#Select the three predictor columns for the training and testing frames
c113train = c113traindf.loc[:, ['gender','party','AgeWhenSessionStarted']]
c113test = c113rawdf.loc[:, ['gender','party','AgeWhenSessionStarted']]
#Build the logistic regression and fit it on the training data (fit returns the estimator)
classifier113 = LogisticRegression(max_iter=10000).fit(c113train, c113traintwitdf)
#Mean accuracy on each split
train_score113 = classifier113.score(c113train, c113traintwitdf)
test_score113 = classifier113.score(c113test, c113twitdf)
print(f"Training Data Score: {train_score113}")
print(f"Testing Data Score: {test_score113}")

Training Data Score: 0.5899705014749262
Testing Data Score: 0.8561797752808988


In [27]:
#Confusion matrix for Session 113: rows are the actual labels, columns the predicted labels
pred113 = classifier113.predict(c113test)
true113 = c113twitdf
cm113 = confusion_matrix(true113, pred113)
#Wrap the raw array in a labelled DataFrame for display
cm113df = pd.DataFrame(
    data=cm113,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm113df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4,44
Actual 1,20,377


In [28]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm113df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision113 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision113}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity113 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity113}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true113,pred113,zero_division=0))

Precision: 0.8954869358669834
Sensitvity:0.9496221662468514
              precision    recall  f1-score   support

           0       0.17      0.08      0.11        48
           1       0.90      0.95      0.92       397

    accuracy                           0.86       445
   macro avg       0.53      0.52      0.52       445
weighted avg       0.82      0.86      0.83       445



In [29]:
#Session 114
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c114twitdf = (
    c114rawdf['joinedon'].notna()
    & (c114rawdf['joinedon'] < c114rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c114twitdf).value_counts()

1    410
0     34
dtype: int64

In [30]:
#Cleaning Training Data for 114
#Add Session 113 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c114traindf = pd.concat([c113traindf, c113rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c114traindf = c114traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c114trainjoined = (
    c114traindf['joinedon'].notna()
    & (c114traindf['joinedon'] < c114traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c114traintwitdf = c114trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c114traindf.loc[c114trainjoined, 'AgeWhenSessionStarted'] = c114traindf.loc[c114trainjoined, 'AgeWhenTwitterJoined']
#Isolate Relevant Data for Training and Testing
c114train = c114traindf[['gender','party','AgeWhenSessionStarted']]
c114test = c114rawdf[['gender','party','AgeWhenSessionStarted']]
#Create Logistic Regression Model
classifier114= LogisticRegression(max_iter = 10000)
#Train Model
classifier114.fit(c114train,c114traintwitdf)
#Validate Model (mean accuracy on each split)
print(f"Training Data Score: {classifier114.score(c114train,c114traintwitdf)}")
print(f"Testing Data Score: {classifier114.score(c114test,c114twitdf)}")

Training Data Score: 0.6426701570680629
Testing Data Score: 0.8918918918918919


In [31]:
#Confusion matrix for Session 114: rows are the actual labels, columns the predicted labels
pred114 = classifier114.predict(c114test)
true114 = c114twitdf
cm114 = confusion_matrix(true114, pred114)
#Wrap the raw array in a labelled DataFrame for display
cm114df = pd.DataFrame(
    data=cm114,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm114df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,32
Actual 1,16,394


In [32]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm114df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision114 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision114}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity114 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity114}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true114,pred114,zero_division=0))

Precision: 0.9248826291079812
Sensitvity:0.9609756097560975
              precision    recall  f1-score   support

           0       0.11      0.06      0.08        34
           1       0.92      0.96      0.94       410

    accuracy                           0.89       444
   macro avg       0.52      0.51      0.51       444
weighted avg       0.86      0.89      0.88       444



In [33]:
#Session 115
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c115twitdf = (
    c115rawdf['joinedon'].notna()
    & (c115rawdf['joinedon'] < c115rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c115twitdf).value_counts()

1    425
0     22
dtype: int64

In [34]:
#Cleaning Training Data for 115
#Add Session 114 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c115traindf = pd.concat([c114traindf, c114rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c115traindf = c115traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c115trainjoined = (
    c115traindf['joinedon'].notna()
    & (c115traindf['joinedon'] < c115traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c115traintwitdf = c115trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c115traindf.loc[c115trainjoined, 'AgeWhenSessionStarted'] = c115traindf.loc[c115trainjoined, 'AgeWhenTwitterJoined']
#Isolate Relevant Data for Training and Testing
c115train = c115traindf[['gender','party','AgeWhenSessionStarted']]
c115test = c115rawdf[['gender','party','AgeWhenSessionStarted']]
#Create Logistic Regression Model
classifier115= LogisticRegression(max_iter = 10000)
#Train Model
classifier115.fit(c115train,c115traintwitdf)
#Validate Model (mean accuracy on each split)
print(f"Training Data Score: {classifier115.score(c115train,c115traintwitdf)}")
print(f"Testing Data Score: {classifier115.score(c115test,c115twitdf)}")

Training Data Score: 0.6751207729468599
Testing Data Score: 0.9328859060402684


In [35]:
#Confusion matrix for Session 115: rows are the actual labels, columns the predicted labels
pred115 = classifier115.predict(c115test)
true115 = c115twitdf
cm115 = confusion_matrix(true115, pred115)
#Wrap the raw array in a labelled DataFrame for display
cm115df = pd.DataFrame(
    data=cm115,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm115df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,20
Actual 1,10,415


In [36]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm115df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision115 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision115}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity115 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity115}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true115,pred115,zero_division=0))

Precision: 0.9540229885057471
Sensitvity:0.9764705882352941
              precision    recall  f1-score   support

           0       0.17      0.09      0.12        22
           1       0.95      0.98      0.97       425

    accuracy                           0.93       447
   macro avg       0.56      0.53      0.54       447
weighted avg       0.92      0.93      0.92       447



In [37]:
#Session 116
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c116twitdf = (
    c116rawdf['joinedon'].notna()
    & (c116rawdf['joinedon'] < c116rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c116twitdf).value_counts()

1    432
0     11
dtype: int64

In [38]:
#Cleaning Training Data for 116
#Add Session 115 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c116traindf = pd.concat([c115traindf, c115rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c116traindf = c116traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c116trainjoined = (
    c116traindf['joinedon'].notna()
    & (c116traindf['joinedon'] < c116traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c116traintwitdf = c116trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c116traindf.loc[c116trainjoined, 'AgeWhenSessionStarted'] = c116traindf.loc[c116trainjoined, 'AgeWhenTwitterJoined']
#Isolate Relevant Data for Training and Testing
c116train = c116traindf[['gender','party','AgeWhenSessionStarted']]
c116test = c116rawdf[['gender','party','AgeWhenSessionStarted']]
#Create Logistic Regression Model
classifier116= LogisticRegression(max_iter = 10000)
#Train Model
classifier116.fit(c116train,c116traintwitdf)
#Validate Model (mean accuracy on each split)
print(f"Training Data Score: {classifier116.score(c116train,c116traintwitdf)}")
print(f"Testing Data Score: {classifier116.score(c116test,c116twitdf)}")

Training Data Score: 0.7024608501118568
Testing Data Score: 0.9571106094808126


In [39]:
#Confusion matrix for Session 116: rows are the actual labels, columns the predicted labels
pred116 = classifier116.predict(c116test)
true116 = c116twitdf
cm116 = confusion_matrix(true116, pred116)
#Wrap the raw array in a labelled DataFrame for display
cm116df = pd.DataFrame(
    data=cm116,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm116df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3,8
Actual 1,11,421


In [40]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm116df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision116 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision116}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity116 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity116}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true116,pred116,zero_division=0))

Precision: 0.9813519813519813
Sensitvity:0.9745370370370371
              precision    recall  f1-score   support

           0       0.21      0.27      0.24        11
           1       0.98      0.97      0.98       432

    accuracy                           0.96       443
   macro avg       0.60      0.62      0.61       443
weighted avg       0.96      0.96      0.96       443



In [41]:
#Session 117
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c117twitdf = (
    c117rawdf['joinedon'].notna()
    & (c117rawdf['joinedon'] < c117rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c117twitdf).value_counts()

1    443
0      6
dtype: int64

In [42]:
#Cleaning Training Data for 117
#Add Session 116 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c117traindf = pd.concat([c116traindf, c116rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c117traindf = c117traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c117trainjoined = (
    c117traindf['joinedon'].notna()
    & (c117traindf['joinedon'] < c117traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c117traintwitdf = c117trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c117traindf.loc[c117trainjoined, 'AgeWhenSessionStarted'] = c117traindf.loc[c117trainjoined, 'AgeWhenTwitterJoined']
#Isolate Relevant Data for Training and Testing
c117train = c117traindf[['gender','party','AgeWhenSessionStarted']]
c117test = c117rawdf[['gender','party','AgeWhenSessionStarted']]
#Create Logistic Regression Model
classifier117= LogisticRegression(max_iter = 10000)
#Train Model
classifier117.fit(c117train,c117traintwitdf)
#Validate Model (mean accuracy on each split)
print(f"Training Data Score: {classifier117.score(c117train,c117traintwitdf)}")
print(f"Testing Data Score: {classifier117.score(c117test,c117twitdf)}")

Training Data Score: 0.7338056680161943
Testing Data Score: 0.9732739420935412


In [43]:
#Confusion matrix for Session 117: rows are the actual labels, columns the predicted labels
pred117 = classifier117.predict(c117test)
true117 = c117twitdf
cm117 = confusion_matrix(true117, pred117)
#Wrap the raw array in a labelled DataFrame for display
cm117df = pd.DataFrame(
    data=cm117,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm117df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,4
Actual 1,8,435


In [44]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm117df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision117 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision117}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity117 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity117}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true117,pred117,zero_division=0))

Precision: 0.9908883826879271
Sensitvity:0.981941309255079
              precision    recall  f1-score   support

           0       0.20      0.33      0.25         6
           1       0.99      0.98      0.99       443

    accuracy                           0.97       449
   macro avg       0.60      0.66      0.62       449
weighted avg       0.98      0.97      0.98       449



In [45]:
#Session 118
#Label 1 when a join date is recorded and precedes the session end date, else 0.
#Named columns and a vectorized mask replace the positional row-by-row loop.
c118twitdf = (
    c118rawdf['joinedon'].notna()
    & (c118rawdf['joinedon'] < c118rawdf['sessionendedon'])
).astype(int).tolist()
#Check Distribution of Twitter Accounts
pd.Series(c118twitdf).value_counts()

1    437
0      3
dtype: int64

In [46]:
#Cleaning Training Data for 118
#Add Session 117 to training data (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
c118traindf = pd.concat([c117traindf, c117rawdf], ignore_index=True)
#Drop duplicates, keeping each member's most recent entry
c118traindf = c118traindf.drop_duplicates(subset=['uniqueid'],keep='last')
#Success mask: join date is recorded and falls before the end of that row's session
c118trainjoined = (
    c118traindf['joinedon'].notna()
    & (c118traindf['joinedon'] < c118traindf['sessionendedon'])
)
#Corresponding success labels as a list of 0/1
c118traintwitdf = c118trainjoined.astype(int).tolist()
#Correct Age in training set: for successes, use the age at which the member joined Twitter
c118traindf.loc[c118trainjoined, 'AgeWhenSessionStarted'] = c118traindf.loc[c118trainjoined, 'AgeWhenTwitterJoined']
#Isolate Relevant Data for Training and Testing
c118train = c118traindf[['gender','party','AgeWhenSessionStarted']]
c118test = c118rawdf[['gender','party','AgeWhenSessionStarted']]
#Create Logistic Regression Model
classifier118= LogisticRegression(max_iter = 10000)
#Train Model
classifier118.fit(c118train,c118traintwitdf)
#Validate Model (mean accuracy on each split)
print(f"Training Data Score: {classifier118.score(c118train,c118traintwitdf)}")
print(f"Testing Data Score: {classifier118.score(c118test,c118twitdf)}")

Training Data Score: 0.7509433962264151
Testing Data Score: 0.9772727272727273


In [47]:
#Confusion matrix for Session 118: rows are the actual labels, columns the predicted labels
pred118 = classifier118.predict(c118test)
true118 = c118twitdf
cm118 = confusion_matrix(true118, pred118)
#Wrap the raw array in a labelled DataFrame for display
cm118df = pd.DataFrame(
    data=cm118,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm118df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,1
Actual 1,9,428


In [48]:
#Unpack the 2x2 confusion matrix in row-major order
tn,fp,fn,tp = cm118df.values.ravel()
#Precision = TP / (TP + FP); report nan instead of dividing by zero when nothing is predicted positive
precision118 = tp/(tp+fp) if (tp+fp) > 0 else float('nan')
print(f"Precision: {precision118}")
#Sensitivity = TP / (TP + FN); report nan when there are no actual positives
sensitivity118 = tp/(tp+fn) if (tp+fn) > 0 else float('nan')
print(f"Sensitvity:{sensitivity118}")
#F1 and per-class metrics; zero_division=0 reports undefined metrics as 0 without emitting warnings
print(classification_report(true118,pred118,zero_division=0))

Precision: 0.9976689976689976
Sensitvity:0.9794050343249427
              precision    recall  f1-score   support

           0       0.18      0.67      0.29         3
           1       1.00      0.98      0.99       437

    accuracy                           0.98       440
   macro avg       0.59      0.82      0.64       440
weighted avg       0.99      0.98      0.98       440

