In [1]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from collections import Counter
from sklearn.metrics import f1_score, precision_score, recall_score

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
# Global seaborn theme for every figure in the notebook: dark grid, 'notebook'
# context, a 14x10 default figure size and a doubled font scale.
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

# pandas display options: raise the row/column display limits to 100 and pass
# None for the width and column-width settings.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
# NOTE(review): None here presumably silences pandas' chained-assignment
# warning for the whole notebook -- confirm that is intended.
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
# NOTE(review): this builds a fresh RandomState and never assigns it, so it is
# only echoed as the cell's output; the seeding is done by the two calls above.
np.random.RandomState(1)

RandomState(MT19937) at 0x7FB8E1891840

In [None]:
# Read the tab-separated predictions file and report its (rows, columns).
df = pd.read_csv(
    'predictions.tsv',
    sep='\t',
)
df.shape

In [None]:
# Show the predictions frame as this cell's output.
df

In [None]:
# Label each prediction 'right' or 'wrong' with a single vectorised comparison
# instead of running a Python lambda once per row.
is_correct = df['pred'] == df['group']
df['error'] = np.where(is_correct, 'right', 'wrong')

# NOTE(review): despite its name, tok_len is the length of the raw
# `tweet_clean` value; if that column holds plain strings this is a character
# count, not a token count -- confirm which one is intended.
# .str.len() yields NaN for a missing tweet instead of raising TypeError.
df['tok_len'] = df['tweet_clean'].str.len()

# Both subsets come from the same mask, so together they partition df exactly.
wrong = df[~is_correct]
right = df[is_correct]
wrong.shape, right.shape

Are the misclassified tweets more likely to be longer? It looks like they are not.

In [None]:
# One KDE curve of tok_len per error label, drawn on a shared axis.
sns.displot(
    data=df,
    x='tok_len',
    hue='error',
    kind='kde',
    common_norm=False,
    height=10,
)

The model performs well on Joy and Admiration, but not on Disgust.

In [None]:
# Shapes of the Disgust rows, first those predicted right, then those predicted wrong.
tuple(
    df[(df['Disgust'] == True) & (df['error'] == outcome)].shape
    for outcome in ('right', 'wrong')
)

In [None]:
# Shapes of the Sadness rows, first those predicted right, then those predicted wrong.
tuple(
    df[(df['Sadness'] == True) & (df['error'] == outcome)].shape
    for outcome in ('right', 'wrong')
)

In [None]:
# Misclassified rows that are flagged as Disgust.
df[
    (df['error'] == 'wrong')
    & (df['Disgust'] == True)
]

In [None]:
# Misclassified rows that are flagged as Sadness.
df[
    (df['error'] == 'wrong')
    & (df['Sadness'] == True)
]

In [None]:
# Misclassified rows that are flagged as Joy.
df[
    (df['error'] == 'wrong')
    & (df['Joy'] == True)
]

In [None]:
# Misclassified rows that are flagged as Admiration.
df[
    (df['error'] == 'wrong')
    & (df['Admiration'] == True)
]

## Loading the dataframe of all mention tweets and sampling larger datasets from it

In [2]:
# Read every mention tweet, keep only the six columns used below, discard rows
# with any missing value among them, and store `group` as integers.
mdf = pd.read_csv(
    '../../../../lib-data/congressional-tweets-main/all-mention-tweets.tsv',
    sep='\t',
    engine='python',
)
mdf = mdf[['id', 'tweet', 'username', 'mentname', 'group', 'party']].dropna()
mdf['group'] = mdf['group'].astype(int)
mdf

Unnamed: 0,id,tweet,username,mentname,group,party
0,238056105207554048,"""...to fight for their state like this."" @Randy_Forbes #AR3 #Arkansas",rep_stevewomack,randy_forbes,1,R
1,227768354516131841,Hearing on #MEA is beginning now in House Judiciary Committee #AR3 #Arkansas @RepSpeier,rep_stevewomack,repspeier,-1,R
2,225031228871557120,Thanks 2 everyone who participated in tonight's special Tele-Town Hall w/ @RepTomPrice as we discussed the #healthcare bill. #AR3 #Arkansas,rep_stevewomack,reptomprice,1,R
3,224921430259810306,Tele-Town Hall 2Nite 7PM CT w/ guest @RepTomPrice - Call-in # (877) 229-8493 &amp; passcode 17688 #AR3 #Arkansas,rep_stevewomack,reptomprice,1,R
4,165124042255380481,I am honored to be appointed by @SpeakerBoehner to the US #Military Academy Board of Visitors #AR03,rep_stevewomack,speakerboehner,1,R
...,...,...,...,...,...,...
73180,1387606571824336896,I know I'm not the only one who was moved by @SenatorTimScott's depiction of all the promise &amp; opportunity here in America. We both lived out our American Dream - and we want all the kids across this great nation to find theirs too.,repkevinhern,senatortimscott,1,R
73181,1383077783954460686,Glad to hear it! I’d love to work w/ you on @RepJasonSmith’s HR 1712 to repeal the estate tax (aka the death tax). It’s not fair &amp; it’s not just. The last thing a grieving son or daughter needs is the burden of heavy taxes on their parent’s assets - that have already been taxed!,repkevinhern,repjasonsmith,1,R
73182,1359936343116288002,"Today, Democrats rejected two amendments from @RepArrington &amp; myself, choosing to spend your taxpayer dollars on subsidized healthcare for people in our country illegally while voting against relief for the American energy workers who are unemployed because of Biden’s policies.",repkevinhern,reparrington,1,R
73183,1345829663076179968,"Pelosi’s lack of leadership cost millions of Americans their jobs this year. We’ll be recovering from the damage she caused for years. We need a common-sense leader in the House, which is why I voted for @GOPLeader just now.",repkevinhern,gopleader,1,R


In [3]:
# Count how many rows carry each distinct `group` value.
Counter(mdf['group'])

Counter({1: 51105, -1: 22076})

In [4]:
# Balanced samples with an equal number of rows per `group` value.
# For each frame the group == 1 rows are drawn before the group == -1 rows.
two_df = pd.concat(
    [mdf[mdf['group'] == label].sample(3000) for label in (1, -1)],
    ignore_index=True,
)

three_df = pd.concat(
    [mdf[mdf['group'] == label].sample(4500) for label in (1, -1)],
    ignore_index=True,
)

In [5]:
def train_valid_test(train_frac=0.8, dev_frac=0.5, rng=None):
    """Randomly pick a split label: 'train', 'dev' or 'test'.

    A first uniform draw assigns 'train' with probability ``train_frac``.
    Only when that draw does not select 'train' is a second draw made, which
    assigns 'dev' with probability ``dev_frac`` and 'test' otherwise. With the
    defaults this gives roughly 80% / 10% / 10%.

    Args:
        train_frac: Probability of returning 'train'; must lie in [0, 1].
        dev_frac: Probability of 'dev' among the non-train draws; must lie
            in [0, 1].
        rng: Optional object exposing a ``random()`` method that returns a
            float in [0, 1) (for example ``random.Random(seed)``). When
            omitted, the module-level ``random.random`` is used, so
            ``random.seed`` still controls the outcome.

    Returns:
        One of the strings 'train', 'dev' or 'test'.

    Raises:
        ValueError: If either fraction falls outside [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac!r}")
    if not 0.0 <= dev_frac <= 1.0:
        raise ValueError(f"dev_frac must be within [0, 1], got {dev_frac!r}")

    # Resolved at call time so that seeding the global generator keeps working.
    draw = random.random if rng is None else rng.random

    if draw() < train_frac:
        return 'train'
    # The second draw happens only for non-train outcomes, which keeps the
    # number of random numbers consumed per call the same as before.
    return 'dev' if draw() < dev_frac else 'test'

# Draw a split label for every row. The lambda ignores the row it receives;
# apply(axis=1) is used only as a way to call train_valid_test() row by row.
# NOTE(review): labels are drawn independently, so the resulting split sizes
# are not fixed, and they depend on the state of `random` when this cell runs.
two_df['Split'] = two_df.apply(lambda x: train_valid_test(), axis=1)
three_df['Split'] = three_df.apply(lambda x: train_valid_test(), axis=1)

# Write both frames as tab-separated files, leaving out the index column.
two_df.to_csv('two_data.tsv', sep='\t', index=False)
three_df.to_csv('three_data.tsv', sep='\t', index=False)