In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [93]:
import spacy
import pandas as pd
import ast
import matplotlib.pyplot as plt
import json


# Load the spaCy `en_core_web_sm` pipeline once at module level; count_tokens()
# reuses this object on every call instead of reloading the model.
nlp = spacy.load("en_core_web_sm")

def count_tokens(txt):
  """Return how many tokens the module-level `nlp` pipeline produces for `txt`.

  Args:
    txt: The text to tokenize.

  Returns:
    The length of the object returned by `nlp(txt)`.
  """
  doc = nlp(txt)
  return len(doc)

def get_dataset(path_to_data):
  """Load a labelled JSON dataset into a two-column DataFrame.

  Args:
    path_to_data: Path to a JSON file containing a list of records, each
      with a 'description' key and a 'label' key.

  Returns:
    A DataFrame with columns "text" (each record's description) and
    "labels" (1 when the record's label is 'bot', 0 for any other label).
    An empty record list yields an empty DataFrame that still has both
    columns.

  Raises:
    KeyError: If a record is missing 'description' or 'label'.
  """
  # JSON text is UTF-8 by specification; being explicit avoids depending on
  # the platform's default encoding.
  with open(path_to_data, encoding="utf-8") as f:
    records = json.load(f)
  rows = [(record['description'], 1 if record['label'] == 'bot' else 0) for record in records]
  # Naming the columns in the constructor (rather than assigning df.columns
  # afterwards) keeps an empty dataset from raising a length-mismatch error.
  return pd.DataFrame(rows, columns=["text", "labels"])

def _strongest_weight(pairs):
  """Return the pair with the largest absolute second element, or ('', 0) if `pairs` is empty."""
  if len(pairs) > 0:
    return max(pairs, key=lambda pair: abs(pair[1]))
  return ('', 0)

# Training descriptions with 0/1 bot labels.
train_df = get_dataset('/content/drive/MyDrive/advNLP/twibot20/train.json')

# Test-set explanations; the `weights` column is stored in the CSV as a
# Python-literal string, so it is parsed back into Python objects here.
test_explanations_df = pd.read_csv(
  '/content/drive/MyDrive/advNLP/twibot20/test_explanations.csv',
  index_col=0,
)
test_explanations_df['weights'] = test_explanations_df['weights'].apply(ast.literal_eval)

# Derived per-row features: token count of the text, the pair with the largest
# absolute weight, and that absolute weight on its own.
test_explanations_df['token_count'] = test_explanations_df['text'].apply(count_tokens)
test_explanations_df['max_elem'] = test_explanations_df['weights'].apply(_strongest_weight)
test_explanations_df['max_elem_weight'] = test_explanations_df['max_elem'].apply(lambda top: abs(top[1]))

In [122]:
# Pool the entries of every row's `weights` list into one flat list, ordered
# by descending absolute weight (second element of each entry).
all_weights = sorted(
  (pair for row_weights in test_explanations_df['weights'] for pair in row_weights),
  key=lambda pair: -abs(pair[1]),
)

In [133]:
# For the 10 entries with the largest absolute weight, print the token, its
# weight, and the mean label of training rows whose text contains the token.
# The check is a case-sensitive substring test (`token in text`), and the mean
# is NaN when no training row matches.
for token, weight in all_weights[:10]:
  train_bot_pct = train_df.loc[
    train_df['text'].map(lambda description: token in description),
    'labels',
  ].mean()
  print(token, weight, train_bot_pct)

actor -0.814881458705768 0.4230769230769231
Rihanna -0.7891745848179716 nan
Comms 0.7848082280378158 0.5333333333333333
EPM -0.7836205359652342 0.0
Committees -0.7796661190152939 1.0
trump 0.7664076624856749 0.7857142857142857
player -0.7471178447017364 0.47058823529411764
Player -0.7380681953800026 0.2413793103448276
Jesus 0.6891305830033874 0.6382978723404256
Host -0.6880679583576949 0.20270270270270271


In [222]:
# Spot check: mean label of training rows whose text contains "Stuart"
# (substring match); NaN means no training row matched.
train_bot_pct = train_df.loc[
  train_df['text'].map(lambda description: "Stuart" in description),
  'labels',
].mean()
print(train_bot_pct)

nan


In [177]:
# Keep only rows whose `misclassified` value equals True, ordered from the
# highest `bot_prob` to the lowest.
misclassified_df = (
  test_explanations_df
  .loc[test_explanations_df['misclassified'].eq(True)]
  .sort_values(by='bot_prob', ascending=False)
)

In [215]:
# Display the `weights` entry of the row at position 80 in the sorted
# misclassified examples.
misclassified_df['weights'].iloc[80]

[('aspiring', 0.17220007641469315),
 ('author', -0.14323704145736627),
 ('Liz', -0.10643629563713967),
 ('her', 0.10518998474824658),
 ('they', 0.09534336933253146),
 ('editor', -0.09226288870194117)]

In [189]:
# Print text, bot_prob and weights for the first 30 rows of misclassified_df
# (the ones with the highest bot_prob, given the descending sort).
for idx in range(30):
  misclassified_row = misclassified_df.iloc[idx]
  print(misclassified_row['text'], misclassified_row['bot_prob'], misclassified_row['weights'])

Embattled MYsElf for the Worst Very Common to me And you Should also Comply ...Ain't it ???  0.993384607321672 [('Comply', 0.0992785356634549), ('MYsElf', 0.0646187571067114), ('also', 0.05748827598824706), ('Common', 0.0492908101038694), ('Very', 0.037458311374991136), ('Should', 0.03431343454132944)]
Fishing Guide at Big Bite Charters. From the Nature Coast, FL to Lake Lanier, GA.  0.9932941520716714 [('Charters', 0.008091374453649007), ('Fishing', 0.006523069974851756), ('Nature', 0.00648039698969954), ('Guide', 0.006399240508921272), ('Coast', 0.005614362963105646), ('Lake', 0.004863703314708101)]
San Diego realtor who loves to read. Tesla M3 | Books | DRE02061882 Douglas Elliman  0.9930693283432538 [('DRE02061882', 0.046406909727895115), ('realtor', 0.040106325815349154), ('San', 0.03837342378173318), ('loves', 0.033149064444653875), ('Diego', 0.03284224412277421), ('who', 0.02937728127745193)]
NBA basketball news site INSIDE HOOPS! @NBA news, rumors and info on the playoffs, draf

In [188]:
# Print text, bot_prob and weights for the last 20 rows of misclassified_df,
# walking backwards from the final row (lowest bot_prob first, given the
# descending sort).
for idx in range(20):
  misclassified_row = misclassified_df.iloc[-1 - idx]
  print(misclassified_row['text'], misclassified_row['bot_prob'], misclassified_row['weights'])

Spokesperson in the @DNCWarRoom | @hay_rach is my forever best friend. These tweets are my own. he/him/his  0.0066669319312806 [('Spokesperson', -0.6117393689327363), ('in', -0.10760933942520329), ('the', -0.0729043440153854), ('is', 0.051017809375973826), ('friend', -0.03150676734592848), ('his', -0.027144475608207094)]
Stories from the sidelines. Your guide to #UCLA sports from the @dailybruin Sports section.  0.007232763803479 [('UCLA', -0.027384362528522336), ('from', -0.024853495164572515), ('Stories', -0.018452423218349806), ('sports', -0.016214452672731035), ('sidelines', -0.014196990337893586), ('the', -0.013442916666093487)]
New Jersey reporter for @nytimes. tracey.tully@nytimes.com  0.0074030140888726 [('com', -0.21734626242104968), ('reporter', -0.20980004399444488), ('Jersey', -0.061628528764579034), ('tully', 0.061075916078974286), ('for', -0.044040909257139986), ('New', 0.03116220469294539)]
Arkansas State assistant QB coach #family UNC-Chapel Hill alum  0.007489624898808