In [73]:
import transformers
import datasets
import shap
from shap.plots import *
import csv
import nltk
import re, string
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import datetime
import torch
import random
from PIL import Image


In [74]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\'','', text)
    
    return text

In [118]:
# convert date from string to datetime object
def datte():
    for i in range(len(train)):
        train['date'][i] = datetime.datetime.strptime(train['date'][i], '%Y-%m-%d %H:%M:%S')
        
# select n random tweets from start date to end date
def choose_tweets_from_date(start, end, n):
    start = datetime.datetime.strptime(start, '%Y-%m-%d')
    end = datetime.datetime.strptime(end, '%Y-%m-%d')
    choices = []
    for i in range(len(train)):
        if start <= train['date'][i] <= end:
            el = []
            el.append([train['content'][i], i])
            choices.append(el)
        
    
    chosen = []
    chosen.append(random.sample(choices, n))

    return chosen

In [119]:
train = pd.read_csv('WFiIS-MIO-main/datasets/realdonaldtrump.csv')
train = train.drop(['link', 'retweets', 'favorites', 'mentions', 'hashtags'], axis=1)

train['content'] = train['content'].map(lambda x: re.sub(r'@\w+\s', ' ', x))
train['content'] = train['content'].map(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x))
train['content'] = train['content'].map(lambda x: re.sub(r'\W+', ' ', x))
train['content'] = train['content'].replace(r'\W+', ' ', regex=True)
train['content'] = train['content'].apply(lambda x:clean_text(x))
train['token'] = train['content'].apply(lambda x:word_tokenize(x))
train['text'] = train['token'].apply(lambda x: ' '.join([word for word in x if len(word)>2]))
datte()


In [120]:
n = 50
before = choose_tweets_from_date('2011-01-01', '2017-01-19', n)
after = choose_tweets_from_date('2017-01-20', '2021-01-20', n)
before[0]

[[{' jheil at nymag is such a pathetic reporter who doesn t want to know the truth a total obama flunky hack ': 7332}],
 [{'  donald trump leads in new gop polls foxandfriends realdonaldtrump pic twitter com  ': 24310}],
 [{' ianjamespoulter great going and almost as importantly your clothing line is selling well ': 17509}],
 [{'they laughed at me when i said to bomb the isis controlled oil fields now they are not laughing and doing what i said ': 26232}],
 [{' makeamericagreatagain  twitter com ': 27028}],
 [{'i had a great time in texas yesterday a tremendous crowd of wonderful and enthusiastic people will be back soon ': 26260}],
 [{' mccareydanny realdonaldtrump how come nbc is screwing theapprentice by burning off episodes so fast only one hour each ': 19853}],
 [{'with all that is happening with ebola including the doctor who so easily came back to new york obama still refuses to stop the flights ': 18032}],
 [{'i will be interviewed by jdickerson on facethenation tomorrow mornin

In [78]:
top = Counter([item for sublist in train['token'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')


Unnamed: 0,Common_words,count
0,the,34657
1,to,19801
2,and,15928
3,a,15491
4,of,13345
5,is,12886
6,in,11942
7,i,10795
8,you,10716
9,for,9991


In [79]:
fig = px.bar(temp, x="count", y="Common_words", title='Common Words in Selected Text', orientation='h', 
          width=700, height=700,color='Common_words')
fig.show()

In [121]:

classifier = transformers.pipeline('sentiment-analysis', return_all_scores=True)
classifier(before[0])

pmodel = shap.models.TransformersPipeline(classifier, rescale_to_logits=True) 

explainer2 = shap.Explainer(pmodel)
shap_values = explainer2(before[0])
shap.plots.text(shap_values[:,:,1])

for i in range(n):
    shap.plots.bar(shap_values[i, :,"POSITIVE"])
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0))
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0), order=shap.Explanation.argsort)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [89]:

classifier = transformers.pipeline('sentiment-analysis', return_all_scores=True)
classifier(after[0])

pmodel = shap.models.TransformersPipeline(classifier, rescale_to_logits=True)

explainer2 = shap.Explainer(pmodel)
shap_values = explainer2(after[0])
shap.plots.text(shap_values[:,:,1])

for i in range(n):
    shap.plots.bar(shap_values[i, :,"POSITIVE"])
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0))
shap.plots.bar(shap_values[:, :, "POSITIVE"].mean(0), order=shap.Explanation.argsort)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


TypeError: TextInputSequence must be str

In [66]:
average = sum(shap_values[:, :, "POSITIVE"].mean(0).values)/len(shap_values[:, :, "POSITIVE"].mean(0).values)
print(average)

-0.04369382860839609


In [84]:
negative, positive = np.zeros((train.shape[0], 1)), np.zeros((train.shape[0], 1))

In [90]:
for i, tweet in enumerate(after[0]):
    output = classifier(*tweet)
    score = output[0][0].detach().numpy()
    scores = softmax(score)
    negative[i] = scores[0]
    positive[i] = scores[2]

train['Negative'] = negative
train['Positive'] = positive

TypeError: <transformers.pipelines.text_classification.TextClassificationPipeline object at 0x00000186BB843D30> argument after ** must be a mapping, not list