**PROJECT: FAKE NEWS FILTER**

**1. Prerequisites**

In [1]:
!wget https://proai-datasets.s3.eu-west-3.amazonaws.com/fake_news.zip
!unzip fake_news.zip

--2024-08-24 07:40:30--  https://proai-datasets.s3.eu-west-3.amazonaws.com/fake_news.zip
Resolving proai-datasets.s3.eu-west-3.amazonaws.com (proai-datasets.s3.eu-west-3.amazonaws.com)... 3.5.205.182, 16.12.19.18
Connecting to proai-datasets.s3.eu-west-3.amazonaws.com (proai-datasets.s3.eu-west-3.amazonaws.com)|3.5.205.182|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42975911 (41M) [application/zip]
Saving to: ‘fake_news.zip’


2024-08-24 07:40:32 (25.3 MB/s) - ‘fake_news.zip’ saved [42975911/42975911]

Archive:  fake_news.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
from collections import Counter
import numpy as np

In [4]:
df_true = pd.read_csv("True.csv")
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
df_fake = pd.read_csv("Fake.csv")
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


**2. Implementation of Functions**

In [6]:
df_true['source'] = 'true'
df_fake['source'] = 'fake'
df_true_subset = df_true[['title', 'text', 'source']]
df_fake_subset = df_fake[['title', 'text', 'source']]
df_news = pd.concat([df_true_subset, df_fake_subset], ignore_index=True)
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   44898 non-null  object
 1   text    44898 non-null  object
 2   source  44898 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [7]:
nltk.download('stopwords')
english_stopwords = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')
punctuation=set(string.punctuation)

def data_cleaner(dataset):

    def filter_tokens(text):
        doc = nlp(text)
        filtered_tokens = [token.text for token in doc if token.pos_ not in ['PRON', 'VERB', 'ADV', 'AUX', 'ADP']]
        return ' '.join(filtered_tokens)

    dataset_to_return = []
    for sentence in dataset:
        sentence = sentence.lower()
        sentence = ''.join([char for char in sentence if char not in string.punctuation])
        sentence = ' '.join(word for word in sentence.split() if word not in english_stopwords)
        sentence = re.sub('\d', '', sentence)
        sentence = ' '.join(word for word in sentence.split() if len(word) > 3)
        sentence = filter_tokens(sentence)
        dataset_to_return.append(sentence)

    return dataset_to_return


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
vectorizer=TfidfVectorizer()
def bow_tfidf(dataset, vectorizer):
  if vectorizer==None:
    vectorizer=TfidfVectorizer()
    X=vectorizer.fit_transform(dataset)
  else:
    X=vectorizer.transform(dataset)
  return X.toarray(), vectorizer

In [9]:
news_text_cleaned = data_cleaner(df_news['text'])

In [10]:
with open('news_text_cleaned.pkl', 'wb') as f:
    pickle.dump(news_text_cleaned, f)

In [11]:
with open('news_text_cleaned.pkl', 'rb') as f:
    news_text_cleaned = pickle.load(f)

In [12]:
df_news['text_cleaned']=news_text_cleaned

In [13]:
df_1,df_2=train_test_split(df_news,test_size=0.30,random_state=69)
df_train,df_test=train_test_split(df_1,test_size=0.25,random_state=69)

In [14]:
train_news_cleaned,vectorized=bow_tfidf(df_train['text_cleaned'], None)
train_news_cleaned

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
len(df_train[df_train['source']=='fake'])

12396

In [16]:
len(df_train[df_train['source']=='true'])

11175

In [17]:
clf= MLPClassifier(activation='logistic',
                   solver='adam',
                   max_iter=50,
                   hidden_layer_sizes=(5),
                   tol=0.01,
                   verbose=True
                   )

In [18]:
clf.fit(train_news_cleaned,df_train['source'])

Iteration 1, loss = 0.68275715
Iteration 2, loss = 0.64540427
Iteration 3, loss = 0.57945282
Iteration 4, loss = 0.49678697
Iteration 5, loss = 0.41960451
Iteration 6, loss = 0.35592678
Iteration 7, loss = 0.30508798
Iteration 8, loss = 0.26447844
Iteration 9, loss = 0.23158807
Iteration 10, loss = 0.20456770
Iteration 11, loss = 0.18206602
Iteration 12, loss = 0.16310183
Iteration 13, loss = 0.14687521
Iteration 14, loss = 0.13296115
Iteration 15, loss = 0.12083396
Iteration 16, loss = 0.11028910
Iteration 17, loss = 0.10099213
Iteration 18, loss = 0.09277674
Iteration 19, loss = 0.08550158
Iteration 20, loss = 0.07900598
Iteration 21, loss = 0.07320884
Iteration 22, loss = 0.06799444
Iteration 23, loss = 0.06329180
Iteration 24, loss = 0.05903936
Iteration 25, loss = 0.05520108
Iteration 26, loss = 0.05170468
Iteration 27, loss = 0.04852538
Training loss did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.


In [19]:
with open('fake_news_filter.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [20]:
with open('fake_news_filter.pkl', 'rb') as f:
    clf = pickle.load(f)

In [21]:
test_news_cleaned,vectorized=bow_tfidf(df_test['text_cleaned'], vectorized)
test_news_cleaned

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
clf.score(test_news_cleaned,df_test['source'])

0.9753086419753086

In [23]:
X_vectorized = vectorizer.fit_transform(df_news['text_cleaned'])
cross_val_scores = cross_val_score(clf, X_vectorized, df_news['source'], cv=3, scoring='accuracy')

Iteration 1, loss = 0.67388324
Iteration 2, loss = 0.60634936
Iteration 3, loss = 0.50987693
Iteration 4, loss = 0.41375786
Iteration 5, loss = 0.33607729
Iteration 6, loss = 0.27715639
Iteration 7, loss = 0.23259802
Iteration 8, loss = 0.19830472
Iteration 9, loss = 0.17132264
Iteration 10, loss = 0.14969870
Iteration 11, loss = 0.13206617
Iteration 12, loss = 0.11743660
Iteration 13, loss = 0.10513838
Iteration 14, loss = 0.09468635
Iteration 15, loss = 0.08570232
Iteration 16, loss = 0.07790201
Iteration 17, loss = 0.07110377
Iteration 18, loss = 0.06512025
Iteration 19, loss = 0.05983961
Iteration 20, loss = 0.05513984
Iteration 21, loss = 0.05094637
Iteration 22, loss = 0.04720480
Iteration 23, loss = 0.04384096
Iteration 24, loss = 0.04082864
Iteration 25, loss = 0.03811295
Training loss did not improve more than tol=0.010000 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.68052288
Iteration 2, loss = 0.63685302
Iteration 3, loss = 0.56175592
Iteration 4, loss = 0.4764

In [24]:
with open('cross_val_scores.pkl', 'wb') as f:
    pickle.dump(cross_val_scores, f)

In [25]:
with open('cross_val_scores.pkl', 'rb') as f:
    cross_val_scores = pickle.load(f)

In [26]:
cross_val_scores

array([0.9354537 , 0.92529734, 0.96625685])

In [27]:
cross_val_scores.mean()

0.9423359615127622

3. **Dataset Analysis: Fake Dataset**

In [28]:
df_fake['subject'].value_counts()

Unnamed: 0_level_0,count
subject,Unnamed: 1_level_1
News,9050
politics,6841
left-news,4459
Government News,1570
US_News,783
Middle-east,778


In [29]:
fake_text_cleaned=data_cleaner(df_fake['text'])

In [30]:
with open('fake_text_cleaned.pkl', 'wb') as f:
    pickle.dump(fake_text_cleaned, f)

In [31]:
with open('fake_text_cleaned.pkl', 'rb') as f:
    fake_text_cleaned = pickle.load(f)

In [32]:
df_fake['text_cleaned']=fake_text_cleaned

In [33]:
def sent_to_words(items):
  for item in items:
    yield(simple_preprocess(item,deacc=True))

In [34]:
def topic_modeling(subject, num_topics=5, passes=10):
    data_words = list(sent_to_words(subject))

    id2word = corpora.Dictionary(data_words)

    corpus = [id2word.doc2bow(text) for text in data_words]

    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        passes=passes
    )

    for topic_num in range(num_topics):
        pprint(lda_model.print_topic(topic_num))

    fake_lda = lda_model[corpus]

    return lda_model, fake_lda

In [35]:
news_fake = df_fake[df_fake['subject'] == 'News']['text_cleaned']
politics_fake = df_fake[df_fake['subject'] == 'politics']['text_cleaned']
left_fake = df_fake[df_fake['subject'] == 'left-news']['text_cleaned']
government_fake = df_fake[df_fake['subject'] == 'Government News']['text_cleaned']
us_fake = df_fake[df_fake['subject'] == 'US_News']['text_cleaned']
middleeast_fake = df_fake[df_fake['subject'] == 'Middle-east']['text_cleaned']

In [36]:
topic_modeling(news_fake)

('0.012*"people" + 0.008*"republicans" + 0.007*"state" + 0.006*"image" + '
 '0.005*"bill" + 0.005*"republican" + 0.004*"states" + 0.004*"trump" + '
 '0.004*"time" + 0.004*"obama"')
('0.022*"trump" + 0.011*"president" + 0.009*"obama" + 0.008*"russia" + '
 '0.007*"people" + 0.006*"donald" + 0.006*"realdonaldtrump" + 0.006*"image" + '
 '0.005*"january" + 0.004*"house"')
('0.020*"trump" + 0.015*"white" + 0.009*"people" + 0.007*"black" + '
 '0.006*"president" + 0.006*"image" + 0.005*"donald" + 0.005*"house" + '
 '0.004*"racist" + 0.004*"women"')
('0.009*"people" + 0.007*"president" + 0.007*"trump" + 0.006*"court" + '
 '0.006*"image" + 0.004*"water" + 0.004*"supreme" + 0.003*"time" + '
 '0.003*"women" + 0.003*"republicans"')
('0.059*"trump" + 0.014*"donald" + 0.009*"people" + 0.009*"president" + '
 '0.007*"clinton" + 0.007*"image" + 0.007*"campaign" + 0.006*"hillary" + '
 '0.006*"republican" + 0.005*"time"')


(<gensim.models.ldamulticore.LdaMulticore at 0x7f1c1c0195d0>,
 <gensim.interfaces.TransformedCorpus at 0x7f1c1aa7ca90>)

In [37]:
topic_modeling(politics_fake)

('0.013*"trump" + 0.010*"people" + 0.009*"president" + 0.007*"black" + '
 '0.007*"obama" + 0.006*"police" + 0.005*"white" + 0.004*"america" + '
 '0.004*"country" + 0.003*"american"')
('0.009*"clinton" + 0.005*"trump" + 0.005*"state" + 0.004*"people" + '
 '0.004*"hillary" + 0.004*"president" + 0.004*"obama" + 0.004*"news" + '
 '0.004*"group" + 0.003*"america"')
('0.005*"illegal" + 0.005*"immigration" + 0.005*"people" + 0.005*"border" + '
 '0.005*"obama" + 0.004*"federal" + 0.004*"state" + 0.004*"states" + '
 '0.004*"police" + 0.004*"president"')
('0.019*"clinton" + 0.015*"trump" + 0.011*"hillary" + 0.007*"state" + '
 '0.007*"president" + 0.007*"campaign" + 0.005*"former" + 0.005*"election" + '
 '0.005*"people" + 0.004*"house"')
('0.019*"trump" + 0.012*"president" + 0.008*"obama" + 0.006*"hillary" + '
 '0.006*"news" + 0.006*"people" + 0.006*"house" + 0.005*"donald" + '
 '0.004*"white" + 0.004*"media"')


(<gensim.models.ldamulticore.LdaMulticore at 0x7f1c1aea9540>,
 <gensim.interfaces.TransformedCorpus at 0x7f1c16069de0>)

In [38]:
topic_modeling(left_fake)

('0.010*"obama" + 0.008*"trump" + 0.007*"students" + 0.007*"president" + '
 '0.006*"white" + 0.006*"people" + 0.004*"university" + 0.004*"school" + '
 '0.003*"state" + 0.003*"black"')
('0.006*"hillary" + 0.006*"trump" + 0.006*"president" + 0.005*"people" + '
 '0.005*"black" + 0.005*"clinton" + 0.004*"white" + 0.004*"time" + '
 '0.003*"obama" + 0.003*"state"')
('0.007*"president" + 0.006*"obama" + 0.006*"people" + 0.005*"trump" + '
 '0.005*"government" + 0.005*"state" + 0.005*"states" + 0.004*"united" + '
 '0.004*"american" + 0.004*"federal"')
('0.021*"trump" + 0.012*"clinton" + 0.010*"hillary" + 0.008*"president" + '
 '0.007*"people" + 0.006*"news" + 0.005*"donald" + 0.005*"media" + '
 '0.004*"campaign" + 0.004*"white"')
('0.019*"police" + 0.006*"officers" + 0.006*"people" + 0.005*"officer" + '
 '0.005*"black" + 0.004*"city" + 0.004*"county" + 0.004*"news" + '
 '0.004*"school" + 0.003*"video"')


(<gensim.models.ldamulticore.LdaMulticore at 0x7f1c1ca6ebf0>,
 <gensim.interfaces.TransformedCorpus at 0x7f1c1606bca0>)

In [39]:
  topic_modeling(government_fake)

('0.006*"state" + 0.006*"people" + 0.005*"federal" + 0.004*"states" + '
 '0.004*"government" + 0.004*"food" + 0.004*"year" + 0.004*"obama" + '
 '0.004*"million" + 0.003*"years"')
('0.015*"clinton" + 0.008*"state" + 0.008*"department" + 0.007*"iran" + '
 '0.007*"obama" + 0.006*"hillary" + 0.006*"president" + 0.005*"security" + '
 '0.005*"email" + 0.005*"trump"')
('0.016*"obama" + 0.011*"president" + 0.006*"government" + 0.005*"people" + '
 '0.004*"trump" + 0.004*"house" + 0.004*"american" + 0.004*"first" + '
 '0.004*"state" + 0.004*"senate"')
('0.006*"court" + 0.005*"people" + 0.005*"police" + 0.004*"obama" + '
 '0.004*"state" + 0.004*"president" + 0.004*"government" + 0.004*"house" + '
 '0.004*"federal" + 0.003*"muslim"')
('0.007*"people" + 0.005*"trump" + 0.005*"united" + 0.004*"president" + '
 '0.004*"obama" + 0.004*"country" + 0.004*"border" + 0.004*"time" + '
 '0.004*"states" + 0.004*"state"')


(<gensim.models.ldamulticore.LdaMulticore at 0x7f1c1b0525c0>,
 <gensim.interfaces.TransformedCorpus at 0x7f1c1606abf0>)

In [40]:
topic_modeling(us_fake)

('0.007*"trump" + 0.005*"wire" + 0.005*"president" + 0.005*"century" + '
 '0.005*"people" + 0.004*"world" + 0.004*"news" + 0.003*"israel" + '
 '0.003*"clinton" + 0.003*"political"')
('0.011*"media" + 0.009*"news" + 0.007*"story" + 0.006*"wire" + 0.005*"trump" '
 '+ 0.004*"political" + 0.004*"fake" + 0.004*"century" + 0.004*"shooter" + '
 '0.004*"many"')
('0.009*"trump" + 0.008*"syria" + 0.008*"clinton" + 0.007*"russia" + '
 '0.007*"media" + 0.006*"government" + 0.006*"state" + 0.006*"wire" + '
 '0.006*"news" + 0.005*"russian"')
('0.011*"wire" + 0.007*"trump" + 0.006*"syria" + 0.005*"century" + '
 '0.005*"news" + 0.004*"world" + 0.004*"week" + 0.004*"patrick" + '
 '0.004*"political" + 0.004*"president"')
('0.010*"room" + 0.010*"boiler" + 0.008*"trump" + 0.007*"radio" + 0.007*"wire" '
 '+ 0.006*"broadcast" + 0.005*"political" + 0.005*"media" + 0.004*"another" + '
 '0.004*"current"')


(<gensim.models.ldamulticore.LdaMulticore at 0x7f1d1f69b880>,
 <gensim.interfaces.TransformedCorpus at 0x7f1c1ab32800>)

In [41]:
topic_modeling(middleeast_fake)

('0.008*"media" + 0.007*"room" + 0.007*"wire" + 0.007*"boiler" + '
 '0.006*"political" + 0.006*"news" + 0.006*"radio" + 0.005*"story" + '
 '0.004*"another" + 0.004*"episode"')
('0.013*"trump" + 0.011*"news" + 0.010*"media" + 0.009*"wire" + '
 '0.007*"clinton" + 0.006*"century" + 0.006*"election" + 0.005*"political" + '
 '0.005*"hillary" + 0.005*"fake"')
('0.009*"media" + 0.006*"story" + 0.005*"wire" + 0.005*"shooter" + '
 '0.004*"news" + 0.004*"police" + 0.004*"federal" + 0.004*"century" + '
 '0.004*"event" + 0.004*"shooting"')
('0.010*"syria" + 0.007*"government" + 0.006*"wire" + 0.005*"state" + '
 '0.005*"trump" + 0.005*"russia" + 0.005*"syrian" + 0.005*"media" + '
 '0.005*"intelligence" + 0.004*"president"')
('0.015*"trump" + 0.014*"clinton" + 0.007*"president" + 0.005*"election" + '
 '0.005*"wire" + 0.005*"hillary" + 0.005*"campaign" + 0.004*"obama" + '
 '0.004*"century" + 0.004*"state"')


(<gensim.models.ldamulticore.LdaMulticore at 0x7f1c19e7e4a0>,
 <gensim.interfaces.TransformedCorpus at 0x7f1c1606af20>)

In [42]:
top_negative_words_by_subject = {}
for subject in df_fake['subject'].unique():
    subset = df_fake[df_fake['subject'] == subject]
    combined_text = ' '.join(subset['text_cleaned'])
    words = combined_text.split()
    word_counts = Counter(words)
    top_words = [word for word, _ in word_counts.most_common(10)]
    top_negative_words = []
    for word in top_words:
        probabilities = clf.predict_proba(bow_tfidf([word],vectorized)[0])
        negative_index = list(clf.classes_).index('fake')
        negative_percentage = probabilities[0][negative_index] *100
        top_negative_words.append((word, negative_percentage))
    top_negative_words.sort(key=lambda x: x[1], reverse=True)
    top_negative_words = top_negative_words[:5]
    top_negative_words_by_subject[subject] = top_negative_words

for subject, words in top_negative_words_by_subject.items():
    print(f"Subject: {subject}")
    for word, percentage in words:
        print(f"Word: {word}, fake %: {percentage:.2f}%")
    print("\n")

Subject: News
Word: image, fake %: 99.38%
Word: obama, fake %: 98.65%
Word: time, fake %: 98.57%
Word: people, fake %: 91.91%
Word: clinton, fake %: 87.56%


Subject: politics
Word: hillary, fake %: 98.92%
Word: obama, fake %: 98.65%
Word: time, fake %: 98.57%
Word: news, fake %: 97.42%
Word: people, fake %: 91.91%


Subject: Government News
Word: obama, fake %: 98.65%
Word: people, fake %: 91.91%
Word: clinton, fake %: 87.56%
Word: department, fake %: 84.58%
Word: trump, fake %: 84.16%


Subject: left-news
Word: hillary, fake %: 98.92%
Word: obama, fake %: 98.65%
Word: black, fake %: 97.49%
Word: news, fake %: 97.42%
Word: people, fake %: 91.91%


Subject: US_News
Word: wire, fake %: 99.27%
Word: century, fake %: 99.23%
Word: news, fake %: 97.42%
Word: syria, fake %: 91.24%
Word: media, fake %: 88.80%


Subject: Middle-east
Word: wire, fake %: 99.27%
Word: century, fake %: 99.23%
Word: news, fake %: 97.42%
Word: syria, fake %: 91.24%
Word: media, fake %: 88.80%




4. **Dataset Analysis: Focus on Titles**

In [43]:
fake_title_cleaned=data_cleaner(df_fake['title'])

In [44]:
with open('fake_title_cleaned.pkl', 'wb') as f:
    pickle.dump(fake_title_cleaned, f)

In [45]:
with open('fake_title_cleaned.pkl', 'rb') as f:
    fake_title_cleaned = pickle.load(f)

In [49]:
title_words = Counter(word for sublist in sent_to_words(fake_title_cleaned) for word in sublist)
top_words = title_words.most_common(5)
top_words

[('video', 8233),
 ('trump', 7907),
 ('obama', 2542),
 ('hillary', 2270),
 ('clinton', 1118)]

In [50]:
title_vectorized,vectorized=bow_tfidf(fake_title_cleaned, None)
title_vectorized

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [51]:
top_indices = np.argsort(title_vectorized)[0][::-1][:5]
top_words_with_tfidf = [(vectorized.get_feature_names_out()[index], title_vectorized[0, index]) for index in top_indices]
top_words_with_tfidf

[('embarrassing', 0.5787271209859363),
 ('message', 0.5002059241600817),
 ('year', 0.4930444974584706),
 ('donald', 0.37237534192879784),
 ('trump', 0.18196890151842793)]