In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy
import math
import re

from textblob import TextBlob as tb
from nltk.corpus import stopwords

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

# list of english stop words
cachedStopWords = stopwords.words("english")

# adding new 'stop' words to account for acronyms and other informal texting --> likely would need to be updated specific to Trump's tweets
newStopWords = ['lmao', 'lol', 'u', 'ur', 'omg', 'hbu', 'ha', 'haha', 'yo', 'let', 'got', 'im', 'okay', 'alright', 'sure']
for newStopWord in newStopWords:
    cachedStopWords.append(newStopWord)

def removeStopWords(text):
    newText = ' '.join([word for word in text.split() if word not in cachedStopWords])
    return newText

In [10]:
# for Trump tweets, each 'document' would be tweets over a given time period. Probably would be best if it were weeks.
# so like document1 would be tweets from Aug 4, 2019 - Aug 10, 2019 and document2 would be tweets from Aug 11 - Aug 17
document1 = "Now one of the important tasks is to identify the title in the body, if we analyse the documents, there are different patterns of alignment of title. But most of the titles are centre aligned. Now we need to figure out a way to extract the title. But before we get all pumped up and start coding, let us analyse the dataset little deep."
document2 = "Now one of the dumbest tasks is to identify the paragraphs in the body, if we analyse the documents, there are different patterns of alignment of paragraphs. But most of the paragraphs are centre aligned. Now we need to figure out a way to extract the title. But before we get all pumped up and start coding, let us analyse the dataset little deep."
document3 = "Now one of the dumbest tasks is to guess the paragraphs in the body, if we guess the documents, there are different guesses of alignment of paragraphs. But most of the paragraphs are centre aligned. Now we need to guess out a way to guess the title. But before we get all guessed up and start guessing, let us guess the dataset little deep."

documents = [document1,document2,document3]

In [21]:
newDocList = []

# remove stop words, make all characters lower-case, and convert into a TextBlob object
for doc in documents:
    newDocList.append(tb(removeStopWords(doc).lower()))

tfidfList = []
scoreList = []
    
# calculate tf-idf for each month's messages
bloblist = newDocList
for i, blob in enumerate(bloblist):
    
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    scores = []
    words = []
    
    # could also skip this and just append sorted_words to the tfidfList and delete the scoreList. I just did this to keep words and scores seperate
    for word, score in sorted_words[:3]: # the number here is determining how many of the top tf-idf words you want added to the list
        scores.append(score)
        words.append(word)
        
    tfidfList.append(words)
    scoreList.append(scores)
        
data = pd.DataFrame({'document':documents,'docCleaned': newDocList,'tf-idf':tfidfList,'score':scoreList})

In [22]:
data

Unnamed: 0,document,docCleaned,tf-idf,score
0,Now one of the important tasks is to identify ...,"(n, o, w, , o, n, e, , i, m, p, o, r, t, a, ...","[important, titles, identify]","[0.012286821457823163, 0.012286821457823163, 0.0]"
1,Now one of the dumbest tasks is to identify th...,"(n, o, w, , o, n, e, , d, u, m, b, e, s, t, ...","[dumbest, identify, paragraphs]","[0.0, 0.0, 0.0]"
2,Now one of the dumbest tasks is to guess the p...,"(n, o, w, , o, n, e, , d, u, m, b, e, s, t, ...","[guess, guesses, guessed]","[0.061434107289115816, 0.012286821457823163, 0..."
