In [12]:
import pandas as pd
import math
from textblob import TextBlob

In [3]:
# Load the news dataset: a line-delimited JSON file (one record per line, hence lines=True)
# read directly from the URL below into a DataFrame.
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/news_some_company.json'
df = pd.read_json(news_path,orient='records',lines=True)

In [9]:
# Rows where "Florida" appears in the title or the text ...
mentions_florida = df['title'].str.contains('Florida') | df['text'].str.contains('Florida')
# ... and "Disney" is missing from at least one of title / text.
# NOTE(review): this is an OR of negations, so a row whose title contains "Disney" still
# passes when its text does not. If the goal is "no Disney anywhere", the operator
# should be `&` — confirm the intent before changing it.
disney_absent_somewhere = ~df['title'].str.contains('Disney') | ~df['text'].str.contains('Disney')
df[mentions_florida & disney_absent_somewhere]

Unnamed: 0,crawled,language,text,title
126,2019-05-07T11:15:50.008+03:00,english,MisterPenguin said: Here's an overlay...\nView...,Reflections – A Disney Lakeside Lodge (Project...
411,2019-05-07T20:57:58.012+03:00,english,TrainChasers said: Makes your state seem reall...,Great-grandmother arrested for having CBD oil ...
721,2019-05-07T23:49:35.019+03:00,english,"cgattis said: ↑ Good grief, how old ARE you??!...",Moving To Florida And Visiting WDW More Often ...
1398,2019-05-08T15:59:46.011+03:00,english,jproff09 said: How often do you people think i...,News - New Gondola Transportation - Disney Sky...
1455,2019-05-08T17:31:57.008+03:00,english,LAKid53 said: Florida is really 2 different st...,Great-grandmother arrested for having CBD oil ...
2043,2019-05-09T05:14:38.001+03:00,english,Hester Jordan Burkhalter spent the day in jail...,Great-Grandma Arrested at Disney World for Hav...


In [18]:
# http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/

def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    Term frequency is the number of times ``word`` occurs in ``blob.words``
    divided by the total number of words in ``blob.words``. The notebook
    passes TextBlob objects, whose ``words`` attribute supplies the tokens.

    Returns 0 for a document with no words, which avoids a division by zero.

    NOTE(review): for a TextBlob, ``blob.words.count`` is presumably
    ``WordList.count``, which appears to be case-insensitive by default, while
    the ``in`` test used by ``n_containing`` is case-sensitive. That mismatch
    would inflate scores for capitalised variants such as "And" — confirm
    against the TextBlob docs.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        return 0
    return words.count(word) / total


def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word``.

    A document counts once if ``word in blob.words`` is true, no matter how
    many times the word occurs in it.
    """
    matches = 0
    for doc in bloblist:
        if word in doc.words:
            matches += 1
    return matches


def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` across ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents and
    ``n`` is the number of documents containing ``word``. The more documents
    contain the word, the lower the value. The ``1 +`` in the divisor prevents
    a division by zero when no document contains the word.

    Two consequences of this formula:
    * when every document contains the word the ratio is below 1, so the
      result is negative;
    * an empty ``bloblist`` makes the ratio 0 and ``math.log`` raises
      ``ValueError``.
    """
    total_docs = len(bloblist)
    docs_with_word = n_containing(word, bloblist)
    return math.log(total_docs / (1 + docs_with_word))


def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    The score is the product of ``tf(word, blob)`` and ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_doc_frequency = idf(word, bloblist)
    return term_frequency * inverse_doc_frequency

In [13]:
# Build one TextBlob per row of df['text'], in row order. This list is the corpus
# that the TF-IDF helpers scan.
bloblist = [TextBlob(article_text) for article_text in df['text']]

len(bloblist)

2931

In [14]:
def get_blob(string):
    """Summarise ``string`` as its five highest TF-IDF words.

    The text is tokenised with TextBlob and every distinct word is scored with
    ``tfidf`` against the module-level ``bloblist`` corpus.

    Parameters
    ----------
    string : str
        Raw document text.

    Returns
    -------
    str
        Up to five entries of the form ``"\\nWord: <word>, TF-IDF: <score>"``
        concatenated in descending score order, with scores rounded to 5
        decimals. Each entry starts with a newline, so a non-empty result
        begins with ``"\\n"``. A text with no words yields ``""``.
    """
    blob = TextBlob(string)
    # Iterate over the blob's words, not over `string` itself: iterating a str
    # yields single characters, which scored letters instead of words and ran a
    # full-corpus idf scan once per character.
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so each
    # word is scored once and ties keep document order under the stable sort.
    distinct_words = dict.fromkeys(blob.words)
    scores = {word: tfidf(word, blob, bloblist) for word in distinct_words}
    top_words = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:5]
    return "".join(
        "\n" + "Word: {}, TF-IDF: {}".format(word, round(score, 5))
        for word, score in top_words
    )

In [22]:
# Register tqdm with pandas so that `progress_apply` can be called on df['text']
# with a progress bar.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from tqdm.auto import tqdm
tqdm.pandas()

In [24]:
# Print the five highest TF-IDF words for each of the first five documents.
# The printed label says "tweet", although the blobs were built from df['text'].
for doc_number, doc in enumerate(bloblist[:5], start=1):
    print("Top words in tweet {}".format(doc_number))
    scores = {}
    for word in doc.words:
        scores[word] = tfidf(word, doc, bloblist)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for word, score in ranked[:5]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in tweet 1
	Word: And, TF-IDF: 0.06054
	Word: You, TF-IDF: 0.05518
	Word: Have, TF-IDF: 0.0524
	Word: I, TF-IDF: 0.04151
	Word: FuelRod, TF-IDF: 0.03895
Top words in tweet 2
	Word: 50th, TF-IDF: 0.08855
	Word: I, TF-IDF: 0.06843
	Word: castle, TF-IDF: 0.06629
	Word: guess, TF-IDF: 0.05982
	Word: Castle, TF-IDF: 0.05982
Top words in tweet 3
	Word: McMahon, TF-IDF: 0.21012
	Word: league, TF-IDF: 0.20411
	Word: 05-06-2019, TF-IDF: 0.08011
	Word: go-around, TF-IDF: 0.08011
	Word: gimmicks, TF-IDF: 0.08011
Top words in tweet 4
	Word: Cups, TF-IDF: 0.07644
	Word: cups, TF-IDF: 0.06684
	Word: And, TF-IDF: 0.06049
	Word: To, TF-IDF: 0.0499
	Word: EPCOT, TF-IDF: 0.04029
Top words in tweet 5
	Word: Fullscreen, TF-IDF: 0.10776
	Word: And, TF-IDF: 0.05067
	Word: A, TF-IDF: 0.03198
	Word: In, TF-IDF: 0.03015
	Word: Land, TF-IDF: 0.02982


In [23]:
# Store get_blob's summary string for every row of df['text'] in a new 'topic' column.
# NOTE(review): the recorded run of this cell was interrupted at 0/2931 rows, so the saved
# notebook never populated 'topic'. Each item get_blob scores triggers a scan of the whole
# bloblist (via tfidf -> idf -> n_containing), so this cell is expensive.
df['topic'] = df['text'].progress_apply(get_blob)

  0%|          | 0/2931 [00:00<?, ?it/s]

KeyboardInterrupt: 