In [14]:
import pandas as pd
import re

import re
import itertools
from nltk import ngrams
from nltk.metrics.distance import jaccard_distance
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
import matplotlib.pyplot as plt
from rake_nltk import Rake
import nltk as nltk
import nltk.corpus

In [9]:
# Source file for the news articles, hosted on Google Cloud Storage.
news_path = 'https://storage.googleapis.com/msca-bdp-data-open/news/news_some_company.json'

# The file is read as line-delimited JSON: one record per line.
df = pd.read_json(
    news_path,
    orient='records',
    lines=True,
)

In [10]:
# Preview the first ten raw records.
df.head(n=10)

Unnamed: 0,crawled,language,text,title
0,2019-05-07T04:18:26.000+03:00,english,No comments\nPacking can be stressful for anyo...,The Most Useful Things I Bring to Disney
1,2019-05-07T04:19:12.028+03:00,english,"I couldn't find another thread for this, so I ...",Walt Disney World 50th Rumored Plans
2,2019-05-07T04:19:38.000+03:00,english,"05-06-2019, 01:01 PM Here we go again with ano...",XFL Strikes Deal with Fox and Disney
3,2019-05-07T04:27:37.005+03:00,english,"Wednesday, July 11, 2018 McDonald's Disney Wor...",McDonald's Disney World Millennium Cups
4,2019-05-07T04:36:07.017+03:00,english,As Disney’s Hollywood Studios celebrated its 3...,Disney World Star Wars: Galaxy's Edge: Hollywo...
5,2019-05-07T04:38:02.018+03:00,english,Hercules and Hades Jack Skellington and Sally\...,Lego Collectibles – Disney Series 2 – Jack and...
6,2019-05-07T04:38:30.015+03:00,english,Jason @disneygeek.com 0 Comments Walt Disney W...,Walt Disney World Resort announces $1.5 millio...
7,2019-05-07T04:40:54.003+03:00,english,,Disney needs balls
8,2019-05-07T04:44:26.009+03:00,english,Enter for your chance to win a trip for four t...,GMA’s Strahan and Sara Disney Trip Sweepstakes
9,2019-05-07T04:44:29.014+03:00,english,Ratman Raving Rabbids TV Party $15\nDisney Inf...,"Nintendo Wii games. Mario, Harry, Disney Infin..."


In [11]:
## Remove NA rows
# NOTE(review): dropna only removes missing values. Rows whose text is an
# empty string are kept and later produce an empty token list.
df = df.dropna(subset=['text'])

## Remove URLs and Mentions
# Strips '@handle', 'http://...', 'https://...' and 'www.<host>...' up to the
# next whitespace. 'www' has to start a word and be followed by a dot, so that
# ordinary words that merely contain "www" (e.g. "awww") are left intact.
url_or_mention_pattern = r'(?:@|https?://|\bwww\.)\S+'
df["text_cleaned"] = (
    df['text']
    .astype(str)
    .str.replace(url_or_mention_pattern, '', regex=True)
)

## Remove new lines
# Runs after the URL/mention removal on purpose: \S+ relies on the newline to
# terminate a match, so each line break must still be whitespace at that point.
df["text_cleaned"] = df["text_cleaned"].str.replace('\n', '|', regex=False)

In [12]:
# Lazily filled cache of NLTK stopword sets keyed by language, so the corpus
# is read once instead of once per article.
_STOPWORDS_CACHE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stopwords for `language` as a frozenset, loading them at most once."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(nltk.corpus.stopwords.words(language))
    return _STOPWORDS_CACHE[language]


# Function to tokenize and clean a single news article
def tokenize_and_clean(news):
    """Split one article's text into cleaned, lowercased word tokens.

    Parameters
    ----------
    news : str
        Text of a single article.

    Returns
    -------
    list of str
        Lowercased tokens in their original order, excluding
        single-character tokens, purely numeric tokens and English
        stopwords.

    Notes
    -----
    Reads the NLTK 'stopwords' corpus on first use.
    """
    # \w+ keeps runs of word characters only, so punctuation such as '#', '@'
    # or the '|' line separators can never be part of a token. A separate
    # hashtag filter is therefore unnecessary.
    news_tokenizer = RegexpTokenizer(r'\w+')
    stopwords = _get_stopwords('english')

    cleaned = []
    for word in news_tokenizer.tokenize(news):
        # Drop single-character tokens and pure numbers.
        if len(word) <= 1 or word.isnumeric():
            continue
        # Stopwords are lowercase, so compare after lowercasing.
        lowered = word.lower()
        if lowered not in stopwords:
            cleaned.append(lowered)

    return cleaned

# Tokenize every cleaned article; each entry of the new column is that row's token list
df['text_cleaned_tokens'] = df['text_cleaned'].map(tokenize_and_clean)

In [13]:
# Preview the first ten rows, now including the text_cleaned and text_cleaned_tokens columns.
df.head(n=10)

Unnamed: 0,crawled,language,text,title,text_cleaned,text_cleaned_tokens
0,2019-05-07T04:18:26.000+03:00,english,No comments\nPacking can be stressful for anyo...,The Most Useful Things I Bring to Disney,No comments|Packing can be stressful for anyon...,"[comments, packing, stressful, anyone, right, ..."
1,2019-05-07T04:19:12.028+03:00,english,"I couldn't find another thread for this, so I ...",Walt Disney World 50th Rumored Plans,"I couldn't find another thread for this, so I ...","[find, another, thread, apologize, duplicated,..."
2,2019-05-07T04:19:38.000+03:00,english,"05-06-2019, 01:01 PM Here we go again with ano...",XFL Strikes Deal with Fox and Disney,"05-06-2019, 01:01 PM Here we go again with ano...","[pm, go, another, professional, football, leag..."
3,2019-05-07T04:27:37.005+03:00,english,"Wednesday, July 11, 2018 McDonald's Disney Wor...",McDonald's Disney World Millennium Cups,"Wednesday, July 11, 2018 McDonald's Disney Wor...","[wednesday, july, mcdonald, disney, world, mil..."
4,2019-05-07T04:36:07.017+03:00,english,As Disney’s Hollywood Studios celebrated its 3...,Disney World Star Wars: Galaxy's Edge: Hollywo...,As Disney’s Hollywood Studios celebrated its 3...,"[disney, hollywood, studios, celebrated, 30th,..."
5,2019-05-07T04:38:02.018+03:00,english,Hercules and Hades Jack Skellington and Sally\...,Lego Collectibles – Disney Series 2 – Jack and...,Hercules and Hades Jack Skellington and Sally|...,"[hercules, hades, jack, skellington, sally, pr..."
6,2019-05-07T04:38:30.015+03:00,english,Jason @disneygeek.com 0 Comments Walt Disney W...,Walt Disney World Resort announces $1.5 millio...,Jason 0 Comments Walt Disney World Resort ann...,"[jason, comments, walt, disney, world, resort,..."
7,2019-05-07T04:40:54.003+03:00,english,,Disney needs balls,,[]
8,2019-05-07T04:44:26.009+03:00,english,Enter for your chance to win a trip for four t...,GMA’s Strahan and Sara Disney Trip Sweepstakes,Enter for your chance to win a trip for four t...,"[enter, chance, win, trip, four, aulani, disne..."
9,2019-05-07T04:44:29.014+03:00,english,Ratman Raving Rabbids TV Party $15\nDisney Inf...,"Nintendo Wii games. Mario, Harry, Disney Infin...",Ratman Raving Rabbids TV Party $15|Disney Infi...,"[ratman, raving, rabbids, tv, party, disney, i..."


In [15]:
r = Rake() # Uses stopwords for English from NLTK, and all punctuation characters.

def rake_implement(x, r):
    """Extract keyword phrases from one text with the given extractor.

    Parameters
    ----------
    x : str
        Text to process.
    r : object
        Extractor exposing ``extract_keywords_from_text`` and
        ``get_ranked_phrases`` (e.g. a ``Rake`` instance). The call
        updates the extractor's state with the keywords for ``x``.

    Returns
    -------
    Whatever ``r.get_ranked_phrases()`` returns after processing ``x``,
    i.e. the keyword phrases ranked highest to lowest.
    """
    # Feed the text to the extractor first; the ranking is read from it afterwards.
    r.extract_keywords_from_text(x)
    ranked_phrases = r.get_ranked_phrases()
    return ranked_phrases