In [38]:
import json
# Load the news records. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's default text encoding.
with open('fox.json', encoding='utf-8') as file:
    data = json.load(file)

Tokenization methods: a simple whitespace split, NLTK, and spaCy.
The pipeline currently uses the simple split.

In [19]:
# tokenization methods
# 1. split
# Use this one. 2 and 3 are older, update needed.

import string
def tokenize_split(article):
    """Split ``article`` on whitespace into lowercase, punctuation-free tokens.

    Every whitespace-separated word yields exactly one token: characters found
    in ``string.punctuation`` are removed and the remainder is lowercased. A
    word made up solely of such characters therefore becomes ``''``.
    """
    punctuation = frozenset(string.punctuation)
    tokens = []
    for word in article.split():
        stripped = ''.join(ch for ch in word if ch not in punctuation)
        tokens.append(stripped.lower())
    return tokens
    
# 2. nltk
import nltk
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
def tokenize_nltk(article):
    """Tokenize ``article`` with NLTK's ``word_tokenize`` and strip punctuation.

    Args:
        article: A single string, or an iterable of strings (lines). A bare
            string is treated as one line; iterating over it directly would
            hand ``word_tokenize`` one character at a time.

    Returns:
        list: Tokens in order of appearance, with every character found in
        ``string.punctuation`` removed. A token made up solely of such
        characters becomes ``''``.
    """
    # ``punct`` was previously referenced without being defined in this
    # function, which raised NameError on the first call.
    punct = set(string.punctuation)
    lines = [article] if isinstance(article, str) else article

    collection = []
    for line in lines:
        collection.extend(word_tokenize(line))
    return [''.join(c for c in s if c not in punct) for s in collection]

# 3. spacy

import spacy
def tokenize_spacy(article):
    """Tokenize ``article`` with the spaCy ``en_core_web_sm`` pipeline.

    Args:
        article: A single string, or an iterable of strings (lines). A bare
            string is treated as one line; iterating over it directly would
            run the pipeline on one character at a time.

    Returns:
        list: The ``text`` of every spaCy token, in order of appearance.
    """
    # NOTE: the pipeline is loaded on every call, exactly as before.
    nlp = spacy.load("en_core_web_sm")
    lines = [article] if isinstance(article, str) else article
    return [token.text for line in lines for token in nlp(line)]


[nltk_data] Downloading package punkt to /Users/yhkuo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Stopwords removal.

In [20]:
# Stopwords removal.
nltk.download('stopwords')
from nltk.corpus import stopwords

def removeStopWords(token):
    """Return the entries of ``token`` that are not NLTK English stopwords.

    The order of the remaining entries is preserved.
    """
    english = set(stopwords.words('english'))
    kept = []
    for word in token:
        if word in english:
            continue
        kept.append(word)
    return kept

[nltk_data] Downloading package stopwords to /Users/yhkuo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Lemmatization
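To preserve linguistic meaning, use lemmatization instead of stemming: the lemmas are what gets stored in the output, while the stems are used only for scoring sentences.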

In [4]:
# Lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize(concisedToken):
    """Lemmatize every token in ``concisedToken`` with NLTK's WordNetLemmatizer.

    Args:
        concisedToken: Iterable of token strings.

    Returns:
        list: One lemma per input token, in the same order. ``lemmatize`` is
        called with its default arguments.
    """
    # Build the lemmatizer once for the whole list instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(c) for c in concisedToken]


[nltk_data] Downloading package wordnet to /Users/yhkuo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Stemming
from nltk.stem import PorterStemmer

def stemming(concisedToken):
    """Stem every token in ``concisedToken`` with NLTK's PorterStemmer.

    Args:
        concisedToken: Iterable of token strings.

    Returns:
        list: One stem per input token, in the same order.
    """
    # Build the stemmer once for the whole list instead of once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(c) for c in concisedToken]

In [6]:
def preprocess(article):
    """Tokenize ``article``, drop stopwords, and return ``(stems, lemmas)``.

    Both lists are derived from the same stopword-filtered tokens, and empty
    strings are removed from each of them.
    """
    filtered = removeStopWords(tokenize_split(article))

    stemmed = stemming(filtered)
    stems = [s for s in stemmed if s != '']

    lemmatized = lemmatize(filtered)
    lemmas = [l for l in lemmatized if l != '']

    return stems, lemmas

In [31]:
headline = data[30]['headline']
print(headline)

# Keep the text and its token lists in lockstep. generateSummary reads
# content[i] for the score computed from contentStems[i], so a line whose
# tokens all vanish during preprocessing has to be dropped from all three
# lists, not only from the token lists. A new list is built so the loaded
# record in ``data`` is left untouched.
content = []
contentLemma = []
contentStems = []
for line in data[30]['content']:
    stems, lemmas = preprocess(line)
    if stems and lemmas:
        content.append(line)
        contentStems.append(stems)
        contentLemma.append(lemmas)
headStems, headLemma = preprocess(headline)

First Israeli from Diamond Princess cruise ship tests positive for coronavirus


Sentence score

In [8]:
def createFrequencyMatrix(contentStems):
    """Count word occurrences separately for each sentence.

    Returns a dict keyed by the sentence's 0-based position in
    ``contentStems``; each value maps a word of that sentence to the number
    of times it occurs there.
    """
    frequency_matrix = {}
    for position, sentence in enumerate(contentStems):
        counts = {}
        for stem in sentence:
            counts[stem] = counts.get(stem, 0) + 1
        frequency_matrix[position] = counts
    return frequency_matrix


In [9]:
def createTFMatrix(frequency_matrix):
    """Turn per-sentence word counts into term frequencies.

    Args:
        frequency_matrix: dict mapping a sentence key to a ``{word: count}``
            table.

    Returns:
        dict: Same keys as ``frequency_matrix``; each value maps a word to
        ``count / total``, where ``total`` is the number of words in that
        sentence. An empty table maps to an empty table.
    """
    tf_matrix = {}

    for i, table in frequency_matrix.items():
        # Total occurrences in the sentence. len(table) would only count the
        # distinct words and inflate the frequency of repeated words.
        totalCount = sum(table.values())
        tf_matrix[i] = {word: count / totalCount for word, count in table.items()}

    return tf_matrix

In [10]:
import math
def createPerWordTable(frequency_matrix):
    """Map every word to the number of sentence tables that contain it.

    The per-sentence counts themselves are ignored; a word adds one for each
    table of ``frequency_matrix`` in which it appears as a key.
    """
    word_table = {}
    for table in frequency_matrix.values():
        for word in table:
            word_table[word] = word_table.get(word, 0) + 1
    return word_table

def createIDFMatrix(frequency_matrix, totalDoc):
    """Compute an inverse document frequency for every word of every sentence.

    For each word the value is ``log10(totalDoc / n)``, where ``n`` is the
    number of sentence tables containing the word as reported by
    ``createPerWordTable``. The result has the same keys and the same
    per-sentence words as ``frequency_matrix``.
    """
    sentences_with_word = createPerWordTable(frequency_matrix)

    idf_matrix = {}
    for index, table in frequency_matrix.items():
        idf_matrix[index] = {
            term: math.log10(totalDoc / float(sentences_with_word[term]))
            for term in table
        }
    return idf_matrix
    

In [11]:
def createTF_IDFMatrix(tf_matrix, idf_matrix):
    """Multiply each word's term frequency by its inverse document frequency.

    Args:
        tf_matrix: dict mapping a sentence key to a ``{word: tf}`` table.
        idf_matrix: dict mapping a sentence key to a ``{word: idf}`` table.

    Returns:
        dict: For every sentence key present in both inputs, a table mapping
        each word present in both tables to ``float(tf * idf)``.
    """
    tf_idf_matrix = {}
    for i, tf_table in tf_matrix.items():
        # Match sentences and words by key. Pairing the two dicts by position
        # with zip() is only right when both happen to iterate in the same
        # order, and silently multiplies unrelated values otherwise.
        idf_table = idf_matrix.get(i)
        if idf_table is None:
            continue
        tf_idf_matrix[i] = {
            word: float(tf * idf_table[word])
            for word, tf in tf_table.items()
            if word in idf_table
        }

    return tf_idf_matrix

def scoring(tf_matrix, idf_matrix):
    """Score each sentence as the mean TF-IDF value of its words.

    Args:
        tf_matrix: dict mapping a sentence key to a ``{word: tf}`` table.
        idf_matrix: dict mapping a sentence key to a ``{word: idf}`` table.

    Returns:
        dict: Sentence key -> sum of the sentence's TF-IDF values divided by
        the number of words in its table. Sentences whose table is empty get
        no entry.
    """
    tf_idf_matrix = createTF_IDFMatrix(tf_matrix, idf_matrix)

    sentenceValue = {}
    for i, table in tf_idf_matrix.items():
        if not table:
            # Nothing to average: leave the sentence unscored instead of
            # dividing by zero.
            continue
        sentenceValue[i] = sum(table.values()) / len(table)

    return sentenceValue

In [12]:
def findThreshold(sentenceValue):
    """Return the arithmetic mean of the sentence scores.

    Args:
        sentenceValue: dict mapping a sentence key to its numeric score.

    Returns:
        float: The average of all values, or ``0.0`` when ``sentenceValue``
        is empty (previously a ZeroDivisionError).
    """
    if not sentenceValue:
        return 0.0

    return sum(sentenceValue.values()) / len(sentenceValue)

In [13]:
def generateSummary(content, contentStems, thresholdFactor=1.5):
    """Build an extractive summary from the highest-scoring entries of ``content``.

    Args:
        content: List of text entries; ``content[i]`` must be the text that
            ``contentStems[i]`` was derived from.
        contentStems: List of token lists, one per entry of ``content``.
        thresholdFactor: An entry is kept when its score is at least
            ``thresholdFactor`` times the average score. Defaults to 1.5,
            the value that used to be hard-coded.

    Returns:
        str: The selected entries in their original order, each preceded by a
        single space; ``''`` when nothing could be scored.
    """
    f_matrix = createFrequencyMatrix(contentStems)
    tf_matrix = createTFMatrix(f_matrix)
    idf_matrix = createIDFMatrix(f_matrix, len(contentStems))
    scores = scoring(tf_matrix, idf_matrix)
    if not scores:
        # No scored sentences (e.g. an article without content): there is no
        # average to compare against, so the summary is empty.
        return ''

    cutoff = thresholdFactor * findThreshold(scores)
    return ''.join(
        ' ' + content[i]
        for i in range(len(content))
        if i in scores and scores[i] >= cutoff
    )

In [32]:
# Show the headline, then the summary generated from the stemmed content.
print(headline)
print(generateSummary(content, contentStems))

First Israeli from Diamond Princess cruise ship tests positive for coronavirus
 Officials stressed that the patient did not contract the virus in Israel.


In [33]:
# Print the list of content entries that was passed to generateSummary.
print(content)

['Israel’s Health Ministry confirmed Friday the first case of an Israeli citizen having contracted covid-19 while aboard the Diamond Princess cruise ship, docked in a port in Japan. The female patient is under supervision and in isolation, the ministry said, according to Israeli media.', 'Officials stressed that the patient did not contract the virus in Israel.', 'Eleven Israeli citizens were among the more than 3,000 passengers and crew quarantined on the cruise liner after a coronavirus outbreak on board. In total, 634 of the ship’s occupants have tested positive for the virus and two have died of covid-19, according to Japanese health authorities.', 'The 11 Israelis were flown out of Japan and sent directly Friday into isolation at Sheba Tel Hashomer Hospital, where they will remain for a quarantine period.', 'In an effort to prevent the entry of the virus into Israel, Israel’s government on Monday announced a temporary travel ban on all foreign nationals who in the past 14 days had

In [16]:
###sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

def sentimentAnalysis(headline):
    """Return the polarity scores NLTK's SentimentIntensityAnalyzer gives ``headline``.

    A new analyzer instance is created on every call.
    """
    analyzer = SIA()
    polarity = analyzer.polarity_scores(headline)
    return polarity


In [39]:
# main
from collections import OrderedDict

# Process every news record, group the results by date, and write them out.
processed_data = {}
for news in data:
    headline = news['headline']

    # Collect the lines that survive preprocessing into new, index-aligned
    # lists. Calling content.remove(line) while iterating over content skips
    # the element after each removal, can delete an earlier duplicate instead
    # of the current line, and edits news['content'] in place.
    content = []
    contentLemma = []
    contentStems = []
    for line in news['content']:
        stems, lemmas = preprocess(line)
        if stems and lemmas:
            content.append(line)
            contentStems.append(stems)
            contentLemma.append(lemmas)

    headStems, headLemma = preprocess(headline)
    summary = generateSummary(content, contentStems)
    sentiment = sentimentAnalysis(headline)

    # 'content' stores the record's content list exactly as loaded; only the
    # summary and the lemmas are based on the filtered lines.
    processed_news = {
        'headline': headline,
        'headline-sentiment': sentiment,
        'headline-lemmas': headLemma,
        'content': news['content'],
        'content-summary': summary,
        'content-lemmas': contentLemma,
        'journal': news['journal'],
        'url': news['url'],
    }

    # The first 10 characters of the time stamp are used as the grouping key.
    date = news['time-stamp'][:10]
    processed_data.setdefault(date, []).append(processed_news)

# Emit the groups in ascending key order.
result = {date: processed_data[date] for date in sorted(processed_data)}
with open('processed_fox.json', 'w') as file:
    json.dump(result, file)