# Spark Streaming for New York Times articles

In [1]:
import os

# Ask spark-submit to fetch the Kafka 0.8 streaming connector when PySpark
# launches its JVM. This has to be set before the SparkContext is created.
# NOTE(review): the connector coordinates (Scala 2.11, Spark 2.0.2) presumably
# need to match the installed Spark build -- confirm against the cluster.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 '
    'pyspark-shell'
)

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [3]:
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The configuration (application name) only takes effect when a new context
# is actually created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("SparkStreamingKafka"))
# Keep the notebook output readable: only warnings and errors from Spark.
sc.setLogLevel("WARN")

In [12]:
# Micro-batches are cut every 20 seconds.
ssc = StreamingContext(sc, batchDuration=20)

# Receiver-based Kafka stream; each record is delivered as a (key, value) pair.
# The topics mapping gives, per topic, the number of partitions consumed by the
# receiver (here: 1 for 'nyt_articles').
# NOTE(review): createStream's second argument is the ZooKeeper quorum, not a
# Kafka broker list. Port 9093 is unusual for ZooKeeper -- confirm that this
# address really is the ZooKeeper endpoint.
kafkaStream = KafkaUtils.createStream(
    ssc,
    zkQuorum='ece1.adaltas.com:9093',
    groupId='spark-streaming',
    topics={'nyt_articles': 1},
)

## TF/IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize

# The vectorizer is shipped to the executors inside the closure below; each
# article is fitted on a deserialized copy, so this driver-side object is never
# fitted itself.
vect = TfidfVectorizer(lowercase=True, stop_words='english')

# Split every article body into sentences while keeping its Kafka key:
# (key, article_text) -> (key, [sentence, ...]).
# This must be a per-record map: flatMap would flatten the (key, sentences)
# tuple into two separate stream elements and break the next stage.
# NOTE(review): sent_tokenize presumably needs the NLTK 'punkt' data on every
# executor -- confirm it is installed there.
sentences = (
    kafkaStream
    .mapValues(sent_tokenize)
    # fit_transform raises ValueError on an empty document list, so drop
    # articles that produced no sentence at all.
    .filter(lambda keyed_sentences: len(keyed_sentences[1]) > 0)
)

# One TF-IDF matrix per article (one row per sentence), fitted on that
# article's sentences only: (key, [sentence, ...]) -> (key, sparse matrix).
tfidf_mat = sentences.mapValues(
    lambda article_sentences: vect.fit_transform(article_sentences)
)

## Cosine Similarity to reduce text

In [14]:
from sklearn.metrics.pairwise import cosine_distances

# Driver-side accumulator of (article_key, [kept TF-IDF rows]) pairs.
# It is filled batch after batch once the streaming context is running and is
# never truncated here.
all_articles_summarized = []


def summarize_article(tfidf_rows, max_distance=0.6):
    """Select the sentence vectors of one article that are kept in its summary.

    Every sentence except the last one is compared with the sentence that
    follows it; it is kept when the cosine distance between the two TF-IDF
    rows is strictly below ``max_distance``.

    NOTE(review): the original comment spoke of a *similarity* lower than 0.6
    while the code tested the cosine *distance* (1 - similarity) against 0.6.
    The code's test is preserved here -- confirm which one was intended.

    Args:
        tfidf_rows: 2-D TF-IDF matrix of the article, one row per sentence.
        max_distance: exclusive upper bound on the distance to the next
            sentence for a sentence to be kept.

    Returns:
        List of the kept rows (each a 1 x n_terms slice of ``tfidf_rows``).
        Empty when the article has fewer than two sentences.
    """
    kept_rows = []
    # Stop one row before the end: row i is always compared with row i + 1.
    for i in range(tfidf_rows.shape[0] - 1):
        current_row = tfidf_rows[i]
        next_row = tfidf_rows[i + 1]
        # cosine_distances returns a 1 x 1 matrix for two single-row inputs.
        if cosine_distances(current_row, next_row)[0, 0] < max_distance:
            kept_rows.append(current_row)
    return kept_rows


def collect_summaries(rdd):
    """Append the summaries of one micro-batch to ``all_articles_summarized``.

    Used as the ``foreachRDD`` callback; ``collect()`` brings the batch's
    (key, kept_rows) pairs back to the driver before they are stored.
    """
    all_articles_summarized.extend(rdd.collect())


# A DStream cannot be iterated with a plain ``for`` loop: the per-article work
# is declared as a transformation and the results are gathered per batch.
# Nothing is computed until the streaming context is started (ssc.start()),
# which is not part of the cells shown here.
tfidf_mat.mapValues(summarize_article).foreachRDD(collect_summaries)