In [None]:
from __future__ import division

import os
import re

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


In [None]:
# Reuse the SparkContext that is already running in this kernel (e.g. when the
# cell is re-run), otherwise start a new one with this configuration. A real
# start-up failure is raised here instead of being swallowed and resurfacing
# later as a NameError on `sc`.
sc = SparkContext.getOrCreate(
    conf=SparkConf()
    .setAppName("RiMBD_lab")
    .setMaster("local")
    .set("spark.cores.max", "4")
)

In [None]:
# getOrCreate() already hands back the running session when there is one, so
# no try/except is needed; a genuine start-up error is allowed to propagate
# instead of leaving `spark` undefined.
spark = (
    SparkSession.builder
    .master("local")
    .appName("RiMBD_lab")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)


In [None]:
# Show the SparkSession object as the cell output (sanity check that it exists).
spark

In [None]:
# Show the SparkContext object as the cell output (sanity check that it exists).
sc

In [None]:
# Run range() through Spark: the numbers 0..999999 are produced as a DataFrame
# (its column is referenced as `id` by the next transformation).

firstDataFrame = spark.range(1000000)
# Print the first 5 rows.
firstDataFrame.show(5)

In [None]:
# Example transformation:
# multiply the `id` column by 2 and expose the result as the column `value`.
secondDataFrame = firstDataFrame.selectExpr("(id * 2) as value")


In [None]:
# show() prints the first 5 rows as a text table.
secondDataFrame.show(n=5)
# take() returns the first 5 rows to the driver; being the last expression of
# the cell, that return value is what the notebook displays.
secondDataFrame.take(5)

In [None]:
# Absolute path that "DemoNotebook.ipynb" has relative to the current working
# directory. abspath() only resolves the name; it does not check that the file
# exists. (`os` is imported in the notebook's import cell.)
notebook_path = os.path.abspath("DemoNotebook.ipynb")
print(notebook_path)

Kilka akcji, które możemy wykonać na RDD (zwracają wynik):

In [None]:
# Create a new RDD from the text file bbc_news.txt (textFile yields one element per line).
bbc_news = sc.textFile("/home/jovyan/bbc_news.txt")

In [None]:
# Action: number of items (lines, blank ones included) in this RDD.
bbc_news.count()

In [None]:
# Action: first item (line) of this RDD.
bbc_news.first()

In [None]:
# Drop empty lines. The name `bbc_news` is rebound to the filtered RDD, so
# every later cell works on the non-empty lines only.
bbc_news = bbc_news.filter(lambda x: x != "")
bbc_news.first()

In [None]:
# Number of items in the RDD now that the empty lines have been filtered out.
bbc_news.count()

Oraz transformacje (zwracają wskaźnik do nowego RDD):

In [None]:
# Number of lines containing the substring 'black'. This is a per-line,
# case-sensitive substring test, not a count of word occurrences.
linesWithBlack = bbc_news.filter(lambda line: "black" in line)
# filter() is a transformation: printing the result shows the new RDD object,
# not its contents.
print('To jest nowe RDD: ')
print (linesWithBlack)
# count() is the action that actually evaluates the filter.
print('Lines with \'black\': '+ str(linesWithBlack.count()))

# Number of lines containing the substring 'police' (same test as above).
linesWithPolice = bbc_news.filter(lambda line: "police" in line)
print('Lines with \'police\': '+ str(linesWithPolice.count()))


Klasyczne zliczanie wystąpień słów:

In [None]:

# Classic word count: split every line on whitespace, emit (word, 1) for each
# word, then sum the ones per word.
wordCounts = (
    bbc_news
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# The 15 most frequent words (wordCounts.collect() would return every pair,
# unordered). takeOrdered() returns the smallest keys first, so the count is
# negated to get the largest counts. The pair is indexed rather than unpacked
# in the lambda signature because tuple parameters are Python 2-only syntax.
wordCounts.takeOrdered(15, key=lambda word_count: -word_count[1])

In [None]:
#Download stops words
!wget http://tacit.usc.edu/resources/stopwords_eng.txt

In [None]:
# Load the stop-word file as an RDD and preview its first 10 entries.
sw = sc.textFile('/home/jovyan/stopwords_eng.txt')
sw.take(10)

In [None]:
# Pull the stop words to the driver. The list keeps its original name; the
# frozenset copy is what the filter closes over, so each membership test is a
# hash lookup instead of a scan of the whole list.
sw_collected = sw.collect()
stop_words = frozenset(sw_collected)

# Keep only the (word, count) pairs whose word is not a stop word. The match is
# exact and case-sensitive, as before. Pairs are indexed instead of unpacked in
# the lambda signature because tuple parameters are Python 2-only syntax.
no_stop_words_wordCounts = wordCounts.filter(
    lambda word_count: word_count[0] not in stop_words
)

# The 15 most frequent remaining words (count negated for descending order).
no_stop_words_wordCounts.takeOrdered(15, key=lambda word_count: -word_count[1])


In [None]:
# For comparison: the 15 most frequent words with stop words still included.
# The pair is indexed instead of unpacked in the lambda signature because
# tuple parameters are Python 2-only syntax.
wordCounts.takeOrdered(15, key=lambda word_count: -word_count[1])

## Cache'owanie

In [None]:
# Mark the RDD for caching. cache() itself does not run a job; the data is
# stored the first time an action computes it.
bbc_news.cache()

In [None]:
# First action after cache(): computes the RDD and fills the cache.
bbc_news.count()

In [None]:
# Same action again: this run can be served from the cached data.
bbc_news.count()

In [None]:
# Data: one (country, city, number) tuple per row.
df_list = [('Polska', 'Warszawa', 1),
           ('Polska', 'Krakow', 2),
           ('Niemcy', 'Berlin', 1),
           ('Niemcy', 'Hamburg', 3),
           ('Czechy', 'Praga', 1),
           ('Rosja', 'Moskwa', 1),
           ('Francja', 'Beauvais', 7)]

# Build a DataFrame from the tuples, naming the columns kraj / miasto / liczba.
df = spark.createDataFrame(df_list,['kraj', 'miasto', 'liczba'])

# Print it.
df.show()

## SQL

In [None]:
# Register the DataFrame as the temporary SQL view "dane".
# createOrReplaceTempView() returns None, so its result is not bound to a name.
df.createOrReplaceTempView("dane")

# Query the view with SQL; the result is again a DataFrame.
sql_results = spark.sql("SELECT * FROM dane WHERE kraj='Polska'")

# Cell output: the DataFrame object itself; no rows are printed here.
sql_results

In [None]:
# Run the query and print the matching rows.
sql_results.show()

In [None]:
def parse_article(line):
    """Split one tab-separated "<article id><TAB><text>" line into words.

    Trailing whitespace is stripped from the line, the article id is
    discarded, non-word characters are trimmed from both ends of the text,
    and the text is split on whitespace together with any non-word
    characters adjacent to it.

    Args:
        line: One line of the articles file.

    Returns:
        list: The words of the article, or an empty list when the line has
        no tab separator.
    """
    stripped = line.rstrip()
    if not isinstance(stripped, str):
        # Python 2 `unicode` input: hand UTF-8 byte strings downstream, as
        # this function always did. On Python 3 the text stays `str`, which
        # avoids splitting bytes with a str separator (an uncaught TypeError).
        stripped = stripped.encode('utf-8')
    try:
        _article_id, text = stripped.split('\t', 1)
    except ValueError:
        # No tab in the line, so there is no article text to parse.
        return []
    # Raw strings keep "\W" and "\s" as regex escapes, not string escapes.
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    return re.split(r"\W*\s+\W*", text, flags=re.UNICODE)

In [None]:
# New RDD with Wikipedia articles: every line of the input is turned into a
# list of words by parse_article (16 is passed as textFile's second argument,
# the minimum number of partitions).

wiki = sc.textFile("/data/wiki/en_articles_part/articles-part", 16).map(parse_article)
# Word list of the first article; take(1) returns a one-element list.
result = wiki.take(1)[0]


In [None]:
# Print the first 50 words of the first parsed article, one per line.
# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python 2-only print statement.
for word in result[:50]:
    print(word)

In [None]:
def pairs_starting_from_word(words, first_word='word'):
    """Build (bigram, 1) tuples for every bigram that starts with first_word.

    Args:
        words: Sequence of words, in text order.
        first_word: Word the bigram has to start with.

    Returns:
        list: One ("<first_word> — <next word>", 1) tuple per occurrence of
        first_word that is followed by another word; an occurrence in the
        last position has no successor and yields nothing.
    """
    # Walk over neighbouring words; the trailing 1 is the initial count that
    # is summed per bigram later on.
    return [
        ('{} — {}'.format(current, following), 1)
        for current, following in zip(words, words[1:])
        if current == first_word
    ]



In [None]:
# Lowercase every word of every article so the bigram match below is
# case-insensitive.
wiki_lower = wiki.map(lambda words: [x.lower() for x in words])

In [None]:
# Collect the ("enemy — <next word>", 1) tuples of every article; flatMap
# flattens the per-article lists into one RDD of tuples.
wiki_pairs = wiki_lower.flatMap(lambda x: pairs_starting_from_word(x, 'enemy'))

In [None]:
# Filter out elements equal to the empty list.
# NOTE(review): the elements here are (pair, count) tuples, because flatMap has
# already flattened the per-article lists (an empty list contributes nothing),
# so no element can equal [] and this filter keeps everything.
wiki_pairs = wiki_pairs.filter(lambda x: x != [])


In [None]:
# Sum the counters per bigram; the result is spread over 16 partitions.
wiki_red = wiki_pairs.reduceByKey(lambda a, b: a + b, numPartitions=16)

In [None]:
# Sort the (bigram, count) pairs by key, i.e. by the bigram string.
# NOTE(review): the following cell re-orders by count anyway; presumably this
# sort only influences how bigrams with equal counts are ordered — confirm.
wiki_red_sorted = wiki_red.sortByKey()

In [None]:
# The 25 most frequent bigrams. takeOrdered() returns the smallest keys first,
# so the count is negated; the pair is indexed instead of unpacked in the
# lambda signature because tuple parameters are Python 2-only syntax.
result = wiki_red_sorted.takeOrdered(25, key=lambda pair_count: -pair_count[1])
# One "<bigram>TAB<count>" line per bigram. print(...) with a single argument
# behaves the same on Python 2 and 3, unlike the print statement.
for pair, cnt in result:
    print('{}\t{}'.format(pair, cnt))