In [1]:
from pyspark import SparkConf, SparkContext
import collections

conf = SparkConf().setMaster('local').setAppName('RatingsHistogram')
sc = SparkContext(conf = conf)

lines = sc.textFile('u.data')

In [2]:
lines.take(3)

['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']

In [3]:
ratings = lines.map(lambda x: x.split()[2])

In [4]:
ratings.take(6)

['3', '3', '1', '2', '1', '4']

In [5]:
results = ratings.countByValue()

In [6]:
results

defaultdict(int, {'1': 6110, '2': 11370, '3': 27145, '4': 34174, '5': 21201})

In [7]:
sortedResults = collections.OrderedDict(sorted(results.items()))
for key, value in sortedResults.items():
    print('%s %i' % (key, value))

1 6110
2 11370
3 27145
4 34174
5 21201


In [8]:
sc.stop()

In [9]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('FriendsByAge')
sc = SparkContext(conf = conf)

In [10]:
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    
    return (age, numFriends)

lines = sc.textFile('fakefriends.csv')

In [11]:
lines.take(3)

['0,Will,33,385', '1,Jean-Luc,26,2', '2,Hugh,55,221']

In [12]:
rdd = lines.map(parseLine)
rdd.take(3)

[(33, 385), (26, 2), (55, 221)]

In [13]:
totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
totalsByAge.take(3)

[(33, (3904, 12)), (26, (4115, 17)), (55, (3842, 13))]

In [14]:
averagesByAge = totalsByAge.mapValues(lambda x: (x[0] / x[1]))
averagesByAge.take(3)

[(33, 325.3333333333333), (26, 242.05882352941177), (55, 295.53846153846155)]

In [15]:
results = averagesByAge.collect()
for result in results:
    print(result)

(33, 325.3333333333333)
(26, 242.05882352941177)
(55, 295.53846153846155)
(40, 250.8235294117647)
(68, 269.6)
(59, 220.0)
(37, 249.33333333333334)
(54, 278.0769230769231)
(38, 193.53333333333333)
(27, 228.125)
(53, 222.85714285714286)
(57, 258.8333333333333)
(56, 306.6666666666667)
(43, 230.57142857142858)
(36, 246.6)
(22, 206.42857142857142)
(35, 211.625)
(45, 309.53846153846155)
(60, 202.71428571428572)
(67, 214.625)
(19, 213.27272727272728)
(30, 235.8181818181818)
(51, 302.14285714285717)
(25, 197.45454545454547)
(21, 350.875)
(42, 303.5)
(49, 184.66666666666666)
(48, 281.4)
(50, 254.6)
(39, 169.28571428571428)
(32, 207.9090909090909)
(58, 116.54545454545455)
(64, 281.3333333333333)
(31, 267.25)
(52, 340.6363636363636)
(24, 233.8)
(20, 165.0)
(62, 220.76923076923077)
(41, 268.55555555555554)
(44, 282.1666666666667)
(69, 235.2)
(65, 298.2)
(61, 256.22222222222223)
(28, 209.1)
(66, 276.44444444444446)
(46, 223.69230769230768)
(29, 215.91666666666666)
(18, 343.375)
(47, 233.22222222222

In [16]:
sc.stop()

In [17]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('minTemperatures')
sc = SparkContext(conf = conf)

def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0/5.0) + 32.0
    return (stationID, entryType, temperature)

lines = sc.textFile('1800.csv')
lines.take(3)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,']

In [18]:
parsedLines = lines.map(parseLine)
parsedLines.take(3)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0)]

In [19]:
minTemps = parsedLines.filter(lambda x: 'TMIN' in x[1])
minTemps.take(3)

[('ITE00100554', 'TMIN', 5.359999999999999),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMIN', 9.5)]

In [20]:
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
stationTemps.take(3)

[('ITE00100554', 5.359999999999999),
 ('EZE00100082', 7.699999999999999),
 ('ITE00100554', 9.5)]

In [21]:
stationTemps.count()

730

In [22]:
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))
minTemps.collect()

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999)]

In [23]:
results = minTemps.collect()
for result in results: 
    print(result[0] + '\t{:.2f}F'.format(result[1]))

ITE00100554	5.36F
EZE00100082	7.70F


In [24]:
sc.stop()

In [25]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('maxTemperatures')
sc = SparkContext(conf = conf)

In [26]:
def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0/5.0) + 32
    
    return (stationID, entryType, temperature)
lines = sc.textFile('1800.csv')
lines.take(3)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,']

In [27]:
parsedLines = lines.map(parseLine)
parsedLines.take(3)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0)]

In [28]:
maxTemps = parsedLines.filter(lambda x: 'TMAX' in x[1])
maxTemps.take(3)

[('ITE00100554', 'TMAX', 18.5),
 ('EZE00100082', 'TMAX', 16.52),
 ('ITE00100554', 'TMAX', 21.2)]

In [29]:
stationTemps = maxTemps.map(lambda x: (x[0], x[2]))
stationTemps.take(3)

[('ITE00100554', 18.5), ('EZE00100082', 16.52), ('ITE00100554', 21.2)]

In [30]:
maxTemps = stationTemps.reduceByKey(lambda x, y: max(x, y))
maxTemps.take(3)

[('ITE00100554', 90.14000000000001), ('EZE00100082', 90.14000000000001)]

In [31]:
results = maxTemps.collect()
for result in results:
    print(result[0] + '\t{:.2f}F'.format(result[1]))

ITE00100554	90.14F
EZE00100082	90.14F


In [32]:
sc.stop()

In [6]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('RedFox')
sc = SparkContext(conf = conf)

lines = sc.textFile('redfox.txt')
lines.collect()

['The quick red fox jumped over the lazy brown dogs']

In [8]:
rageCaps = lines.map(lambda x: x.upper())
rageCaps.collect()

['THE QUICK RED FOX JUMPED OVER THE LAZY BROWN DOGS']

In [9]:
words = lines.flatMap(lambda x: x.split())
words.collect()

['The',
 'quick',
 'red',
 'fox',
 'jumped',
 'over',
 'the',
 'lazy',
 'brown',
 'dogs']

In [10]:
sc.stop()

In [11]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('WordCount')
sc = SparkContext(conf = conf)

input = sc.textFile('book.txt')
words = input.flatMap(lambda x: x.split())
wordCounts = words.countByValue()

In [15]:
for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(word, count)

Self-Employment: 1
Building 5
an 172
Internet 13
Business 19
of 941
One 12
Achieving 1
Financial 3
and 901
Personal 3
Freedom 7
through 55
a 1148
Lifestyle 5
Technology 2
By 9
Frank 10
Kane 7
Copyright 1
2015 3
Kane. 1
All 13
rights 3
reserved 2
worldwide. 2
CONTENTS 1
Disclaimer 1
Preface 1
Part 2
I: 2
Making 5
the 1176
Big 1
Decision 1
Overcoming 1
Inertia 1
Fear 1
Failure 1
Career 1
Indoctrination 2
The 88
Carrot 1
on 399
Stick 2
Ego 1
Protection 1
Your 62
Employer 2
as 297
Security 2
Blanket 1
Why 3
it�s 28
Worth 1
it 311
Unlimited 2
Growth 4
Potential 1
Investing 3
in 552
Yourself, 1
Not 7
Someone 2
Else 1
No 14
Dependencies 1
Commute 1
to 1789
Live 3
Where 2
You 144
Want 5
Work 4
When 31
How 29
Is 17
Self-Employment 1
for 500
You? 1
Flowchart: 1
Should 3
I 322
Even 35
Consider 5
Self-Employment? 2
Having 2
Safety 2
Net 2
Planning 3
Health 2
Care 2
Self-Assessment 1
Quiz 1
PART 5
II: 2
Happen 1
Designing 1
Fallacy 1
Introducing 1
Ideal 1
Case 1
Study: 1
Sundog 20
Software 12
Other

gas 1
parking? 1
tolls? 1
Maintenance 1
Bus 1
fares? 1
Really� 1
add 7
amazed 3
spent, 1
earnings 1
Eliminating 1
(You'll 1
less, 1
drain 3
realize, 2
health.) 1
FREEDOM 3
TO 6
LIVE 2
WHERE 1
YOU 4
WANT 3
are? 1
Chances 1
near 5
office. 5
Unfortunately, 2
high-tech, 1
high-paying 1
tend 8
concentrated 2
places 3
exist 2
San 1
Jose, 1
California 1
area5, 1
49.3% 1
higher 8
average6. 1
six-figure 1
Silicon 6
Valley, 3
engineer 3
Austin, 1
Texas 1
discretionary 1
do7 1
mention 7
commute. 3
Much 1
currently 3
housing 2
wasted, 1
somewhere 1
nicer 1
depends 3
days 7
conducted 1
online. 6
fact, 5
desirable 1
model 1
opens 1
marketplace, 2
limiting 2
area. 3
wherever 3
want, 4
perhaps 9
right 20
kind 5
business. 40
changes 2
kick 1
in. 10
move 12
tropics, 1
living? 1
big, 2
expensive 8
metropolitan 1
area 10
to. 8
save, 2
standard 3
enjoy, 2
moving 3
houses 1
half 5
shouldn't 2
for-sale 1
sign 4
front 7
lawn 1
yet 6
place 12
employment 10
opportunities. 1
employers 3
backup 5
plans 4
doesn't 

bit. 1
Also, 4
vigilant 1
unsolved 1
problems 12
encounter 2
searched 4
unavailable 1
find? 1
waste 5
you'd 7
back? 2
opportunities 7
fill 5
eventually 3
excited 4
KEY 1
POINTS 1
EVALUATING 1
IDEA 1
reasons: 1
one, 3
evaluates 1
eliminate 3
worst 2
Two, 1
validates 2
help. 5
easier 8
self-fund, 1
keeps 4
low, 1
frees 1
agreement, 2
won�t 3
violate 1
well? 1
manufactured 1
developed, 1
PR, 3
further. 1
wish. 1
worse, 1
billed 1
hourly 3
Products 1
information 17
limit, 1
fixed 3
quantity. 1
niche? 1
observation 1
realistic 9
3D 2
clouds 2
produce 4
simulators 3
games. 2
Yet, 1
deep 1
pockets 1
behind 3
solutions 2
solve, 1
yes, 1
bills. 2
Estimating 1
sizes 2
Websites 2
Alexa, 1
keyword 3
tool 4
AdWords, 6
useful 8
measuring 6
trends 10
overall 1
topic. 2
key 4
estimate 13
bottom-up, 1
top-down. 2
X% 1
billion 2
grossly 2
overestimate 2
niches, 1
shell 2
hard-earned 1
competitors 4
niche. 2
differentiate 1
itself? 1
consume 1
initially, 2
term? 1
yours, 4
projections. 1
field 2
unto 1
i

live. 1
"come 1
out," 1
kept 2
private. 1
silly 1
placing 1
pizza 1
delivery 1
addresses 6
targets, 2
SWAT 1
alleged 1
identity 1
PERSONAL 1
RISK 1
MITIGATION 1
So 2
encouraging? 1
Congratulations! 1
Seriously, 1
night 1
proud 1
notice 1
place. 7
tolerance 1
entrepreneur. 1
bust, 1
benefits12 1
crucial. 2
section, 3
written 3
picture 1
expected 4
expenses, 2
emergency 2
hand, 1
taken 6
months' 1
properly 3
factoring 1
incur. 1
update 3
comfortable 5
tax. 1
amount. 1
being. 1
strategies 1
(such 2
filing 1
S-corp 1
individual) 1
analyze 2
disability 2
subsidized 1
Commercial 1
carry 2
general 6
rent. 1
practical 2
reason, 3
region, 1
around. 1
Dental 2
Plans 1
discount 1
participating 1
dentists 1
option, 1
cleanings 2
dental 3
pocket. 1
braces, 1
longer. 1
X-Rays, 1
coverage! 1
surprises 1
cropping 1
plans. 2
pair 1
glasses 1
lenses, 1
eye 3
exam, 1
Ongoing 1
amount, 1
abstract 1
"Oh, 1
whole 5
healthy, 1
much." 1
budgeting 1
health-related 1
medication? 1
periodic 2
doctor 1
visits 3
p

55 1
characters 1
short. 1
Webmaster 2
Tools 2
crawls 1
essentially 2
invisible 1
SEO. 2
for; 1
"adaptive" 1
lays 1
screens. 1
Load 1
smartphone� 1
says, 1
Site 1
speed 1
caching 1
careers 2
SEO, 1
moz.com 1
(http://moz.com/learn/seo) 1
resource 3
LANDING 1
PAGE 1
discovers 1
page. 2
great, 1
yet� 1
visitor 1
customer! 1
away, 1
pages, 1
simulating 1
sky 1
clouds: 1
Notice 1
text� 1
points 2
convey 1
orange 1
button 3
inviting 1
do� 1
eliminating 1
menu 1
"site 1
navigation" 1
action. 1
offerings. 1
down, 1
fold, 1
dominates 1
auto-plays 1
loaded. 1
customer-focused; 2
clearly 2
concisely 1
communicated. 1
shelf, 1
lead 4
information; 1
requesting 1
newsletter 1
list. 2
wary 1
spam 1
I'd 1
scared 1
away� 1
optional. 1
submit 1
undeliverable 1
addresses, 1
harm. 1
Relevance 1
LPO. 1
kind. 1
specialized 1
focuses 1
interest. 2
words 1
ad� 1
twofold. 1
slightly 1
discipline 2
Moz.com 1
IMPORTANCE 1
EMAIL 1
CAMPAIGNS 1
reaching 3
customers� 2
emails� 1
spam-filtered. 1
beginners, 1
party 1

In [16]:
sc.stop()

In [20]:
import re
from pyspark import SparkConf, SparkContext

def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())

conf = SparkConf().setMaster('local').setAppName('WordCount')
sc = SparkContext(conf = conf)
input = sc.textFile('book.txt')
words = input.flatMap(normalizeWords)
wordCounts = words.countByValue()
for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if cleanWord:
        print(word, count)    

self 111
employment 75
building 33
an 178
internet 26
business 383
of 970
one 100
achieving 1
financial 17
and 934
personal 48
freedom 41
through 57
a 1191
lifestyle 44
technology 11
by 122
frank 11
kane 10
copyright 3
2015 4
all 137
rights 3
reserved 2
worldwide 4
contents 1
disclaimer 2
preface 2
part 33
i 387
making 25
the 1292
big 42
decision 12
overcoming 2
inertia 2
fear 3
failure 3
career 31
indoctrination 5
carrot 4
on 428
stick 6
ego 3
protection 7
your 1420
employer 44
as 343
security 8
blanket 2
why 25
it 649
s 391
worth 39
unlimited 6
growth 39
potential 38
investing 16
in 616
yourself 78
not 203
someone 62
else 33
no 76
dependencies 6
commute 14
to 1828
live 25
where 53
you 1878
want 122
work 144
when 102
how 163
is 560
for 537
flowchart 4
should 69
even 104
consider 26
having 30
safety 7
net 13
planning 16
health 35
care 24
assessment 4
quiz 4
ii 2
happen 13
designing 4
fallacy 2
introducing 3
ideal 3
case 26
study 4
sundog 24
software 60
other 78
ideas 27
key 6
points 5


maintain 10
wardrobe 1
regardless 5
helps 4
frame 1
mind 17
forms 6
cannot 9
environment 1
unbounded 1
serious 1
assess 1
diving 2
undertaking 1
likelihood 2
guarantees 1
exercises 1
chapter 1
stand 2
unimaginably 1
badly 1
favor 4
conventional 3
savings 9
ensuring 3
cushion 1
expenses 38
truly 3
betting 1
farm 5
identify 8
store 2
crazy 2
device 1
display 18
main 6
assuming 1
initial 27
legwork 1
month 18
decrease 3
estimate 14
particular 2
experience 12
burning 1
monthly 10
bust 2
prove 2
testing 3
factor 5
supportive 2
during 11
talking 10
withdraw 1
penalties 1
mistake 7
adding 3
basic 6
cell 1
phone 9
bill 4
etc 6
tighten 1
belt 1
bank 8
statements 3
multiply 1
account 30
looming 1
ahead 6
payments 2
future 15
bonus 2
refund 1
fund 7
endeavor 5
contract 6
spins 1
wouldn 5
contracts 4
bad 13
trouble 4
collecting 4
invoices 2
timely 3
manner 6
billable 2
saddled 1
depleted 1
uncle 2
bob 2
bet 7
god 1
sake 3
max 1
cards 2
remember 23
succeeds 3
interest 13
avoided 1
severance 1
packa

incubation 3
program 5
commission 3
fostering 1
cities 3
programs 2
university 3
central 3
consultations 1
designers 1
officials 1
plunge 1
locale 1
angel 1
proceeding 1
accepting 1
investments 1
regularly 1
express 1
forum 1
http 9
openforum 1
contradicts 1
distraction 1
listening 1
gospel 2
weigh 1
similarity 1
distance 1
horror 2
handing 1
ownership 7
promptly 1
disappears 2
signed 4
shortage 1
bizarre 1
drafted 1
explicit 2
retain 4
uncommon 1
leaving 7
tender 1
gathered 1
expanding 1
copyrights 2
filed 1
moment 3
burn 3
deductibles 1
opened 1
selected 1
explored 1
date 1
checkups 1
exams 2
refilled 1
purchases 1
finalized 1
via 1
collected 1
belongings 4
brought 1
nature 2
delete 1
ducks 1
row 1
lined 1
bridges 1
scenario 1
former 3
welcomes 1
arms 2
grown 2
gone 2
appreciates 1
restate 1
separation 1
experimenting 2
burned 1
bridge 1
managers 1
appeal 2
feelings 2
upbeat 1
dangle 1
incentives 1
retention 1
bonuses 2
raises 2
wondering 1
dreaming 1
scare 1
throwing 2
genuinely 3
a

In [21]:
sc.stop()

In [22]:
from pyspark import SparkConf, SparkContext
import re

def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())

conf = SparkConf().setMaster('local').setAppName('WordCount')
sc = SparkContext(conf = conf)
input = sc.textFile('book.txt')
words = input.flatMap(normalizeWords)

In [23]:
words.take(10)

['self',
 'employment',
 'building',
 'an',
 'internet',
 'business',
 'of',
 'one',
 'achieving',
 'financial']

In [24]:
wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y)
wordCounts.take(10)

[('self', 111),
 ('employment', 75),
 ('building', 33),
 ('an', 178),
 ('internet', 26),
 ('business', 383),
 ('of', 970),
 ('one', 100),
 ('achieving', 1),
 ('financial', 17)]

In [30]:
wordCountsSorted = wordCounts.map(lambda x: (x[1], x[0])).sortByKey()
wordCountsSorted.take(10)

[(1, 'achieving'),
 (1, 'contents'),
 (1, 'preparation'),
 (1, 'skillset'),
 (1, 'determination'),
 (1, 'confidence'),
 (1, 'strike'),
 (1, 'blame'),
 (1, 'devoted'),
 (1, 'commuted')]

In [32]:
results = wordCountsSorted.collect()
for result in results:
    count = str(result[0])
    word = result[1].encode('ascii', 'ignore')
    if (word):
        print(word.decode() + ':\t\t' + count)

achieving:		1
contents:		1
preparation:		1
skillset:		1
determination:		1
confidence:		1
strike:		1
blame:		1
devoted:		1
commuted:		1
complaint:		1
rewarded:		1
role:		1
marriage:		1
combat:		1
secondary:		1
ultimatum:		1
weeks:		1
walked:		1
matches:		1
nor:		1
requirement:		1
43:		1
broke:		1
gloat:		1
heart:		1
attack:		1
65:		1
retire:		1
smarts:		1
tenacity:		1
discarding:		1
bold:		1
psyche:		1
rebel:		1
magnitude:		1
justify:		1
surprising:		1
fell:		1
laid:		1
downsized:		1
imagined:		1
pursuing:		1
roof:		1
starving:		1
foreclosed:		1
returning:		1
prototyped:		1
proven:		1
quits:		1
sba:		1
tenure:		1
secure:		1
28:		1
moderately:		1
americans:		1
representing:		1
14:		1
fringe:		1
internal:		1
religious:		1
cults:		1
brainwashing:		1
beliefs:		1
questioning:		1
instill:		1
youth:		1
grew:		1
teachers:		1
grades:		1
graduated:		1
absorbed:		1
culture:		1
promotes:		1
indoctrinated:		1
landed:		1
children:		1
terrifying:		1
barely:		1
mouths:		1
fulfill:		1
thrust:		1
foam:		

expect:		11
average:		11
seem:		11
early:		11
bills:		11
rewards:		11
startup:		11
high:		11
running:		11
close:		11
call:		11
goals:		11
accounting:		11
service:		11
contractor:		11
between:		11
invest:		11
larger:		11
front:		11
during:		11
research:		11
size:		11
section:		11
searching:		11
digital:		11
interested:		11
test:		11
technical:		11
publications:		11
directly:		11
recommend:		11
patent:		11
came:		11
highly:		11
linkedin:		11
visitors:		11
ocean:		11
include:		11
cpa:		11
countries:		11
decision:		12
licenses:		12
basics:		12
growing:		12
everything:		12
am:		12
needed:		12
stress:		12
against:		12
freelancers:		12
chance:		12
advantage:		12
almost:		12
limited:		12
comes:		12
resources:		12
makes:		12
scale:		12
move:		12
emails:		12
sense:		12
experience:		12
budget:		12
cover:		12
automated:		12
community:		12
equipment:		12
valuable:		12
realistic:		12
opportunity:		12
channels:		12
network:		12
links:		12
link:		12
target:		12
net:		13
happen:		13
engine:		13
matter:

In [49]:
sc.stop()

In [50]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('SpendByCustomer')
sc = SparkContext(conf = conf)

def parseLine(line):
    fields = line.split(',')
    return (int(fields[0]), float(fields[2]))

lines = sc.textFile('customer-orders.csv')

In [51]:
lines.take(3)

['44,8602,37.19', '35,5368,65.89', '2,3391,40.64']

In [52]:
rdd = lines.map(parseLine)
rdd.take(3)

[(44, 37.19), (35, 65.89), (2, 40.64)]

In [53]:
totalByCustomer = rdd.reduceByKey(lambda x, y: x+y)
totalByCustomer.take(3)

[(44, 4756.8899999999985), (35, 5155.419999999999), (2, 5994.59)]

In [54]:
results = totalByCustomer.collect()
for result in results:
    print(result)

(44, 4756.8899999999985)
(35, 5155.419999999999)
(2, 5994.59)
(47, 4316.299999999999)
(29, 5032.529999999999)
(91, 4642.259999999999)
(70, 5368.249999999999)
(85, 5503.43)
(53, 4945.299999999999)
(14, 4735.030000000001)
(51, 4975.22)
(42, 5696.840000000003)
(79, 3790.570000000001)
(50, 4517.27)
(20, 4836.859999999999)
(15, 5413.510000000001)
(5, 4561.069999999999)
(48, 4384.33)
(31, 4765.05)
(4, 4815.050000000002)
(36, 4278.049999999997)
(57, 4628.4)
(12, 4664.589999999998)
(22, 5019.449999999999)
(54, 6065.389999999999)
(0, 5524.949999999998)
(88, 4830.549999999999)
(86, 4908.81)
(13, 4367.62)
(40, 5186.429999999999)
(98, 4297.260000000001)
(55, 5298.090000000002)
(95, 4876.840000000002)
(61, 5497.479999999998)
(27, 4915.889999999999)
(78, 4524.509999999999)
(83, 4635.799999999997)
(6, 5397.879999999998)
(26, 5250.4)
(75, 4178.500000000001)
(25, 5057.610000000001)
(71, 5995.660000000003)
(39, 6193.109999999999)
(60, 5040.709999999999)
(97, 5977.189999999995)
(7, 4755.070000000001)
(21

In [55]:
sc.stop()

In [56]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('SpendByCustomerSorted')
sc = SparkContext(conf = conf)

def parseLine(line):
    fields = line.split(',')
    return (int(fields[0]), float(fields[2]))

lines = sc.textFile('customer-orders.csv')
rdd = lines.map(parseLine)

In [57]:
rdd.take(3)

[(44, 37.19), (35, 65.89), (2, 40.64)]

In [59]:
totalByCustomer = rdd.reduceByKey(lambda x, y: x + y)
totalByCustomer.take(3)

[(44, 4756.8899999999985), (35, 5155.419999999999), (2, 5994.59)]

In [61]:
flipped = totalByCustomer.map(lambda x: (x[1], x[0]))
flipped.take(3)

[(4756.8899999999985, 44), (5155.419999999999, 35), (5994.59, 2)]

In [62]:
totalByCustomerSorted = flipped.sortByKey()
totalByCustomerSorted.take(3)

[(3309.38, 45), (3790.570000000001, 79), (3924.230000000001, 96)]

In [63]:
results = totalByCustomerSorted.collect()
for result in results:
    print(result)

(3309.38, 45)
(3790.570000000001, 79)
(3924.230000000001, 96)
(4042.6499999999987, 23)
(4172.289999999998, 99)
(4178.500000000001, 75)
(4278.049999999997, 36)
(4297.260000000001, 98)
(4316.299999999999, 47)
(4327.729999999999, 77)
(4367.62, 13)
(4384.33, 48)
(4394.599999999999, 49)
(4475.569999999999, 94)
(4505.79, 67)
(4517.27, 50)
(4524.509999999999, 78)
(4561.069999999999, 5)
(4628.4, 57)
(4635.799999999997, 83)
(4642.259999999999, 91)
(4647.129999999999, 74)
(4652.939999999999, 84)
(4659.63, 3)
(4664.589999999998, 12)
(4681.919999999999, 66)
(4701.019999999999, 56)
(4707.41, 21)
(4727.860000000001, 80)
(4735.030000000001, 14)
(4735.200000000002, 37)
(4755.070000000001, 7)
(4756.8899999999985, 44)
(4765.05, 31)
(4812.489999999998, 82)
(4815.050000000002, 4)
(4819.700000000001, 10)
(4830.549999999999, 88)
(4836.859999999999, 20)
(4851.479999999999, 89)
(4876.840000000002, 95)
(4898.460000000002, 38)
(4904.209999999999, 76)
(4908.81, 86)
(4915.889999999999, 27)
(4921.27, 18)
(4945.299

In [100]:
sc.stop()

In [102]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('RatingsHistogram')
sc = SparkContext(conf = conf)

lines = sc.textFile('u.data')
lines.take(3)

['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']

In [103]:
ratings = lines.map(lambda x: x.split()[2])
ratings.take(3)

['3', '3', '1']

In [105]:
result = ratings.countByValue()
result

defaultdict(int, {'1': 6110, '2': 11370, '3': 27145, '4': 34174, '5': 21201})

In [108]:
import collections
sortedResults = collections.OrderedDict(sorted(result.items()))
for key, value in sortedResults.items():
    print('%s %i' % (key, value))

1 6110
2 11370
3 27145
4 34174
5 21201


In [109]:
sc.stop()

In [112]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('PopularMovies')
sc = SparkContext(conf = conf)

lines = sc.textFile('u.data')
lines.take(3)

['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']

In [122]:
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movies.take(3)

[(242, 1), (302, 1), (377, 1)]

In [123]:
movieCounts = movies.reduceByKey(lambda x, y: x + y)
movieCounts.take(3)

[(242, 117), (302, 297), (377, 13)]

In [130]:
flipped = movieCounts.map(lambda x: (x[1], x[0]))
flipped.take(3)

[(117, 242), (297, 302), (13, 377)]

In [131]:
sortedMovies = flipped.sortByKey()
sortedMovies.take(3)

[(1, 1348), (1, 1320), (1, 1492)]

In [132]:
results = sortedMovies.collect()
for result in results:
    print(result)

(1, 1348)
(1, 1320)
(1, 1492)
(1, 1364)
(1, 1493)
(1, 830)
(1, 1498)
(1, 814)
(1, 1520)
(1, 711)
(1, 1373)
(1, 1309)
(1, 857)
(1, 1236)
(1, 1310)
(1, 1536)
(1, 1582)
(1, 1343)
(1, 1457)
(1, 1543)
(1, 599)
(1, 1458)
(1, 1561)
(1, 1533)
(1, 1565)
(1, 1563)
(1, 1156)
(1, 1505)
(1, 852)
(1, 1557)
(1, 1562)
(1, 1586)
(1, 1476)
(1, 1580)
(1, 1363)
(1, 1339)
(1, 1566)
(1, 1349)
(1, 1447)
(1, 1235)
(1, 1587)
(1, 677)
(1, 1571)
(1, 1575)
(1, 1510)
(1, 1579)
(1, 1603)
(1, 1616)
(1, 1526)
(1, 1596)
(1, 1453)
(1, 1461)
(1, 1559)
(1, 1507)
(1, 1593)
(1, 1576)
(1, 1525)
(1, 1569)
(1, 1568)
(1, 1340)
(1, 1619)
(1, 1601)
(1, 1583)
(1, 1624)
(1, 1651)
(1, 1414)
(1, 1486)
(1, 1614)
(1, 1570)
(1, 1599)
(1, 1649)
(1, 1572)
(1, 1653)
(1, 1452)
(1, 1595)
(1, 1548)
(1, 1655)
(1, 1654)
(1, 1482)
(1, 1657)
(1, 1650)
(1, 1660)
(1, 1661)
(1, 1515)
(1, 1621)
(1, 1632)
(1, 1618)
(1, 1647)
(1, 1581)
(1, 1584)
(1, 1669)
(1, 1613)
(1, 1130)
(1, 1663)
(1, 1634)
(1, 1606)
(1, 1329)
(1, 1494)
(1, 1673)
(1, 1633)
(1, 167

In [137]:
sc.stop()

In [138]:
from pyspark import SparkConf, SparkContext

def loadMovieNames():
    movieNames = {}
    with open('u.item', encoding = "ISO-8859-1") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

conf = SparkConf().setMaster('local').setAppName('PopularMovies')
sc = SparkContext(conf = conf)
nameDict = sc.broadcast(loadMovieNames())

In [141]:
lines = sc.textFile('u.data')
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movies.take(3)

[(242, 1), (302, 1), (377, 1)]

In [143]:
movieCounts = movies.reduceByKey(lambda x, y:x + y )
movieCounts.take(3)

[(242, 117), (302, 297), (377, 13)]

In [145]:
flipped = movieCounts.map(lambda x: (x[1], x[0]))
flipped.take(3)

[(117, 242), (297, 302), (13, 377)]

In [146]:
sortedMovies = flipped.sortByKey()
sortedMovies.take(3)

[(1, 1348), (1, 1320), (1, 1492)]