In [1]:
import pyspark
from pyspark.sql import SparkSession
sc = pyspark.SparkContext('local[*]')
spark = SparkSession.builder.appName("csvTest").getOrCreate()

In [1]:
from operator import add
from time import time

In [2]:
!wget https://raw.githubusercontent.com/fivethirtyeight/data/master/daily-show-guests/daily_show_guests.csv --no-check-certificate

--2017-12-19 14:10:48--  https://raw.githubusercontent.com/fivethirtyeight/data/master/daily-show-guests/daily_show_guests.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 126723 (124K) [text/plain]
Saving to: ‘daily_show_guests.csv’


2017-12-19 14:10:49 (536 KB/s) - ‘daily_show_guests.csv’ saved [126723/126723]



In [5]:
%ls

daily_show_guests.csv  Lab 7.ipynb  [0m[01;34mspark-warehouse[0m/


In [141]:
raw_data = sc.textFile("daily_show_guests.csv")
raw_data.take(5)

['YEAR,GoogleKnowlege_Occupation,Show,Group,Raw_Guest_List',
 '1999,actor,1/11/99,Acting,Michael J. Fox',
 '1999,Comedian,1/12/99,Comedy,Sandra Bernhard',
 '1999,television actress,1/13/99,Acting,Tracey Ullman',
 '1999,film actress,1/14/99,Acting,Gillian Anderson']

In [142]:
raw_data.count()

2694

In [172]:
def lineSplit(line):
    res = line.split(",")
    while len(res) >= 6:
        res[4]=res[4]+","+res[5]
        del res[5]
    return res

In [175]:
daily_show = raw_data.map(lambda line: lineSplit(line))

In [176]:
daily_show.take(5)

[['YEAR', 'GoogleKnowlege_Occupation', 'Show', 'Group', 'Raw_Guest_List'],
 ['1999', 'actor', '1/11/99', 'Acting', 'Michael J. Fox'],
 ['1999', 'Comedian', '1/12/99', 'Comedy', 'Sandra Bernhard'],
 ['1999', 'television actress', '1/13/99', 'Acting', 'Tracey Ullman'],
 ['1999', 'film actress', '1/14/99', 'Acting', 'Gillian Anderson']]

In [177]:
daily_show.count()

2694

In [163]:
daily_show.filter(lambda x: x[4]=="\"Hootie & the Blowfish, Billy Crystal\"").take(5)

[['1999',
  'rock band',
  '3/11/99',
  'Musician',
  '"Hootie & the Blowfish, Billy Crystal"'],
 ['1999',
  'actor',
  '3/11/99',
  'Acting',
  '"Hootie & the Blowfish, Billy Crystal"']]

In [16]:
tally = daily_show.map(lambda x: (x[0], 1)).reduceByKey(add)

In [185]:
for kv in tally.take(5):
    print(kv)

('YEAR', 1)
('2002', 159)
('2003', 166)
('2004', 164)
('2007', 141)


In [118]:
def filter_year(line):
    return not("YEAR" in line[0])

In [178]:
filtered_daily_show = daily_show.filter(lambda line: filter_year(line))

In [179]:
filtered_daily_show.take(5)

[['1999', 'actor', '1/11/99', 'Acting', 'Michael J. Fox'],
 ['1999', 'Comedian', '1/12/99', 'Comedy', 'Sandra Bernhard'],
 ['1999', 'television actress', '1/13/99', 'Acting', 'Tracey Ullman'],
 ['1999', 'film actress', '1/14/99', 'Acting', 'Gillian Anderson'],
 ['1999', 'actor', '1/18/99', 'Acting', 'David Alan Grier']]

In [168]:
filtered_daily_show.count()

2693

In [134]:
filtered_daily_show.map(lambda x: (str.lower(x[1]), 1)).reduceByKey(add).take(10)

[('actor', 596),
 ('film actress', 21),
 ('model', 9),
 ('stand-up comedian', 44),
 ('actress', 271),
 ('television personality', 13),
 ('comic', 2),
 ('musician', 19),
 ('film actor', 19),
 ('journalist', 253)]

In [112]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [164]:
schema = StructType([
    StructField("YEAR", StringType(), True),
    StructField("GoogleKnowlege_Occupation", StringType(), True),
    StructField("Show", StringType(), True),
    StructField("Group", StringType(), True),
    StructField("Raw_Guest_List", StringType(), True)])

In [184]:
udf = spark.createDataFrame(filtered_daily_show, schema=schema)
udf.show()

+----+-------------------------+--------+--------+--------------------+
|YEAR|GoogleKnowlege_Occupation|    Show|   Group|      Raw_Guest_List|
+----+-------------------------+--------+--------+--------------------+
|1999|                    actor| 1/11/99|  Acting|      Michael J. Fox|
|1999|                 Comedian| 1/12/99|  Comedy|     Sandra Bernhard|
|1999|       television actress| 1/13/99|  Acting|       Tracey Ullman|
|1999|             film actress| 1/14/99|  Acting|    Gillian Anderson|
|1999|                    actor| 1/18/99|  Acting|    David Alan Grier|
|1999|                    actor| 1/19/99|  Acting|     William Baldwin|
|1999|          Singer-lyricist| 1/20/99|Musician|       Michael Stipe|
|1999|                    model| 1/21/99|   Media|      Carmen Electra|
|1999|                    actor| 1/25/99|  Acting|     Matthew Lillard|
|1999|        stand-up comedian| 1/26/99|  Comedy|         David Cross|
|1999|                  actress| 1/27/99|  Acting|      Yasmine 

In [182]:
udf.groupBy("GoogleKnowlege_Occupation").count().show(10)

+-------------------------+-----+
|GoogleKnowlege_Occupation|count|
+-------------------------+-----+
|        singer-songwriter|   19|
|     former white hous...|    8|
|        us representative|    9|
|     former governor o...|    1|
|          race car driver|    1|
|     Associate Justice...|    1|
|                 diplomat|    6|
|              Commentator|    4|
|     First Minister of...|    1|
|     former governor o...|    3|
+-------------------------+-----+
only showing top 10 rows



In [110]:
!cat daily_show_guests.csv

YEAR,GoogleKnowlege_Occupation,Show,Group,Raw_Guest_List
1999,actor,1/11/99,Acting,Michael J. Fox
1999,Comedian,1/12/99,Comedy,Sandra Bernhard
1999,television actress,1/13/99,Acting,Tracey Ullman
1999,film actress,1/14/99,Acting,Gillian Anderson
1999,actor,1/18/99,Acting,David Alan Grier
1999,actor,1/19/99,Acting,William Baldwin
1999,Singer-lyricist,1/20/99,Musician,Michael Stipe
1999,model,1/21/99,Media,Carmen Electra
1999,actor,1/25/99,Acting,Matthew Lillard
1999,stand-up comedian,1/26/99,Comedy,David Cross
1999,actress,1/27/99,Acting,Yasmine Bleeth
1999,actor,1/28/99,Acting,D. L. Hughley
1999,television actress,10/18/99,Acting,Rebecca Gayheart
1999,Comedian,10/19/99,Comedy,Steven Wright
1999,actress,10/20/99,Acting,Amy Brenneman
1999,actress,10/21/99,Acting,Melissa Gilbert
1999,actress,10/25/99,Acting,Cathy Moriarty
1999,comedian,10/26/99,Comedy,Louie Anderson
1999,actress,10/27/99,Acting,Sarah Michelle Gellar
1999,Singer-songwriter,10/28/99,Musician,Melanie C
1

In [173]:
testStr = sc.parallelize(["1999,actress,2/10/99,Acting,\"Pamela Anderson, Natalie Raitano, Molly Culver\""])
testStr.collect()

['1999,actress,2/10/99,Acting,"Pamela Anderson, Natalie Raitano, Molly Culver"']

In [174]:
strRDD= testStr.map(lambda x: lineSplit(x))
strRDD.collect()

[['1999',
  'actress',
  '2/10/99',
  'Acting',
  '"Pamela Anderson, Natalie Raitano, Molly Culver"']]

In [159]:
strRDD.filter(lambda x: x[4]=="\"Hootie & the Blowfish, Billy Crystal\"").collect()

[['1999',
  'rock band',
  '3/11/99',
  'Musician',
  '"Hootie & the Blowfish, Billy Crystal"']]