In [2]:
""" PySpark setup """

# Find the latest version of spark 3.0 from http://www-us.apache.org/dist/spark/ and enter as the spark version environment variable
import os
spark_version = 'spark-3.0.2'
os.environ['SPARK_VERSION'] = spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set environment variables
os.environ["JAVA_HOME"] = '/usr/lib/jvm/java-11-openjdk-amd64'
os.environ["SPARK_HOME"] = f'/content/{spark_version}-bin-hadoop2.7'

# Locate Spark
import findspark
findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/comp

In [3]:
# Dependencies
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, length
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Spark session: get the existing session or create one with the app name 'app'
spark = (
    SparkSession.builder
    .appName('app')
    .getOrCreate()
)
spark

In [5]:
""" Text data """

# Build a three-row demo frame of (id, sentence) pairs
text_df = spark.createDataFrame(
    data=[
        (0, 'Spark is great'),
        (1, 'We are learning Spark'),
        (2, 'Spark is better than Hadoop no doubt'),
    ],
    schema=['id', 'sentence'],
)
text_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1|We are learning S...|
|  2|Spark is better t...|
+---+--------------------+



In [6]:
""" NLP pipeline """

# Make the cell safe to re-run: it rebinds text_df to its own output, and the
# feature transformers below raise "Output column ... already exists" when the
# input still carries columns from a previous run. drop() ignores names that
# are not present, so on a fresh kernel this is a no-op.
text_base = text_df.drop('word_count', 'words', 'filtered_words', 'hashed_values', 'tf_idf')

# Add a col for word count
count_words = udf(lambda sentence: len(sentence.split()), IntegerType()) # udf to count whitespace-separated words
text_counted = text_base.withColumn('word_count', count_words(col('sentence')))

# Add a col for words
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
text_tokenized = tokenizer.transform(text_counted)

# Add a col that filters out stop words
stop_remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
text_filtered = stop_remover.transform(text_tokenized)

# Add a col for term frequency
tf = HashingTF(inputCol='filtered_words', outputCol='hashed_values')
text_hashed = tf.transform(text_filtered)

# Add a col that scales the term frequency by the inverse document frequency
# (IDF is an estimator: fit() learns the document frequencies, transform() applies them)
idf = IDF(inputCol='hashed_values', outputCol='tf_idf')
text_df = idf.fit(text_hashed).transform(text_hashed)

text_df.show(truncate=False)

+---+------------------------------------+----------+--------------------------------------------+------------------------------+--------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|id |sentence                            |word_count|words                                       |filtered_words                |hashed_values                                           |tf_idf                                                                                               |
+---+------------------------------------+----------+--------------------------------------------+------------------------------+--------------------------------------------------------+-----------------------------------------------------------------------------------------------------+
|0  |Spark is great                      |3         |[spark, is, great]                          |[spark, great]                |(262

In [7]:
""" Airline tweets data """

# Hand the remote CSV to Spark so it can be looked up by file name via SparkFiles
airlines_url = 'https://s3.amazonaws.com/dataviz-curriculum/day_2/airlines.csv'
spark.sparkContext.addFile(airlines_url)

# Load the CSV (first row is the header) and rename its column to 'tweet'
airlines_df = (
    spark.read
    .csv(SparkFiles.get('airlines.csv'), header=True)
    .withColumnRenamed('Airline Tweets', 'tweet')
)
airlines_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+
|tweet                                                                                                                                  |
+---------------------------------------------------------------------------------------------------------------------------------------+
|@VirginAmerica plus you've added commercials to the experience... tacky.                                                               |
|@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing. it's really the only bad thing about flying VA|
|@VirginAmerica do you miss me? Don't worry we'll be together very soon.                                                                |
|@VirginAmerica Are the hours of operation for the Club at SFO that are posted online current?                                          |
|@VirginAmerica awaiting my return

In [8]:
""" NLP pipeline """

# Make the cell safe to re-run: it rebinds airlines_df to its own output, and
# the feature transformers below raise "Output column ... already exists" when
# the input still carries columns from a previous run. drop() ignores names
# that are not present, so on a fresh kernel this is a no-op.
airlines_base = airlines_df.drop('word_count', 'words', 'filtered_words', 'hashed_values', 'tf_idf')

# Word count (number of whitespace-separated tokens per tweet)
count_words = udf(lambda text: len(text.split()), IntegerType())
airlines_counted = airlines_base.withColumn('word_count', count_words(col('tweet')))

# Words
tokenizer = Tokenizer(inputCol='tweet', outputCol='words')
airlines_tokenized = tokenizer.transform(airlines_counted)

# Filtered words (stop words removed)
stop_remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
airlines_filtered = stop_remover.transform(airlines_tokenized)

# Hashed values (term frequencies)
tf = HashingTF(inputCol='filtered_words', outputCol='hashed_values')
airlines_hashed = tf.transform(airlines_filtered)

# TF-IDF (IDF is an estimator: fit() learns the document frequencies, transform() applies them)
idf = IDF(inputCol='hashed_values', outputCol='tf_idf')
airlines_df = idf.fit(airlines_hashed).transform(airlines_hashed)

airlines_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tweet                                                                                                                                  |word_count|wo

In [14]:
""" Yelp reviews data """

# Hand the remote CSV to Spark so it can be looked up by file name via SparkFiles
yelp_url = 'https://s3.amazonaws.com/dataviz-curriculum/day_2/yelp_reviews.csv'
spark.sparkContext.addFile(yelp_url)

# udf returning the number of whitespace-separated tokens in a string
count_words = udf(lambda txt: len(txt.split()), IntegerType())

# Load the reviews (first row is the header), then attach the word count of
# each review's text as the 'length' column
yelp_raw = spark.read.csv(SparkFiles.get('yelp_reviews.csv'), header=True)
yelp_df = yelp_raw.withColumn('length', count_words(yelp_raw['text']))

yelp_df.show(n=5, truncate=False)

+--------+---------------------------------------------------------------------------------------+------+
|class   |text                                                                                   |length|
+--------+---------------------------------------------------------------------------------------+------+
|positive|Wow... Loved this place.                                                               |4     |
|negative|Crust is not good.                                                                     |4     |
|negative|Not tasty and the texture was just nasty.                                              |8     |
|positive|Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.|15    |
|positive|The selection on the menu was great and so were the prices.                            |12    |
+--------+---------------------------------------------------------------------------------------+------+
only showing top 5 rows



In [15]:
""" NLP pipeline """

# Feature-engineering stages, listed in the order the pipeline runs them
stages = [
    Tokenizer(inputCol='text', outputCol='tokens'),                            # words
    StopWordsRemover(inputCol='tokens', outputCol='words'),                    # non-stopword words
    HashingTF(inputCol='words', outputCol='hash'),                             # hashed tf
    IDF(inputCol='hash', outputCol='tf_idf'),                                  # tf-idf
    StringIndexer(inputCol='class', outputCol='label'),                        # numeric target label
    VectorAssembler(inputCols=['length', 'tf_idf'], outputCol='features'),     # feature set
]
# Keep each stage bound to its own name as well
tokenizer, stop_remover, tf, idf, str_indexer, featurizer = stages

# Fit the whole pipeline on the yelp data, then apply it to the same data
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(yelp_df)
yelp_transformed = pipeline_model.transform(yelp_df)

# Show every column first, then only the label and feature vector used for training
yelp_transformed.show(n=5, truncate=False)
yelp_transformed.select('label', 'features').show(n=5, truncate=False)

+--------+---------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|class   |text                                                  

In [16]:
""" ML """

# Train/test split (80/20 weights, seeded)
# NOTE(review): yelp_transformed comes from a pipeline that was fitted on the
# full dataset before this split, so the IDF weights have already seen the test
# rows. Consider splitting yelp_df first and fitting the pipeline on train only.
train, test = yelp_transformed.randomSplit([0.8, 0.2], seed=0)

# Naive bayes classifier
nb = NaiveBayes(featuresCol='features', labelCol='label')
nb_fitted = nb.fit(train) # fit on train set
test_pred = nb_fitted.transform(test) # predict on test set

# Evaluate predictions
# Score with the model's 'rawPrediction' column instead of the hard 0/1
# 'prediction' column: with hard labels the ROC curve has a single operating
# point, so the reported value is not the area under the model's ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
evalulator = evaluator # original (misspelled) name kept bound for any cell that still uses it
auc = evaluator.evaluate(test_pred) # ROC-AUC score

# Show predictions
test_pred.show(5, truncate=False)
test_pred.select(['label', 'prediction']).show()
print('ROC-AUC score (Area under ROC): ', auc)

+--------+-----------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------