In [1]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.0.1'
#spark_version = 'spark-3.0.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Co                                                                               Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to ppa.launch0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Connecting to ppa.launchpa                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu180

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [3]:
#Load training dataset 

train_df = spark.read.csv("./train.csv", inferSchema= True, header = True)

In [4]:
train_df.show()

+---+-------+--------+--------------------+------+
| id|keyword|location|                text|target|
+---+-------+--------+--------------------+------+
|  1|   null|    null|Our Deeds are the...|     1|
|  4|   null|    null|Forest fire near ...|     1|
|  5|   null|    null|All residents ask...|     1|
|  6|   null|    null|13,000 people rec...|     1|
|  7|   null|    null|Just got sent thi...|     1|
|  8|   null|    null|#RockyFire Update...|     1|
| 10|   null|    null|#flood #disaster ...|     1|
| 13|   null|    null|I'm on top of the...|     1|
| 14|   null|    null|There's an emerge...|     1|
| 15|   null|    null|I'm afraid that t...|     1|
| 16|   null|    null|Three people died...|     1|
| 17|   null|    null|Haha South Tampa ...|     1|
| 18|   null|    null|#raining #floodin...|     1|
| 19|   null|    null|#Flood in Bago My...|     1|
| 20|   null|    null|Damage to school ...|     1|
| 23|   null|    null|      What's up man?|     0|
| 24|   null|    null|       I 

In [5]:
train_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)
 |-- target: integer (nullable = true)



In [6]:
train_df.describe()

DataFrame[summary: string, id: string, keyword: string, location: string, text: string, target: string]

In [7]:
#Remove extraneous columns

trainer_df = train_df.select("id", "text", "target")

trainer_df.show()

+---+--------------------+------+
| id|                text|target|
+---+--------------------+------+
|  1|Our Deeds are the...|     1|
|  4|Forest fire near ...|     1|
|  5|All residents ask...|     1|
|  6|13,000 people rec...|     1|
|  7|Just got sent thi...|     1|
|  8|#RockyFire Update...|     1|
| 10|#flood #disaster ...|     1|
| 13|I'm on top of the...|     1|
| 14|There's an emerge...|     1|
| 15|I'm afraid that t...|     1|
| 16|Three people died...|     1|
| 17|Haha South Tampa ...|     1|
| 18|#raining #floodin...|     1|
| 19|#Flood in Bago My...|     1|
| 20|Damage to school ...|     1|
| 23|      What's up man?|     0|
| 24|       I love fruits|     0|
| 25|    Summer is lovely|     0|
| 26|   My car is so fast|     0|
| 28|What a goooooooaa...|     0|
+---+--------------------+------+
only showing top 20 rows



In [8]:
trainer_df = trainer_df.dropna()

In [9]:
from pyspark.sql.functions import col

trainer_df = trainer_df.withColumn("label",col('target'))

In [14]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
#pos_neg_to_num = StringIndexer(inputCol='class',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [15]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token'], outputCol='features')

In [16]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

In [17]:
# Fit and transform the pipeline
cleaner_train = data_prep_pipeline.fit(trainer_df)
cleaned_train = cleaner_train.transform(trainer_df)

In [18]:
# Show label and resulting features
cleaned_train.show()

+---+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|                text|target|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+---+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1|Our Deeds are the...|     1|    1|[our, deeds, are,...|[deeds, reason, #...|(262144,[24370,35...|(262144,[24370,35...|(262144,[24370,35...|
|  4|Forest fire near ...|     1|    1|[forest, fire, ne...|[forest, fire, ne...|(262144,[55310,72...|(262144,[55310,72...|(262144,[55310,72...|
|  5|All residents ask...|     1|    1|[all, residents, ...|[residents, asked...|(262144,[38983,70...|(262144,[38983,70...|(262144,[38983,70...|
|  6|13,000 people rec...|     1|    1|[13,000, people, ...|[13,000, people, ...|(262144,[38983,11...|(262144,[38983,11...|(262144

In [19]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set and a testing set
training, testing = cleaned_train.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [20]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   id|                text|target|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1|Our Deeds are the...|     1|    1|[our, deeds, are,...|[deeds, reason, #...|(262144,[24370,35...|(262144,[24370,35...|(262144,[24370,35...|[-490.47440244635...|[1.05211953043298...|       1.0|
|10005|  All of this energy|     0|    0|[all, of, this, e...|            [energy]|(262144,[109955],...|(262144,[109955],...|(262144,[109955],...|[-73.513936482542...|[0.98512631121248...|       0.0|


In [21]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting disaster tweets was: %f" % acc)

Accuracy of model at predicting disaster tweets was: 0.769555
