In [1]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import desc
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf,col
import os
# tools
import re
import math
import json
import requests
import itertools
import numpy as np
import pandas as pd
import time
from datetime import datetime, timedelta
import string

In [2]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.evaluation import ClusteringEvaluator

In [3]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark Naive Bayes") \
        .getOrCreate()
    return spark
spark = init_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/27 22:20:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
'''
Read Lemma data
'''
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType

data = spark.read.csv("../data/lemma.csv", header=True)
function_array = udf(lambda r: r.split("|"), ArrayType(StringType()))
function_toNumerical = udf(lambda r: int(r), IntegerType())
text_lemmas = data.withColumn('finished_lemmas', function_array('text')).drop('text').withColumn('label', function_toNumerical('label'))
print(text_lemmas.count())
text_lemmas.show()

                                                                                

14827


[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+-----+--------------------+
|                  id|label|     finished_lemmas|
+--------------------+-----+--------------------+
|RecognitionBasic9521|    0|[Chart, look, goo...|
|            whttevrr|    0|[Low, float, coul...|
|      CuriousDev1012|    0|[Hey, All, Ive, b...|
|         Marketspike|    0|[Solar, Integrate...|
|    bananahammock699|    0|[Positive, earnin...|
|          Mathew_177|    0|[BreakingUranium,...|
|          kidicaru59|    0|[This, article, b...|
|        OffceFnactic|    0|[Think, MMA, be, ...|
|yellowbrickroad2YOLO|    0|[TDR, TripAdvisor...|
|          wujianmiao|    0|[harmonic, resist...|
|   Brilliant-Key8466|    0|[Hey, guy, In, da...|
|   blue_tailed_skink|    0|[Taken, from, a, ...|
|           lovelyhug|    0|[Which, dividend,...|
|    throwawaymyalias|    0|[cent, stock, M, ...|
|     rupturedspeaker|    0|[I, have, all, of...|
|      nateschillings|    0|[American, Virtua...|
|          HoffmanGuy|    0|[Just, say, I, be...|


                                                                                

In [5]:
'''
Get the Corpus.
Removing stop words from the text lemmas. 
'''
remover = StopWordsRemover(inputCol="finished_lemmas", outputCol="text")
filtered_df = remover.transform(text_lemmas)
filtered_df.show()

+--------------------+-----+--------------------+--------------------+
|                  id|label|     finished_lemmas|                text|
+--------------------+-----+--------------------+--------------------+
|RecognitionBasic9521|    0|[Chart, look, goo...|[Chart, look, goo...|
|            whttevrr|    0|[Low, float, coul...|[Low, float, beco...|
|      CuriousDev1012|    0|[Hey, All, Ive, b...|[Hey, Ive, code, ...|
|         Marketspike|    0|[Solar, Integrate...|[Solar, Integrate...|
|    bananahammock699|    0|[Positive, earnin...|[Positive, earnin...|
|          Mathew_177|    0|[BreakingUranium,...|[BreakingUranium,...|
|          kidicaru59|    0|[This, article, b...|[article, bit, da...|
|        OffceFnactic|    0|[Think, MMA, be, ...|[Think, MMA, real...|
|yellowbrickroad2YOLO|    0|[TDR, TripAdvisor...|[TDR, TripAdvisor...|
|          wujianmiao|    0|[harmonic, resist...|[harmonic, resist...|
|   Brilliant-Key8466|    0|[Hey, guy, In, da...|[Hey, guy, day, S...|
|   bl

In [6]:
'''
HashingTF: Create Document-Term Matrix
'''
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

hashingTF = HashingTF(inputCol="text", outputCol="rawFeatures", numFeatures=50)
featurizedData = hashingTF.transform(filtered_df)

featurizedData.show(truncate=False)

[Stage 6:>                                                          (0 + 1) / 1]

22/08/27 22:20:45 WARN PythonUDFRunner: Detected deadlock while completing task 0.0 in stage 6 (TID 7): Attempting to kill Python Worker
+--------------------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [7]:
'''
TF-IDF, followed from HashingTF.
'''
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

                                                                                

In [8]:
selectedData = rescaledData.select('id', 'label','features', 'text')
selectedData.show(truncate=False)

+--------------------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
'''
Separate data into training/test used for Naive-Bayes
'''
training_zero, test_zero = selectedData.where(selectedData.label == 0).randomSplit([0.7, 0.3])
training_one, test_one = selectedData.where(selectedData.label == 1).randomSplit([0.7, 0.3])

training = training_zero.union(training_one)
test = test_zero.union(test_one)
training.show()
# should be 70% of total in training, 30% in test
print("Total data count:", selectedData.count())
print("Total count of >1.4%", training.count())
print("Total count of <1.4", test.count())

                                                                                

+-------------------+-----+--------------------+--------------------+
|                 id|label|            features|                text|
+-------------------+-----+--------------------+--------------------+
|         -ApeBrain-|    0|(50,[6,14,15,16,2...|[move, Puerto, Ri...|
|    -Billy_Butcher-|    0|(50,[3,7,11,13,14...|[Big, news, quest...|
|       -Perspective|    0|(50,[1,3,4,6,7,8,...|[see, PGM, lacklu...|
|      -RogueShadow-|    0|(50,[0,3,7,8,13,2...|[Looks, like, ann...|
|           -_PURE_-|    0|(50,[0,1,2,3,4,5,...|[Preface, Chances...|
|          -dtstocks|    0|(50,[1,5,6,9,10,1...|[Lots, volume, to...|
|         10xwannabe|    0|(50,[0,1,2,3,4,5,...|[Released, today,...|
|         11AllSkill|    0|(50,[7,12,13,19,2...|[want, ask, many,...|
|         1827338989|    0|(50,[0,3,8,15,19,...|[Gold, stock, gro...|
|      1evolvedchimp|    0|(50,[2,6,11,12,19...|[really, show, li...|
|     2001Andrew2001|    0|(50,[5,10,13,17,3...|[people, find, pe...|
|         24flinchin

                                                                                

Total count of >1.4% 10494




Total count of <1.4 4333


                                                                                

In [10]:
'''
Naive-Bayes following from TF-IDF
'''
def NAIVEBAYES_HASH(smooth=0, model_type=0):
  # separating training/test sets
  training_zero, test_zero = selectedData.where(selectedData.label == 0).randomSplit([0.7, 0.3])
  training_one, test_one = selectedData.where(selectedData.label == 1).randomSplit([0.7, 0.3])

  training = training_zero.union(training_one)
  test = test_zero.union(test_one)
  #training.show()

  # print("Total data count:", selectedData.count())
  # print("Training data count:", training.count())
  # print("Testing data count:", test.count())
  # print("------------------------------------")
  # print("Total training count of >6%", training.where(selectedData.label == 0).count())
  # print("Total training count of <6%", training.where(selectedData.label == 1).count())
  # print("------------------------------------")
  # print("Total test count of >6%", test.where(selectedData.label == 0).count())
  # print("Total test count of <6%", test.where(selectedData.label == 1).count())

  # create trainer with parameters then train
  # smoothing: smooth probabilities of 0 to the input
  nb = NaiveBayes(smoothing=smooth, modelType=model_type)
  model_NB = nb.fit(training)

  # display on test set: appends a prediction column
  predictions = model_NB.transform(test)
  #predictions.show()

  # compute accuracy of on test set: compares labelCol and predictionCol
  evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
  accuracy = evaluator.evaluate(predictions)
  #print('Model accuracy:', accuracy)
  return accuracy

In [11]:
import statistics
import random

extract_method = "HashingTF"
iter_each = 10
iter_total = 50
m_types = ["complement", "multinomial"]
means = []

for model_type in m_types:
  for k in range(iter_total):
    accuracies = []
    smoothing = random.uniform(0.01, 0.8)
    for i in range(iter_each):
      accuracies.append(NAIVEBAYES_HASH(smoothing, model_type))
    mean = statistics.mean(accuracies)
    print("=> Mean:", mean, "- Smoothing:", smoothing, "- Model:", model_type)
    means.append((mean, smoothing, model_type, extract_method))

                                                                                

22/08/27 22:21:11 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

=> Mean: 0.48053296407405055 - Smoothing: 0.7996554885001081 - Model: complement


ERROR:root:KeyboardInterrupt while sending command.                 (3 + 3) / 6]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
                                                                                

KeyboardInterrupt: 

In [None]:
from pyspark.sql.types import FloatType
acc_df = pd.DataFrame(means, columns=['mean', 'smoothing', 'model_type', 'extract_method'])
acc_df.to_csv("means_hash.csv")

22/08/27 07:23:28 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 901523 ms exceeds timeout 120000 ms
22/08/27 07:23:28 WARN SparkContext: Killing executors is not supported by current scheduler.
