In [18]:
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

In [39]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('spark://spark:7077')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")



In [40]:
training_data = spark.read.csv('train.csv',inferSchema=True,header=True)

In [41]:
training_data.show()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
|  6|    0|[2/2] huge fan fa...|
|  7|    0| @user camping to...|
|  8|    0|the next school y...|
|  9|    0|we won!!! love th...|
| 10|    0| @user @user welc...|
| 11|    0| â #ireland con...|
| 12|    0|we are so selfish...|
| 13|    0|i get to see my d...|
| 14|    1|@user #cnn calls ...|
| 15|    1|no comment!  in #...|
| 16|    0|ouch...junior is ...|
| 17|    0|i am thankful for...|
| 18|    1|retweet if you ag...|
| 19|    0|its #friday! ð...|
| 20|    0|as we all know, e...|
+---+-----+--------------------+
only showing top 20 rows



In [42]:
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet',r'http\S+',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','@\w+',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','#',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','RT',''))

training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&lt',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&gt',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&amp',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&quot',''))

training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','-',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet',' ',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','  ',''))

In [43]:
(train_set, val_set, test_set) = training_data.randomSplit([0.98, 0.01, 0.01], seed = 2000)

In [2]:
train_set.show()

NameError: name 'train_set' is not defined

In [44]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [45]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "labell")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(training_data)
train_df = pipelineFit.transform(training_data)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

+---+-----+--------------------+--------------------+--------------------+--------------------+------+
| id|label|               tweet|               words|                  tf|            features|labell|
+---+-----+--------------------+--------------------+--------------------+--------------------+------+
|  1|    0|whenafatherisdysf...|[whenafatherisdys...|(65536,[32317],[1...|(65536,[32317],[0...|   0.0|
|  2|    0|thanksforlyftcred...|[thanksforlyftcre...|(65536,[13321],[1...|(65536,[13321],[0...|   0.0|
|  3|    0|   bihdayyourmajesty| [bihdayyourmajesty]|(65536,[12599],[1...|(65536,[12599],[0...|   0.0|
|  4|    0|modeliloveutakewi...|[modeliloveutakew...|(65536,[37108],[1...|(65536,[37108],[4...|   0.0|
|  5|    0|factsguide:societ...|[factsguide:socie...|(65536,[7220],[1.0])|(65536,[7220],[0.0])|   0.0|
+---+-----+--------------------+--------------------+--------------------+--------------------+------+
only showing top 5 rows



In [46]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.5773375043448037

In [47]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
accuracy

0.9355932203389831

In [None]:
#CountVectorizer + IDF + Logistic Regression

In [50]:
%%time
from pyspark.ml.feature import CountVectorizer

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "labell")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9288
ROC-AUC: 0.5274
CPU times: total: 46.9 ms
Wall time: 10.4 s


# SparkStreaming

In [69]:
from kafka import KafkaConsumer

In [70]:
#import pydoop.hdfs as hdfs
consumer = KafkaConsumer('covid',bootstrap_servers=['localhost:9092'])

In [192]:
import pandas as pd
vals = [(2,"", 0)]
columns = ['tweet','id', 'label']
df=spark.createDataFrame(vals,columns)

for message in consumer:
  for i in range(20):  
   print(i) 
   values = message.value.decode('utf-8').splitlines()
   dfObj = pd.DataFrame(values,columns = ['tweet']) 
   dfObj['id']=dfObj.index 
   dfObj['label']=dfObj.index
   dfObj.reset_index(drop=True) 
   dfObj=spark.createDataFrame(dfObj,columns) 
    
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet',r'http\S+',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','@\w+',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','#',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','RT',''))

   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&lt',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&gt',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&amp',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&quot',''))

   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','-',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet',' ',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','  ',''))
   print(dfObj.show) 
   df = df.union(dfObj)
  break
   

0
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
1
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
2
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
3
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
4
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
5
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
6
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
7
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
8
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
9
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
10
<bound method DataFrame.show of DataFrame[tweet: string, id: bigint, label: bigint]>
11
<bound method DataFrame.show of DataFra

In [194]:
d=df.select("tweet","id","label")

In [196]:
d.show()

Py4JJavaError: An error occurred while calling o152912.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 365.0 failed 1 times, most recent failure: Lost task 0.0 in stage 365.0 (TID 355, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 166, in main
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 55, in read_command
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 169, in _read_with_length
    return self.loads(obj)
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 451, in loads
    return pickle.loads(obj, encoding=encoding)
TypeError: an integer is required (got type bytes)

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:336)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2853)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2366)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:245)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 166, in main
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 55, in read_command
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 169, in _read_with_length
    return self.loads(obj)
  File "C:\Apache-spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 451, in loads
    return pickle.loads(obj, encoding=encoding)
TypeError: an integer is required (got type bytes)

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
v=pipelineFit.transform(df) 
   predictions = v.prediction
   print(predictions) 
   print(v.label) 
   break 