In [1]:
import pandas as pd
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

In [2]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('local[4]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")

Just created a SparkContext


In [3]:
sc.master


'local[4]'

In [4]:
X_new = pd.read_csv("x_new.csv")
X_new

Unnamed: 0,Sentence,Date,sentiment-sentiment_score,Hour,price1,label1,volume1,Sentiment,score
0,A Quick Garbled Circuits Primer -- vbuterin [v...,2020-03-23,"{'label': 'NEGATIVE', 'score': 0.9994949}",1.0,5989.014414,5982.209648,1.233530e+07,-1,0.999495
1,"While You’re Under Quarantine, Check These Sit...",2020-03-23,"{'label': 'NEGATIVE', 'score': 0.9978037}",1.0,5982.209648,5945.645879,1.469059e+07,-1,0.997804
2,This is why we Bitcoin!,2020-03-23,"{'label': 'NEGATIVE', 'score': 0.99105054}",13.0,5945.645879,5958.296062,7.071329e+06,-1,0.991051
3,Literally me when dogecoin increases in value,2020-03-23,"{'label': 'NEGATIVE', 'score': 0.97125334}",4.0,5958.296062,5938.628824,8.727236e+06,-1,0.971253
4,Bitcoin May Be One of the Last Assets Still Tr...,2020-03-23,"{'label': 'NEGATIVE', 'score': 0.9984334}",12.0,5938.628824,5878.498745,1.074222e+07,-1,0.998433
...,...,...,...,...,...,...,...,...,...
544,🔥Invading Ethereum USDT,2020-04-22,"{'label': 'NEGATIVE', 'score': 0.8824061}",14.0,6938.118922,6920.762739,9.895946e+06,-1,0.882406
545,What Holds Real Decentralization Back,2020-04-22,"{'label': 'POSITIVE', 'score': 0.89295495}",16.0,6920.762739,6908.007195,1.137203e+07,1,0.892955
546,Russians turn to cash and Bitcoin as coronavir...,2020-04-22,"{'label': 'NEGATIVE', 'score': 0.99790883}",13.0,6908.007195,6924.069889,6.563974e+06,-1,0.997909
547,"Tether Mints $720,000,000 This Month | More Th...",2020-04-22,"{'label': 'POSITIVE', 'score': 0.7023987}",17.0,6924.069889,6915.154614,6.009643e+06,1,0.702399


In [5]:
# Therefore X1 = data and y1 = label
DATA = X_new.drop(columns = ['sentiment-sentiment_score', 'Sentence'])

y1 = X_new["label1"]

In [6]:
# normalizing the data
from scipy.stats import zscore
X1 = DATA.drop(columns = ['Date', 'Hour', 'label1'])

normalized_data = DATA.drop(columns=['Date', 'Hour', 'Sentiment', 'score']).apply(zscore)
X2 = pd.concat([normalized_data, X1['Sentiment'], X1['score']], axis=1)
# y2 = normalized_data['label1']
mean, std = DATA['label1'].mean(), DATA['label1'].std()
# X2 = X2.drop(columns = ['label1'])
X_new1 = pd.concat([X2, X_new['Sentence']], axis=1)
X_new1.head()
# X_new1.to_csv("X_new1.csv", index=False)

Unnamed: 0,price1,label1,volume1,Sentiment,score,Sentence
0,-2.190097,-2.223646,-0.250351,-1,0.999495,A Quick Garbled Circuits Primer -- vbuterin [v...
1,-2.209513,-2.328425,-0.091589,-1,0.997804,"While You’re Under Quarantine, Check These Sit..."
2,-2.313844,-2.292174,-0.605176,-1,0.991051,This is why we Bitcoin!
3,-2.277748,-2.348534,-0.493557,-1,0.971253,Literally me when dogecoin increases in value
4,-2.333866,-2.520846,-0.357734,-1,0.998433,Bitcoin May Be One of the Last Assets Still Tr...


In [7]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('X_new1.csv')

In [8]:
df.show(5)

+-------------------+-------------------+--------------------+---------+------------------+--------------------+
|             price1|             label1|             volume1|Sentiment|             score|            Sentence|
+-------------------+-------------------+--------------------+---------+------------------+--------------------+
| -2.190096589425182|-2.2236457145032626| -0.2503508612023599|       -1|0.9994949000000001|A Quick Garbled C...|
| -2.209513252093415| -2.328425203968068|-0.09158871405098785|       -1|0.9978037000000001|While You’re Unde...|
| -2.313844006237227|-2.2921740254756036| -0.6051760944225335|       -1|        0.99105054|This is why we Bi...|
|-2.2777480793526905| -2.348533728350398|-0.49355732727368773|       -1|        0.97125334|Literally me when...|
| -2.333866412571173| -2.520846352333401| -0.3577342473096958|       -1|         0.9984334|Bitcoin May Be On...|
+-------------------+-------------------+--------------------+---------+------------------+-----

In [9]:
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)


## HashingTF + IDF + Logistic Regression


In [10]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [31]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="Sentence", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "label1", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

+-------------------+-------------------+--------------------+---------+------------------+--------------------+--------------------+--------------------+--------------------+-----+
|             price1|             label1|             volume1|Sentiment|             score|            Sentence|               words|                  tf|            features|label|
+-------------------+-------------------+--------------------+---------+------------------+--------------------+--------------------+--------------------+--------------------+-----+
|-2.5978708905433323|-2.4823321185814162|0.040530937390662096|       -1|        0.98433506|"Fed's ""infinite...|["fed's, ""infini...|(65536,[9639,1288...|(65536,[9639,1288...|329.0|
|-2.5054410795262405| -2.613673711717285| 0.07452559523532641|       -1|        0.87768614|     Infinite money!|  [infinite, money!]|(65536,[1285,4083...|(65536,[1285,4083...|188.0|
|-2.4670917892869006|-2.3569688629578414|-0.18347167045309548|       -1|0.9941818000000001

In [12]:
# (X_train, X_val), (y_train, y_val) = train_df.randomSplit([0.98, 0.01], seed = 2000), val_df.randomSplit([0.98, 0.01], seed = 2000)

In [13]:
train_df

DataFrame[price1: double, label1: double, volume1: double, Sentiment: int, score: double, Sentence: string, words: array<string>, tf: vector, features: vector, label: double]

In [14]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression()
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

In [16]:
# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 40
objectiveHistory: [0.49907749077490776, 0.3767715253777348, 0.34288743771708513, 0.3263348561313156, 0.32036744912436294, 0.31496830767439676, 0.3127400802944299, 0.3109422931193155, 0.30953941406858204, 0.30941686324490697, 0.3088637924875202, 0.30878923077435, 0.30869341758358426, 0.3086468914616747, 0.30863408117087343, 0.30862835500319896, 0.30862399158153037, 0.30862146210556174, 0.30862032356406155, 0.3086198608631712, 0.3086197078958804, 0.30861948176244763, 0.30861941883837773, 0.30861929966408047, 0.30861928326903737, 0.3086192671009558, 0.3086192560517426, 0.3086192537260935, 0.30861925233938187, 0.3086192515390836, 0.3086192508956476, 0.30861925070076346, 0.3086192505436628, 0.308619250509589, 0.3086192504752059, 0.30861925046309757, 0.3086192504521905, 0.30861925044964406, 0.3086192504471478, 0.3086192504450794]
+-------------------+
|          residuals|
+-------------------+
| 2.6472693157858203|
|  -82.6292204642175|
| 108.58742464384727|
|-121.26036103

In [28]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="price1", \
                 labelCol="label1",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.964584


In [30]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'label1')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(val_df)
dt_evaluator = RegressionEvaluator(
    labelCol="label1", predictionCol="price1", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0605826


In [59]:
# convert it to pandas dataframe
# mention local thing in the presentation.

Py4JJavaError: An error occurred while calling o337.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 133.0 failed 1 times, most recent failure: Lost task 0.0 in stage 133.0 (TID 133, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Unseen label: 0.2331107140041396.  To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:260)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 18 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$9: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: org.apache.spark.SparkException: Unseen label: 0.2331107140041396.  To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:260)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$9.apply(StringIndexer.scala:246)
	... 18 more
