In [2]:
import pyspark as ps
import warnings
from pyspark.sql import SQLContext

In [3]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('local[4]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")

Just created a SparkContext


In [4]:
sc.master

'local[4]'

In [34]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('clean_tweet.csv')

In [35]:
type(df)

pyspark.sql.dataframe.DataFrame

In [36]:
df.show(5)

+---+--------------------+------+
|_c0|                text|target|
+---+--------------------+------+
|  0|awww that bummer ...|     0|
|  1|is upset that he ...|     0|
|  2|dived many times ...|     0|
|  3|my whole body fee...|     0|
|  4|no it not behavin...|     0|
+---+--------------------+------+
only showing top 5 rows



In [37]:
df = df.dropna()

In [38]:
df.count()

1596041

In [39]:
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)

In [40]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [41]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

+---+--------------------+------+--------------------+--------------------+--------------------+-----+
|_c0|                text|target|               words|                  tf|            features|label|
+---+--------------------+------+--------------------+--------------------+--------------------+-----+
|  0|awww that bummer ...|     0|[awww, that, bumm...|(65536,[8436,8847...|(65536,[8436,8847...|  0.0|
|  1|is upset that he ...|     0|[is, upset, that,...|(65536,[1444,2071...|(65536,[1444,2071...|  0.0|
|  2|dived many times ...|     0|[dived, many, tim...|(65536,[2548,2888...|(65536,[2548,2888...|  0.0|
|  3|my whole body fee...|     0|[my, whole, body,...|(65536,[158,11650...|(65536,[158,11650...|  0.0|
|  4|no it not behavin...|     0|[no, it, not, beh...|(65536,[1968,4488...|(65536,[1968,4488...|  0.0|
+---+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [42]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=1000)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

In [43]:
predictions.show(5)

+---+--------------------+------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|_c0|                text|target|               words|                  tf|            features|label|       rawPrediction|         probability|prediction|
+---+--------------------+------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|324|agree the shapesh...|     0|[agree, the, shap...|(65536,[338,7560,...|(65536,[338,7560,...|  0.0|[-2.7485114627838...|[0.06017077223588...|       1.0|
|439|      oh my yes miss|     0| [oh, my, yes, miss]|(65536,[36127,378...|(65536,[36127,378...|  0.0|[1.88049337239556...|[0.86766778595005...|       0.0|
|461|just heard that t...|     0|[just, heard, tha...|(65536,[4200,7661...|(65536,[4200,7661...|  0.0|[1.06943371566866...|[0.74448920929041...|       0.0|
|582|brian do not make...|     0|[brian, do, not, ...|(65536,[41

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.8551475804015439

In [45]:
evaluator.getMetricName()

'areaUnderROC'

In [46]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
accuracy

0.7847853535353535

In [61]:
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp

In [66]:
CURRENCY = "bitcoin"
CURRENCY_SYMBOL = "BTC"
tweets_clean_file = f'data/twitter/{CURRENCY_SYMBOL}/{CURRENCY}_tweets_clean.csv'

df1 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(tweets_clean_file)
#view the data
df1.show(5)

#print the schema of the file

+---+--------------------+------+
|_c0|                text|target|
+---+--------------------+------+
|  0|become bitcoin de...|     0|
|  1|btc buying pressu...|     0|
|  2|rt embarrassed to...|     0|
|  3|rt hpt price usdt...|     0|
|  4|rt ll give btc or...|     0|
+---+--------------------+------+
only showing top 5 rows



In [67]:
train_df = pipelineFit.transform(df1)

In [68]:
predictions_new = lrModel.transform(val_df)

In [71]:
predictions.select("prediction").show()

+----------+
|prediction|
+----------+
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       1.0|
|       1.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
+----------+
only showing top 20 rows

