In [1]:
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('twitter-sentiment')
    .getOrCreate()
)

# Register sentiment_model.py as a dependency for all future tasks on this
# SparkContext. NOTE(review): presumably needed because the pickled model
# references code from that module - confirm.
spark.sparkContext.addPyFile("/home/jovyan/work/sentiment_model.py")

print("Spark context started")

Spark context started


In [2]:
# IPython help query: the trailing "?" displays the signature and docstring of addPyFile.
spark.sparkContext.addPyFile?

Signature: spark.sparkContext.addPyFile(path)
Docstring:
Add a .py or .zip dependency for all tasks to be executed on this
SparkContext in the future.  The C{path} passed can be either a local
file, a file in HDFS (or other Hadoop-supported filesystems), or an
HTTP, HTTPS or FTP URI.

.. note:: A path can be added only once. Subsequent additions of the same path are ignored.
File:      /usr/local/spark/python/pyspark/context.py
Type:      method


In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType

# Column layout of the headerless CSV, in file order; every field is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("target", IntegerType()),
        ("id", LongType()),
        ("raw_timestamp", StringType()),
        ("query_status", StringType()),
        ("author", StringType()),
        ("tweet", StringType()),
    )
])

data_path = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

# Keep only the label and the tweet text. The label is recoded to 0/1:
# an original value of 4 becomes 1, anything else becomes 0.
raw_sentiment = (
    spark.read
    .csv(data_path, schema=schema, header=False)
    .selectExpr(
        "(case when target=4 then 1 else 0 end) as target",
        "tweet",
    )
)

# Show how many rows fall into each recoded class.
raw_sentiment.groupBy("target").count().show()

+------+------+
|target| count|
+------+------+
|     1|800000|
|     0|800000|
+------+------+



In [4]:
# Preview three rows of each class, truncating long cells to 100 characters.
raw_sentiment.filter("target = 0").show(n=3, truncate=100)
raw_sentiment.filter("target = 1").show(n=3, truncate=100)

+------+----------------------------------------------------------------------------------------------------+
|target|                                                                                               tweet|
+------+----------------------------------------------------------------------------------------------------+
|     0|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Thir...|
|     0|is upset that he can't update his Facebook by texting it... and might cry as a result  School tod...|
|     0|           @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds|
+------+----------------------------------------------------------------------------------------------------+
only showing top 3 rows

+------+----------------------------------------------------------------------------------------------------+
|target|                                                                                       

In [5]:
import pickle as pkl

def read_model(model_path):
    """Load and return the object pickled in the file at ``model_path``.

    Args:
        model_path: Path of a pickle file; it is opened in binary mode.

    Returns:
        Whatever object the pickle file contains.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        from a trusted source.
    """
    with open(model_path, 'rb') as model_file:
        loaded_object = pkl.load(model_file)
    return loaded_object

# Path of the pickled model file that is read below.
model_path = "/home/jovyan/models/tweet_sentiment.mdl"

# Load the model once; model_object is what later gets broadcast to Spark.
model_object = read_model(model_path)
# A bare last expression makes the notebook display the object's repr.
model_object

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function preprocessor at 0x7f656886dd90>,
                                 smooth_idf=True, stop_words=None,
                                 strip_accents=None,...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=8,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,

In [7]:
# Wrap the loaded model in a Spark broadcast variable; block_classify reads it back via .value.
model_object_broadcast = spark.sparkContext.broadcast(model_object)

def block_iterator(iterator, size):
    """Group the items of ``iterator`` into consecutive lists.

    Args:
        iterator: Any iterable; it is consumed lazily, one item at a time.
        size: Number of items at which a block is emitted.

    Yields:
        list: Blocks of items in their original order. A block is emitted as
        soon as it holds at least ``size`` items; any leftover items are
        emitted as a final, shorter block. Nothing is yielded for empty input.
    """
    chunk = []
    for item in iterator:
        chunk.append(item)
        if len(chunk) < size:
            # Block not full yet; keep collecting.
            continue
        yield chunk
        # Start a fresh list so the block just yielded is never mutated.
        chunk = []
    if len(chunk) > 0:
        yield chunk

def block_classify(iterator, block_size=10000):
    """Score the rows of one Spark partition with the broadcast model.

    Rows are pulled from ``iterator`` in blocks so that ``predict_proba`` is
    called once per block rather than once per row.

    Args:
        iterator: Iterator over the partition's rows. Each row must unpack
            into two columns, which are labelled ``target`` and ``text``.
        block_size: Number of rows scored per ``predict_proba`` call.
            Defaults to 10000, the value that used to be hard-coded.

    Yields:
        dict: One record per input row with the keys ``target``, ``text``,
        ``proba0`` and ``proba1``. ``proba0``/``proba1`` are columns 0 and 1
        of the model's ``predict_proba`` output (presumably the probabilities
        of class 0 and class 1 - confirm against the model's ``classes_``).
    """
    # Imported inside the function, as in the original, but once per call
    # instead of once per block.
    import json
    import pandas as pd

    model = model_object_broadcast.value
    for block in block_iterator(iterator, block_size):
        # `block` is already a list, so it is handed to pandas directly.
        block_df = pd.DataFrame(block, columns=["target", "text"])
        pred = model.predict_proba(block_df["text"])
        block_df["proba0"] = pred[:, 0]
        block_df["proba1"] = pred[:, 1]
        # The JSON round-trip turns the frame into a list of plain dicts.
        # NOTE(review): to_json serialises floats with its default precision,
        # which is presumably why the probabilities display with 10 decimals;
        # confirm before replacing this with to_dict().
        for record in json.loads(block_df.to_json(orient='records')):
            yield record

# Run block_classify over every partition and turn the yielded dict records
# back into a DataFrame.
predicted_df = (
    raw_sentiment.rdd
    .mapPartitions(block_classify)
    .toDF()
)

# Preview five scored rows per class, truncating long cells to 100 characters.
predicted_df.where("target=0").show(n=5, truncate=100)
predicted_df.where("target=1").show(n=5, truncate=100)



+------------+------------+------+----------------------------------------------------------------------------------------------------+
|      proba0|      proba1|target|                                                                                                text|
+------------+------------+------+----------------------------------------------------------------------------------------------------+
|0.5140581764|0.4859418236|     0|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Thir...|
|0.5157546478|0.4842453522|     0|is upset that he can't update his Facebook by texting it... and might cry as a result  School tod...|
|0.5040259259|0.4959740741|     0|           @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds|
|0.5096420157|0.4903579843|     0|                                                     my whole body feels itchy and like its on fire |
|0.5040259259|0.4959740741|     0|@nationwidecla

In [8]:
# Stop the Spark session created at the top of the notebook.
spark.stop()