# Question3 

## create/import data

In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
sentenceData = spark.createDataFrame([
    (1.0, "happy"),
    (1.0, "happy,happy"),
    (0.0, "happy,sad"),
    (0.0, "sad"),
    (0.0, "sad,happy"),
    (1.0, "happy,happy")
], ["label", "text"])

In [None]:

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
df = spark.read.json('/Users/gongqian/Desktop/spark_test/Q3/russian/train_small.json')
sentenceData = df.select("sentiment","text").toDF("label","text")


In [2]:
sentenceData.show(10,False)

+-----+-----------+
|label|text       |
+-----+-----------+
|1.0  |happy      |
|1.0  |happy,happy|
|0.0  |happy,sad  |
|0.0  |sad        |
|0.0  |sad,happy  |
|1.0  |happy,happy|
+-----+-----------+



In [None]:
dic={happy:0,sad:1}
1 0:1
1 0:2
0 0:1,1:1
0 1:1
0 1:1,0:1
1 0:2

## tokenize text 

In [3]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

In [4]:
wordsData.show(10,False)

+-----+-----------+-------------+
|label|text       |words        |
+-----+-----------+-------------+
|1.0  |happy      |[happy]      |
|1.0  |happy,happy|[happy,happy]|
|0.0  |happy,sad  |[happy,sad]  |
|0.0  |sad        |[sad]        |
|0.0  |sad,happy  |[sad,happy]  |
|1.0  |happy,happy|[happy,happy]|
+-----+-----------+-------------+



## TF-IDF 

In [5]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

In [6]:
featurizedData.show(10,False)

+-----+-----------+-------------+-------------+
|label|text       |words        |rawFeatures  |
+-----+-----------+-------------+-------------+
|1.0  |happy      |[happy]      |(2,[1],[1.0])|
|1.0  |happy,happy|[happy,happy]|(2,[1],[1.0])|
|0.0  |happy,sad  |[happy,sad]  |(2,[0],[1.0])|
|0.0  |sad        |[sad]        |(2,[0],[1.0])|
|0.0  |sad,happy  |[sad,happy]  |(2,[0],[1.0])|
|1.0  |happy,happy|[happy,happy]|(2,[1],[1.0])|
+-----+-----------+-------------+-------------+



In [7]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("label", "features").show(10,False)

+-----+----------------------------+
|label|features                    |
+-----+----------------------------+
|1.0  |(2,[1],[0.5596157879354227])|
|1.0  |(2,[1],[0.5596157879354227])|
|0.0  |(2,[0],[0.5596157879354227])|
|0.0  |(2,[0],[0.5596157879354227])|
|0.0  |(2,[0],[0.5596157879354227])|
|1.0  |(2,[1],[0.5596157879354227])|
+-----+----------------------------+



### split train/test data

In [8]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data


# Split the data into train and test
splits = rescaledData.select("label", "features").randomSplit([0.8, 0.2], 1234)
train = splits[1]
test = splits[0]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [9]:
train.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(2,[0],[0.5596157...|
|  1.0|(2,[1],[0.5596157...|
|  1.0|(2,[1],[0.5596157...|
|  0.0|(2,[0],[0.5596157...|
+-----+--------------------+



## build the model 

In [10]:
# train the model
model = nb.fit(train)

# select example rows to display.
predictions = model.transform(test)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
predictions.show(30,False)
print("Test set accuracy = " + str(accuracy))

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(2,[0],[0.5596157...|[-0.9094572187149...|[0.60355537694346...|       0.0|
|  1.0|(2,[1],[0.5596157...|[-1.3297586477855...|[0.39644462305653...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+

+-----+----------------------------+-----------------------------------------+----------------------------------------+----------+
|label|features                    |rawPrediction                            |probability                             |prediction|
+-----+----------------------------+-----------------------------------------+----------------------------------------+----------+
|0.0  |(2,[0],[0.5596157879354227])|[-0.9094572187149538,-1.3297586477855063]|[0.6035553769434613,0.39644462305653