In [2]:
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('9_NPL2_Text_Classification').getOrCreate()

file_location = "9_financial_news.csv"
file_type = "csv"
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","
text_df = spark.read.format(file_type) \
 .option("inferSchema", infer_schema) \
 .option("header", first_row_is_header) \
 .option("sep", delimiter) \
 .load(file_location)
text_df.show(5,False)


                                                                                

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|News                                                                                                                                                                                                                                               |Sentiment|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|- BEIJING XFN-ASIA - Hong Kong-listed Standard Chartered Bank said it has signed a China mobile phone dealer financing agreement with Nokia , making it the first foreign bank to offer financing to the country 's small and medium en

In [2]:
text_df.count()

                                                                                

962

In [3]:
text_df=text_df.filter(((text_df.Sentiment =='0') | (text_df.Sentiment =='1') ) )
text_df.groupBy('Sentiment').count().show()

[Stage 5:>                                                          (0 + 1) / 1]

+---------+-----+
|Sentiment|count|
+---------+-----+
|        0|  177|
|        1|  785|
+---------+-----+



                                                                                

In [4]:
text_df=text_df.withColumn("Label",text_df.Sentiment.cast('float')).drop('Sentiment')

In [5]:
from pyspark.sql.functions import rand
text_df.orderBy(rand()).show(10,False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|News                                                                                                                                                                                                                                                                         |Label|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|Revenue grew 12 percent to (  x20ac ) 3.6 billion ( US$ 4.5 billion ) .                                                                                              

In [6]:
from pyspark.sql.functions import length
text_df=text_df.withColumn('length',length(text_df['News']))
text_df.orderBy(rand()).show(10)

+--------------------+-----+------+
|                News|Label|length|
+--------------------+-----+------+
|21 October 2010 -...|  1.0|   212|
|With this subscri...|  0.0|   120|
|When the OMX mark...|  0.0|   194|
|A maximum of 666,...|  0.0|   120|
|In Q1 of 2010 , B...|  1.0|    95|
|Earnings per shar...|  1.0|    64|
|With this appoint...|  1.0|   156|
|Kalnapilio-Tauro ...|  1.0|   242|
|Finland 's Poyry ...|  1.0|   185|
|- UPM-Kymmene upg...|  1.0|    78|
+--------------------+-----+------+
only showing top 10 rows



In [7]:
text_df.groupBy('Label').agg({'Length':'mean'}).show()

+-----+------------------+
|Label|       avg(Length)|
+-----+------------------+
|  1.0|134.15031847133758|
|  0.0| 142.5084745762712|
+-----+------------------+



In [8]:
# Start the tokenization process
from pyspark.ml.feature import Tokenizer
tokenization=Tokenizer(inputCol='News',outputCol='tokens')
tokenized_df=tokenization.transform(text_df)
tokenized_df.show()

+--------------------+-----+------+--------------------+
|                News|Label|length|              tokens|
+--------------------+-----+------+--------------------+
|- BEIJING XFN-ASI...|  1.0|   243|[-, beijing, xfn-...|
|- Operating profi...|  1.0|    65|[-, operating, pr...|
|- Provides summar...|  0.0|    94|[-, provides, sum...|
|- So , the sales ...|  0.0|    79|[-, so, ,, the, s...|
|- UPM-Kymmene upg...|  1.0|    78|[-, upm-kymmene, ...|
|( ADP News ) - De...|  1.0|   231|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   209|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   187|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   200|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   194|[(, adp, news, ),...|
|( ADP News ) - No...|  1.0|   231|[(, adp, news, ),...|
|( ADP News ) - Oc...|  1.0|   231|[(, adp, news, ),...|
|( ADP News ) - Oc...|  1.0|   231|[(, adp, news, ),...|
|( ADP News ) - Oc...|  1.0|   231|[(, adp, news, ),...|
|( ADP News ) - Se...|  1.0|   

In [9]:
# remove stopwords
from pyspark.ml.feature import StopWordsRemover
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')
refined_text_df=stopword_removal.transform(tokenized_df)
refined_text_df.show(10)

+--------------------+-----+------+--------------------+--------------------+
|                News|Label|length|              tokens|      refined_tokens|
+--------------------+-----+------+--------------------+--------------------+
|- BEIJING XFN-ASI...|  1.0|   243|[-, beijing, xfn-...|[-, beijing, xfn-...|
|- Operating profi...|  1.0|    65|[-, operating, pr...|[-, operating, pr...|
|- Provides summar...|  0.0|    94|[-, provides, sum...|[-, provides, sum...|
|- So , the sales ...|  0.0|    79|[-, so, ,, the, s...|[-, ,, sales, gro...|
|- UPM-Kymmene upg...|  1.0|    78|[-, upm-kymmene, ...|[-, upm-kymmene, ...|
|( ADP News ) - De...|  1.0|   231|[(, adp, news, ),...|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   209|[(, adp, news, ),...|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   187|[(, adp, news, ),...|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   200|[(, adp, news, ),...|[(, adp, news, ),...|
|( ADP News ) - Fe...|  1.0|   194|[(, adp, news, ),...|[(, adp,

In [10]:
# Create another column (token count) that gives the number of tokens in each row
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *
len_udf = udf(lambda s: len(s), IntegerType())
refined_text_df = refined_text_df.withColumn("token_count",len_udf(col('refined_tokens')))
refined_text_df.orderBy(rand()).show(10)

[Stage 15:>                                                         (0 + 1) / 1]

+--------------------+-----+------+--------------------+--------------------+-----------+
|                News|Label|length|              tokens|      refined_tokens|token_count|
+--------------------+-----+------+--------------------+--------------------+-----------+
|Lifetree was foun...|  1.0|   107|[lifetree, was, f...|[lifetree, founde...|         13|
|The mill has long...|  1.0|    79|[the, mill, has, ...|[mill, long, trad...|          8|
|TELE2 Affarsvarld...|  1.0|   145|[tele2, affarsvar...|[tele2, affarsvar...|         24|
|Vaisala also said...|  1.0|   121|[vaisala, also, s...|[vaisala, also, s...|         18|
|In its financial ...|  1.0|   197|[in, its, financi...|[financial, repor...|         29|
|Raute Corporation...|  1.0|   100|[raute, corporati...|[raute, corporati...|         14|
|Finnish sports eq...|  1.0|   232|[finnish, sports,...|[finnish, sports,...|         36|
|The value of the ...|  1.0|    68|[the, value, of, ...|[value, firm, 's,...|         10|
|We hope t

                                                                                

In [11]:
# Convert text into numerical features:
# Use CountVectorizer for feature vectorization for the ML 
from pyspark.ml.feature import CountVectorizer
count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')
cv_text_df=count_vec.fit(refined_text_df).transform(refined_text_df)
cv_text_df.select(['refined_tokens','token_count','features','Label']).show(10)

[Stage 20:>                                                         (0 + 1) / 1]

+--------------------+-----------+--------------------+-----+
|      refined_tokens|token_count|            features|Label|
+--------------------+-----------+--------------------+-----+
|[-, beijing, xfn-...|         31|(4035,[1,3,14,27,...|  1.0|
|[-, operating, pr...|         13|(4035,[0,2,7,8,9,...|  1.0|
|[-, provides, sum...|         10|(4035,[0,6,27,70,...|  0.0|
|[-, ,, sales, gro...|         11|(4035,[0,1,5,27,3...|  0.0|
|[-, upm-kymmene, ...|         12|(4035,[0,27,60,15...|  1.0|
|[(, adp, news, ),...|         39|(4035,[1,2,6,8,9,...|  1.0|
|[(, adp, news, ),...|         41|(4035,[0,1,2,8,9,...|  1.0|
|[(, adp, news, ),...|         38|(4035,[0,1,2,8,9,...|  1.0|
|[(, adp, news, ),...|         41|(4035,[0,1,2,6,8,...|  1.0|
|[(, adp, news, ),...|         38|(4035,[0,1,2,6,8,...|  1.0|
+--------------------+-----------+--------------------+-----+
only showing top 10 rows



                                                                                

In [12]:
model_text_df=cv_text_df.select(['features','token_count','Label'])
model_text_df.show(10,False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----+
|features                                                                                                                                                                                                                                                                  |token_count|Label|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-----+
|(4035,[1,3,14,27,34,43,44,68,71,97,119,152,429,451,533,828,830,969,1167,1187,1270,1297,1401,2132,2199,2230,2857,3960],[1.0,1.0,1.0,2.0,1.0

[Stage 21:>                                                         (0 + 1) / 1]                                                                                

In [13]:
# Use VectorAssembler to create input features for the Machine Learning model:
from pyspark.ml.feature import VectorAssembler
df_assembler = VectorAssembler(inputCols=['features','token_count'],outputCol='features_vec')
model_text_df = df_assembler.transform(model_text_df)
model_text_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- token_count: integer (nullable = true)
 |-- Label: float (nullable = true)
 |-- features_vec: vector (nullable = true)



In [14]:
# proceed with training a logistic regression model:
from pyspark.ml.classification import LogisticRegression
training_df,test_df=model_text_df.randomSplit([0.75,0.25])
training_df.show(10)

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+-----------+-----+--------------------+
|            features|token_count|Label|        features_vec|
+--------------------+-----------+-----+--------------------+
|(4035,[0,1,2,3,4,...|         20|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         20|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         17|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         19|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         18|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         21|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         18|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,4,...|         17|  1.0|(4036,[0,1,2,3,4,...|
|(4035,[0,1,2,3,5,...|         16|  1.0|(4036,[0,1,2,3,5,...|
|(4035,[0,1,2,3,5,...|         16|  1.0|(4036,[0,1,2,3,5,...|
+--------------------+-----------+-----+--------------------+
only showing top 10 rows



                                                                                

In [15]:
training_df.groupBy('Label').count().show()

[Stage 23:>                                                         (0 + 1) / 1]

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  579|
|  0.0|  139|
+-----+-----+



                                                                                

In [16]:
test_df.groupBy('Label').count().show()

[Stage 26:>                                                         (0 + 1) / 1]

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  206|
|  0.0|   38|
+-----+-----+



                                                                                

In [19]:
LR_Model=LogisticRegression(featuresCol='features_vec',labelCol='Label').fit(training_df)
predictions=LR_Model.evaluate(test_df).predictions
predictions.show(10)

                                                                                

22/10/28 16:48:15 ERROR LBFGS: Failure! Resetting history: breeze.optimize.StepSizeUnderflow: 


[Stage 193:>                                                        (0 + 1) / 1]

+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|            features|token_count|Label|        features_vec|       rawPrediction|         probability|prediction|
+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|(4035,[0,1,2,3,4,...|         24|  1.0|(4036,[0,1,2,3,4,...|[-24.078963252882...|[3.48850322902546...|       1.0|
|(4035,[0,1,2,3,4,...|         15|  1.0|(4036,[0,1,2,3,4,...|[-13.709397126724...|[1.11194676256611...|       1.0|
|(4035,[0,1,2,3,6,...|         24|  0.0|(4036,[0,1,2,3,6,...|[5.91092800688817...|[0.99729765134933...|       0.0|
|(4035,[0,1,2,3,10...|         17|  1.0|(4036,[0,1,2,3,10...|[-19.068883455673...|[5.22984884454914...|       1.0|
|(4035,[0,1,2,3,17...|         34|  1.0|(4036,[0,1,2,3,17...|[-12.518715830999...|[3.65754102094011...|       1.0|
|(4035,[0,1,2,3,26...|         16|  0.0|(4036,[0,1,2,3,26...|[22.6285715263782..

                                                                                

In [26]:
# Evaluate the performance of the logistic regression model using accuracy metrics on test data.
predictions.select("features", "label", "probability", "prediction").show(5)

[Stage 195:>                                                        (0 + 1) / 1]

+--------------------+-----+--------------------+----------+
|            features|label|         probability|prediction|
+--------------------+-----+--------------------+----------+
|(4035,[0,1,2,3,4,...|  1.0|[3.48850322902546...|       1.0|
|(4035,[0,1,2,3,4,...|  1.0|[1.11194676256611...|       1.0|
|(4035,[0,1,2,3,6,...|  0.0|[0.99729765134933...|       0.0|
|(4035,[0,1,2,3,10...|  1.0|[5.22984884454914...|       1.0|
|(4035,[0,1,2,3,17...|  1.0|[3.65754102094011...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows



                                                                                

In [49]:
# Compute the number of class in the label and the prediction:
M = predictions.select("label", "prediction")          
M.groupby('label').agg({'label': 'count'}).show()  
M.groupby('prediction').agg({'prediction': 'count'}).show()

                                                                                

+-----+------------+
|label|count(label)|
+-----+------------+
|  1.0|         206|
|  0.0|          38|
+-----+------------+

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|               44|
|       1.0|              200|
+----------+-----------------+



In [50]:
predictions.groupBy('label', 'prediction').count().show()

[Stage 265:>                                                        (0 + 1) / 1]

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       0.0|   22|
|  0.0|       0.0|   22|
|  1.0|       1.0|  184|
|  0.0|       1.0|   16|
+-----+----------+-----+



                                                                                

In [85]:
Accuracy = M.filter(M.label == M.prediction).count()/M.count()
print("Accuracy:",Accuracy)

Accuracy: 0.8442622950819673


[Stage 494:>                                                        (0 + 1) / 1]                                                                                

In [90]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabel = predictions.select("prediction", "Label").rdd

# Instantiate metrics object 
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary= BinaryClassificationMetrics(predictionAndLabel)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")

# Overall statistics 
precision = metricsMulti.precision(label=1) 
recall = metricsMulti.recall(label=1) 

print("---Evaluation Metrics----")
print("Accuracy:",format(evaluator.evaluate(predictions)))
print("Precision:", precision) 
print("Recall:", recall)
      
# Area under precision-recall curve 
print("Area under PR:",metricsBinary.areaUnderPR) 
# Area under ROC curve 
print("Area under ROC:",metricsBinary.areaUnderROC)

---Evaluation Metrics----
Accuracy: 0.8442622950819673
Precision: 0.92
Recall: 0.8932038834951457
Area under PR: 0.9159557536208818
Area under ROC: 0.7360756259580992


In [92]:
from pyspark.ml.classification import RandomForestClassifier
RF_Model=RandomForestClassifier(featuresCol='features_vec',labelCol='Label').fit(training_df)
predictions=RF_Model.evaluate(test_df).predictions
predictions.show(10)

[Stage 556:>                                                        (0 + 1) / 1]

+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|            features|token_count|Label|        features_vec|       rawPrediction|         probability|prediction|
+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|(4035,[0,1,2,3,4,...|         24|  1.0|(4036,[0,1,2,3,4,...|[2.91698764482767...|[0.14584938224138...|       1.0|
|(4035,[0,1,2,3,4,...|         15|  1.0|(4036,[0,1,2,3,4,...|[3.11237226021228...|[0.15561861301061...|       1.0|
|(4035,[0,1,2,3,6,...|         24|  0.0|(4036,[0,1,2,3,6,...|[3.51774214969079...|[0.17588710748453...|       1.0|
|(4035,[0,1,2,3,10...|         17|  1.0|(4036,[0,1,2,3,10...|[3.23706206129527...|[0.16185310306476...|       1.0|
|(4035,[0,1,2,3,17...|         34|  1.0|(4036,[0,1,2,3,17...|[4.07771621495986...|[0.20388581074799...|       1.0|
|(4035,[0,1,2,3,26...|         16|  0.0|(4036,[0,1,2,3,26...|[5.44029228606349..

                                                                                

In [93]:
# Evaluate the performance of the Random Forest model using accuracy metrics on test data.
predictions.select("features", "label", "probability", "prediction").show(5)

+--------------------+-----+--------------------+----------+
|            features|label|         probability|prediction|
+--------------------+-----+--------------------+----------+
|(4035,[0,1,2,3,4,...|  1.0|[0.14584938224138...|       1.0|
|(4035,[0,1,2,3,4,...|  1.0|[0.15561861301061...|       1.0|
|(4035,[0,1,2,3,6,...|  0.0|[0.17588710748453...|       1.0|
|(4035,[0,1,2,3,10...|  1.0|[0.16185310306476...|       1.0|
|(4035,[0,1,2,3,17...|  1.0|[0.20388581074799...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows



In [94]:
# Compute the number of class in the label and the prediction:
M = predictions.select("label", "prediction")          
M.groupby('label').agg({'label': 'count'}).show()  
M.groupby('prediction').agg({'prediction': 'count'}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|  1.0|         206|
|  0.0|          38|
+-----+------------+

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       1.0|              244|
+----------+-----------------+



In [95]:
predictions.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  206|
|  0.0|       1.0|   38|
+-----+----------+-----+



In [96]:
Accuracy = M.filter(M.label == M.prediction).count()/M.count()
print("Accuracy:",Accuracy)

[Stage 567:>                                                        (0 + 1) / 1]                                                                                

Accuracy: 0.8442622950819673


In [97]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabel = predictions.select("prediction", "Label").rdd

# Instantiate metrics object 
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary= BinaryClassificationMetrics(predictionAndLabel)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")

# Overall statistics 
precision = metricsMulti.precision(label=1) 
recall = metricsMulti.recall(label=1) 

print("---Evaluation Metrics----")
print("Accuracy:",format(evaluator.evaluate(predictions)))
print("Precision:", precision) 
print("Recall:", recall)
      
# Area under precision-recall curve 
print("Area under PR:",metricsBinary.areaUnderPR) 
# Area under ROC curve 
print("Area under ROC:",metricsBinary.areaUnderROC)

[Stage 575:>                                                        (0 + 1) / 1]                                                                                

---Evaluation Metrics----
Accuracy: 0.8442622950819673
Precision: 0.8442622950819673
Recall: 1.0
Area under PR: 0.8442622950819673
Area under ROC: 0.5


In [103]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.classification import GBTClassifier
GB_Model=GBTClassifier(featuresCol='features_vec',labelCol='Label',maxIter=10).fit(training_df)
predictions = GB_Model.transform(test_df)
predictions.show(10)

                                                                                

+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|            features|token_count|Label|        features_vec|       rawPrediction|         probability|prediction|
+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|(4035,[0,1,2,3,4,...|         24|  1.0|(4036,[0,1,2,3,4,...|[-1.2498224622239...|[0.07588307588621...|       1.0|
|(4035,[0,1,2,3,4,...|         15|  1.0|(4036,[0,1,2,3,4,...|[-1.2498224622239...|[0.07588307588621...|       1.0|
|(4035,[0,1,2,3,6,...|         24|  0.0|(4036,[0,1,2,3,6,...|[-0.7753704067183...|[0.17497929744182...|       1.0|
|(4035,[0,1,2,3,10...|         17|  1.0|(4036,[0,1,2,3,10...|[-1.0012518184503...|[0.11894030671609...|       1.0|
|(4035,[0,1,2,3,17...|         34|  1.0|(4036,[0,1,2,3,17...|[-0.7753704067183...|[0.17497929744182...|       1.0|
|(4035,[0,1,2,3,26...|         16|  0.0|(4036,[0,1,2,3,26...|[1.06682040756522..

In [104]:
# Evaluate the performance of the Gradient-boosted tree model using accuracy metrics on test data.
predictions.select("features", "label", "probability", "prediction").show(5)

+--------------------+-----+--------------------+----------+
|            features|label|         probability|prediction|
+--------------------+-----+--------------------+----------+
|(4035,[0,1,2,3,4,...|  1.0|[0.07588307588621...|       1.0|
|(4035,[0,1,2,3,4,...|  1.0|[0.07588307588621...|       1.0|
|(4035,[0,1,2,3,6,...|  0.0|[0.17497929744182...|       1.0|
|(4035,[0,1,2,3,10...|  1.0|[0.11894030671609...|       1.0|
|(4035,[0,1,2,3,17...|  1.0|[0.17497929744182...|       1.0|
+--------------------+-----+--------------------+----------+
only showing top 5 rows



[Stage 1315:>                                                       (0 + 1) / 1]                                                                                

In [105]:
# Compute the number of class in the label and the prediction:
M = predictions.select("label", "prediction")          
M.groupby('label').agg({'label': 'count'}).show()  
M.groupby('prediction').agg({'prediction': 'count'}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|  1.0|         206|
|  0.0|          38|
+-----+------------+

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|               14|
|       1.0|              230|
+----------+-----------------+



In [106]:
predictions.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       0.0|    5|
|  0.0|       0.0|    9|
|  1.0|       1.0|  201|
|  0.0|       1.0|   29|
+-----+----------+-----+



In [107]:
Accuracy = M.filter(M.label == M.prediction).count()/M.count()
print("Accuracy:",Accuracy)

Accuracy: 0.860655737704918


In [108]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabel = predictions.select("prediction", "Label").rdd

# Instantiate metrics object 
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary= BinaryClassificationMetrics(predictionAndLabel)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Label", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")

# Overall statistics 
precision = metricsMulti.precision(label=1) 
recall = metricsMulti.recall(label=1) 

print("---Evaluation Metrics----")
print("Accuracy:",format(evaluator.evaluate(predictions)))
print("Precision:", precision) 
print("Recall:", recall)
      
# Area under precision-recall curve 
print("Area under PR:",metricsBinary.areaUnderPR) 
# Area under ROC curve 
print("Area under ROC:",metricsBinary.areaUnderROC)

                                                                                

---Evaluation Metrics----


                                                                                

Accuracy: 0.860655737704918
Precision: 0.8739130434782608
Recall: 0.9757281553398058
Area under PR: 0.8735532042986942
Area under ROC: 0.6062851303014819
22/10/29 06:58:43 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 46139359 ms exceeds timeout 120000 ms
22/10/29 06:58:43 WARN SparkContext: Killing executors is not supported by current scheduler.
