In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('nlp').getOrCreate()

In [2]:
pwd

'/home/swap9047/Downloads/spark-and-python-for-big-data-with-pyspark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Natural_Language_Processing'

In [3]:
data =spark.read.csv('smsspamcollection/SMSSpamCollection',inferSchema=True,sep='\t')

In [4]:
data.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [5]:
data=data.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')
data.columns

['class', 'text']

In [6]:
from pyspark.sql.functions import length
data=data.withColumn('length',length(data['text']))

In [7]:
data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [8]:
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [9]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer

In [11]:
tokenizer=Tokenizer(inputCol='text',outputCol='token_text')
stop_remove=StopWordsRemover(inputCol='token_text',outputCol='stop_text')
count_vec=CountVectorizer(inputCol='stop_text',outputCol='c_vec')
idf=IDF(inputCol='c_vec',outputCol='tf_idf')
ham_spam_to_numeric=StringIndexer(inputCol='class',outputCol='label')

In [12]:
from pyspark.ml.feature import VectorAssembler
clean_up=VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

In [13]:
from pyspark.ml.classification import NaiveBayes
nb=NaiveBayes()

In [15]:
from pyspark.ml import Pipeline

In [17]:
data_prep_pipeline=Pipeline(stages=[ham_spam_to_numeric,tokenizer,stop_remove,count_vec,idf,clean_up])

In [19]:
clean_fit=data_prep_pipeline.fit(data)
clean_data=clean_fit.transform(data)

In [21]:
clean_data=clean_data.select('label','features')

In [22]:
train_data,test_data=clean_data.randomSplit([0.7,0.3])

In [23]:
spam_detector=nb.fit(train_data)

In [34]:
test_result=spam_detector.transform(test_data)

AttributeError: 'NaiveBayesModel' object has no attribute 'evaluate'

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluation=MulticlassClassificationEvaluator()

In [27]:
acc_eval=evaluation.evaluate(test_result)

In [28]:
print(acc_eval)

0.9258736120751352


In [31]:
acc_eval.real

0.9258736120751352

In [30]:
test_ev=spam_detector.evaluate(test_data)

AttributeError: 'NaiveBayesModel' object has no attribute 'evaluate'

In [32]:
from pyspark.ml.classification import LogisticRegression
lr=LogisticRegression()
lr_model=lr.fit(train_data)
resu=lr_model.evaluate(test_data)

0.9489552028962589