In [2]:
import findspark
findspark.init()

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [4]:
spark = SparkSession.builder.appName("Practice Session").getOrCreate()

## Part 1: Recommender System

In [5]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master\Spark_for_Machine_Learning\Recommender_Systems\movielens_ratings.csv', inferSchema=True, header=True)

In [9]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
train_data, test_data = data.randomSplit([.8, .2])

In [15]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

In [16]:
als_model= als.fit(train_data)

In [17]:
predictions = als_model.transform(test_data)

In [18]:
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|      4|   4.0|    26| -0.3157192|
|      6|   1.0|    12|  1.3937502|
|      6|   2.0|    22|-0.11653066|
|      7|   1.0|    16|  1.6723509|
|      0|   1.0|    20|   1.048294|
|      2|   2.0|    20|  0.9533929|
|      4|   2.0|    20|  -1.589823|
|      1|   1.0|     5|  1.9950407|
|      5|   1.0|     5|   1.583868|
|      4|   1.0|    19|  1.7523725|
|      7|   1.0|    15|-0.32536155|
|      2|   3.0|     9|  2.9005303|
|      3|   1.0|     9|  0.5388041|
|      7|   1.0|     8|  1.3429217|
|      4|   1.0|    23|  1.2591652|
|      3|   1.0|     7|  2.0860949|
|      0|   3.0|    10|  0.8959279|
|      7|   1.0|    10|   2.914962|
|      7|   1.0|    24| -1.9151883|
|      4|   1.0|    14| 0.77249444|
+-------+------+------+-----------+
only showing top 20 rows



In [19]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [23]:
rmse = evaluator.evaluate(predictions)
print(rmse)

1.8479814686253768


In [24]:
single_user = test_data.filter(test_data['userId']==1).select(['movieId', 'userId'])

In [26]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     12|     1|
|     16|     1|
|     27|     1|
|     37|     1|
|     40|     1|
|     41|     1|
|     57|     1|
|     62|     1|
|     74|     1|
|     76|     1|
|     77|     1|
|     78|     1|
|     82|     1|
|     91|     1|
|     92|     1|
|     97|     1|
+-------+------+



In [27]:
recommendations = als_model.transform(single_user)

In [28]:
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+-----------+
|movieId|userId| prediction|
+-------+------+-----------+
|     92|     1|   4.253003|
|     74|     1|  3.7711267|
|     37|     1|  1.6259863|
|     62|     1|  1.5885861|
|     91|     1|  1.4808776|
|     40|     1|  1.0856148|
|     12|     1|  1.0779605|
|     16|     1|  0.9525912|
|     78|     1|  0.9385193|
|     77|     1| 0.92725897|
|     57|     1|  0.7126682|
|     82|     1|  0.6401354|
|     76|     1| 0.32178092|
|     41|     1|0.047453385|
|     27|     1|-0.14780186|
|     97|     1| -0.3149649|
+-------+------+-----------+



## Part 2: Natural Language Processing

In [30]:
data = spark.read.csv('Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Natural_Language_Processing/smsspamcollection/SMSSpamCollection', inferSchema=True, sep='\t')

In [34]:
data = data.withColumnRenamed('_c0', 'class')
data = data.withColumnRenamed('_c1', 'text')

In [35]:
data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [36]:
from pyspark.sql.functions import length

In [37]:
data = data.withColumn('length', length(data['text']))

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [40]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [45]:
token = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')

In [47]:
from pyspark.ml.feature import VectorAssembler

In [50]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

In [51]:
from pyspark.ml.classification import NaiveBayes

In [52]:
nb = NaiveBayes()

In [53]:
from pyspark.ml import Pipeline

In [54]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_numeric, token, stop_remove, count_vec, idf, clean_up])

In [55]:
cleaner = data_prep_pipe.fit(data)

In [56]:
clean_data = cleaner.transform(data)

In [60]:
clean_data = clean_data.select(['label', 'features'])

In [62]:
train_data, test_data = clean_data.randomSplit([.7,.3])

In [63]:
spam_detector = nb.fit(train_data)

In [70]:
test_results = spam_detector.transform(test_data)

In [65]:
test_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,7,8...|[-794.99329284923...|[1.0,5.0601312561...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1157.9901064674...|[1.0,1.2313078567...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-659.65634317950...|[1.0,2.2260287482...|       0.0|
|  0.0|(13424,[0,1,12,33...|[-450.32106693743...|[1.0,2.3326598184...|       0.0|
|  0.0|(13424,[0,1,14,18...|[-1363.7670320217...|[1.0,8.4731730267...|       0.0|
|  0.0|(13424,[0,1,14,31...|[-216.41799955703...|[1.0,5.3591285686...|       0.0|
|  0.0|(13424,[0,1,14,79...|[-707.02978757946...|[1.0,2.8351413823...|       0.0|
|  0.0|(13424,[0,1,15,20...|[-671.96231242256...|[1.0,3.5681269702...|       0.0|
|  0.0|(13424,[0,1,21,27...|[-757.66686386345...|[1.0,3.3317183043...|       0.0|
|  0.0|(13424,[0

In [73]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [74]:
acc_eval = MulticlassClassificationEvaluator()

In [75]:
acc = acc_eval.evaluate(test_results)

0.9182349264211297

## Part 3: Streaming Data with PySpark