<a href="https://colab.research.google.com/github/vinods03/deep_learning/blob/main/NLP_15_Spotify_Reviews_using_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 35.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=0202f3e20803ddce6406ac3b0ad0c43b534c7a02746063cd9c1e88b847d90097
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Spotify Reviews').getOrCreate()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = spark.read.csv('/content/drive/MyDrive/spotify_reviews.csv', header = True)

In [6]:
df = df[['Review','Rating','Total_thumbsup']]

In [7]:
df.show()

+--------------------+------+--------------+
|              Review|Rating|Total_thumbsup|
+--------------------+------+--------------+
|Great music servi...|     5|             2|
|Please ignore pre...|     5|             1|
|"This pop-up ""Ge...|     4|             0|
|Really buggy and ...|     1|             1|
|Dear Spotify why ...|     1|             1|
|The player contro...|     3|             7|
|I love the select...|     5|             0|
|"Still extremely ...|     3|            16|
|It's a great app ...|     5|             0|
|I'm deleting this...|     1|           318|
|Love Spotify, and...|     2|             1|
|Can't play Spotif...|     1|             1|
|I had amazon prem...|     1|             7|
|Worst app always ...|     1|             1|
|i hav any music t...|     5|             0|
|Improve the IA to...|     5|             0|
|Android user - th...|     2|             0|
|I can't listen to...|     3|             0|
|It always crashin...|     1|             0|
|I know ad

In [8]:
df.describe()

DataFrame[summary: string, Review: string, Rating: string, Total_thumbsup: string]

In [9]:
# this is not required actually because either we are assigning different values to the rating and we are not using thumbsup / length

from pyspark.sql.types import IntegerType
df = df.withColumn('Rating_int',df['Rating'].cast(IntegerType()))
df = df.withColumn('Total_thumbsup_int',df['Total_thumbsup'].cast(IntegerType()))

from pyspark.sql.functions import length
df = df.withColumn('length', length(df['Review'])) 

In [10]:
df = df.drop('Rating','Total_thumbsup')

In [11]:
df.show()

+--------------------+----------+------------------+------+
|              Review|Rating_int|Total_thumbsup_int|length|
+--------------------+----------+------------------+------+
|Great music servi...|         5|                 2|   112|
|Please ignore pre...|         5|                 1|    86|
|"This pop-up ""Ge...|         4|                 0|   110|
|Really buggy and ...|         1|                 1|    47|
|Dear Spotify why ...|         1|                 1|   100|
|The player contro...|         3|                 7|   114|
|I love the select...|         5|                 0|    83|
|"Still extremely ...|         3|                16|   356|
|It's a great app ...|         5|                 0|   210|
|I'm deleting this...|         1|               318|   443|
|Love Spotify, and...|         2|                 1|   293|
|Can't play Spotif...|         1|                 1|    31|
|I had amazon prem...|         1|                 7|   430|
|Worst app always ...|         1|       

In [12]:
df.count()

61594

In [13]:
df = df.dropna()

In [14]:
df.count()

60716

In [15]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF, CountVectorizer, StringIndexer, VectorAssembler

In [16]:
tokenizer = Tokenizer(inputCol = 'Review', outputCol = 'words')

In [17]:
remover = StopWordsRemover(inputCol = 'words', outputCol = 'required_words')

In [18]:
vectorizer = CountVectorizer(inputCol = 'required_words', outputCol = 'c_vec')

In [19]:
idf = IDF(inputCol = 'c_vec', outputCol = 'tf_idf')

In [20]:
assembler = VectorAssembler(inputCols = ['tf_idf'], outputCol = 'features')

In [21]:
from pyspark.ml import Pipeline

In [22]:
pipeline = Pipeline(stages = [tokenizer, remover, vectorizer, idf, assembler])

In [23]:
fit_data_in_pipeline = pipeline.fit(df)

In [24]:
transformed_df = fit_data_in_pipeline.transform(df)

In [25]:
transformed_df.show()

+--------------------+----------+------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              Review|Rating_int|Total_thumbsup_int|length|               words|      required_words|               c_vec|              tf_idf|            features|
+--------------------+----------+------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Great music servi...|         5|                 2|   112|[great, music, se...|[great, music, se...|(55344,[0,1,13,31...|(55344,[0,1,13,31...|(55344,[0,1,13,31...|
|Please ignore pre...|         5|                 1|    86|[please, ignore, ...|[please, ignore, ...|(55344,[0,25,71,2...|(55344,[0,25,71,2...|(55344,[0,25,71,2...|
|"This pop-up ""Ge...|         4|                 0|   110|["this, pop-up, "...|["this, pop-up, "...|(55344,[3,14,21,2...|(55344,[3,14,21,2...|(55344,[3,14,21,2...|
|Really bu

In [26]:
transformed_df.select('features', 'Rating_int').show()

+--------------------+----------+
|            features|Rating_int|
+--------------------+----------+
|(55344,[0,1,13,31...|         5|
|(55344,[0,25,71,2...|         5|
|(55344,[3,14,21,2...|         4|
|(55344,[18,19,181...|         1|
|(55344,[2,3,14,70...|         1|
|(55344,[0,17,66,1...|         3|
|(55344,[4,8,27,10...|         5|
|(55344,[3,7,25,29...|         3|
|(55344,[0,1,2,5,1...|         5|
|(55344,[0,1,7,12,...|         1|
|(55344,[0,8,24,40...|         2|
|(55344,[3,5,315],...|         1|
|(55344,[1,3,5,6,7...|         1|
|(55344,[0,4,48,62...|         1|
|(55344,[1,7,6005,...|         5|
|(55344,[1,2,3,21,...|         5|
|(55344,[25,70,114...|         2|
|(55344,[6,16,74,3...|         3|
|(55344,[5,18,40,4...|         1|
|(55344,[7,11,12,1...|         1|
+--------------------+----------+
only showing top 20 rows



In [27]:
# o -> Negative, 1 -> Positive

from pyspark.sql.functions import when
transformed_df = transformed_df.withColumn('label', when (transformed_df['Rating_int'] == 1, 0)
                                                   .when (transformed_df['Rating_int'] == 2, 0)
                                                   .when (transformed_df['Rating_int'] == 3, 0)
                                                   .when (transformed_df['Rating_int'] == 4, 1)
                                                   .when (transformed_df['Rating_int'] == 5, 1))

In [28]:
transformed_df = transformed_df.select('features', 'label')
transformed_df.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(55344,[0,1,13,31...|    1|
|(55344,[0,25,71,2...|    1|
|(55344,[3,14,21,2...|    1|
|(55344,[18,19,181...|    0|
|(55344,[2,3,14,70...|    0|
|(55344,[0,17,66,1...|    0|
|(55344,[4,8,27,10...|    1|
|(55344,[3,7,25,29...|    0|
|(55344,[0,1,2,5,1...|    1|
|(55344,[0,1,7,12,...|    0|
|(55344,[0,8,24,40...|    0|
|(55344,[3,5,315],...|    0|
|(55344,[1,3,5,6,7...|    0|
|(55344,[0,4,48,62...|    0|
|(55344,[1,7,6005,...|    1|
|(55344,[1,2,3,21,...|    1|
|(55344,[25,70,114...|    0|
|(55344,[6,16,74,3...|    0|
|(55344,[5,18,40,4...|    0|
|(55344,[7,11,12,1...|    0|
+--------------------+-----+
only showing top 20 rows



In [29]:
training, test = transformed_df.randomSplit([0.8, 0.2])

In [30]:
from pyspark.ml.classification import LinearSVC

In [31]:
svc = LinearSVC()

In [32]:
review_detector = svc.fit(training)

In [33]:
predictions = review_detector.transform(test)

In [34]:
predictions.show()

+--------------------+-----+--------------------+----------+
|            features|label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|(55344,[0,1,2,3,4...|    0|[7.07572105492031...|       0.0|
|(55344,[0,1,2,3,4...|    0|[-2.9647129101412...|       1.0|
|(55344,[0,1,2,3,4...|    0|[8.95682025405653...|       0.0|
|(55344,[0,1,2,3,4...|    1|[-0.6787625863107...|       1.0|
|(55344,[0,1,2,3,4...|    0|[8.36492165405092...|       0.0|
|(55344,[0,1,2,3,4...|    0|[3.18988771869465...|       0.0|
|(55344,[0,1,2,3,4...|    0|[6.09874581402988...|       0.0|
|(55344,[0,1,2,3,4...|    0|[0.25351336713560...|       0.0|
|(55344,[0,1,2,3,4...|    0|[-7.2148914258111...|       1.0|
|(55344,[0,1,2,3,4...|    1|[-4.9444293063117...|       1.0|
|(55344,[0,1,2,3,4...|    1|[8.80412592720226...|       0.0|
|(55344,[0,1,2,3,4...|    0|[2.39605643437370...|       0.0|
|(55344,[0,1,2,3,4...|    1|[-3.8354304782033...|       1.0|
|(55344,[0,1,2,3,4...|  

In [35]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [36]:
eval = MulticlassClassificationEvaluator()

In [37]:
accuracy = eval.evaluate(predictions)

In [38]:
print(accuracy)

0.823067763055908
