In [1]:
from google.colab import files
uploaded = files.upload()

Saving IMDB Dataset.csv to IMDB Dataset.csv


In [2]:
import pandas as pd
df = pd.read_csv("IMDB Dataset.csv", encoding='utf-8', on_bad_lines='skip')
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
!pip install pyspark




In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("IMDB Big Data Analysis") \
    .getOrCreate()


In [5]:
df = spark.read.csv("IMDB Dataset.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)


root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
+--------------------+--------------------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import length

df = df.withColumn("review_length", length(df["review"]))
df.select("review", "sentiment", "review_length").show(5)


+--------------------+--------------------+-------------+
|              review|           sentiment|review_length|
+--------------------+--------------------+-------------+
|One of the other ...|            positive|         1761|
|"A wonderful litt...| not only is it w...|          433|
|"I thought this w...| but spirited you...|          724|
|Basically there's...|            negative|          748|
|"Petter Mattei's ...| power and succes...|          200|
+--------------------+--------------------+-------------+
only showing top 5 rows



In [7]:
df.groupBy("sentiment").count().show()


+--------------------+-----+
|           sentiment|count|
+--------------------+-----+
| ""Nightmare"" is...|    1|
| he really kills ...|    1|
| while others wil...|    1|
| ""La Noche del T...|    1|
|"" which apparent...|    1|
|"" and felt a lit...|    1|
|"" I think you wi...|    1|
| a Spanish motion...|    1|
| which has turned...|    1|
| I suggest giving...|    1|
| they come up wit...|    1|
| one of the few h...|    1|
| while a guy who ...|    1|
| this is another ...|    1|
| Andres is snoozi...|    1|
|     you have chases|    1|
| or other caper/h...|    1|
| then this film i...|    1|
| Israel is one of...|    1|
| a bunch of lonel...|    1|
+--------------------+-----+
only showing top 20 rows



In [8]:
df.groupBy("sentiment").avg("review_length").show()


+--------------------+------------------+
|           sentiment|avg(review_length)|
+--------------------+------------------+
| ""Nightmare"" is...|             100.0|
| he really kills ...|             380.0|
| while others wil...|              74.0|
| ""La Noche del T...|             407.0|
|"" which apparent...|             554.0|
|"" and felt a lit...|              80.0|
|"" I think you wi...|             296.0|
| a Spanish motion...|             406.0|
| which has turned...|              91.0|
| I suggest giving...|             514.0|
| they come up wit...|             481.0|
| one of the few h...|             419.0|
| while a guy who ...|            1005.0|
| this is another ...|             347.0|
| Andres is snoozi...|             497.0|
|     you have chases|             320.0|
| or other caper/h...|              51.0|
| then this film i...|            2072.0|
| Israel is one of...|             585.0|
| a bunch of lonel...|             348.0|
+--------------------+------------