In [3]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local[5]")
    .appName("Airport_Data_Project")
    .getOrCreate()
)

df = spark.read.csv("./googleplaystore.csv", header=True, inferSchema=True)

df.printSchema()
df.show(truncate=False)
df.rdd.repartition(4)
print(df.rdd.getNumPartitions())

24/02/17 14:39:40 WARN Utils: Your hostname, taushif-HP-Laptop-15s-fr2xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.37 instead (on interface wlo1)
24/02/17 14:39:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/17 14:39:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)

+--------------------------------------------------+--------------+------+-------+----+-----------+----+-----+--------------+-------------------------+------------------+------------------+------------+
|App                                               |Category      |Rating|Reviews|Size|Installs   |Type|Price|Content Rating|Genres                   |Last Updated      |Current Ver       |Android Ver |
+--------------------------------------------------+--------------+------

In [5]:
# Question 1 : Find the top 10 reviews given to the apps
from pyspark.sql import functions as func

Top_Reviews_Apps = (
    df.withColumn("Reviews", func.col("Reviews").cast("int"))
    .groupBy(func.col("App"))
    .agg(func.sum(func.col("Reviews")).alias("Total_Reviews"))
    .orderBy(func.desc(func.col("Total_Reviews")))
    .limit(10)
)

Top_Reviews_Apps.show(truncate=False)

[Stage 4:>                                                          (0 + 1) / 1]

+----------------------------------------+-------------+
|App                                     |Total_Reviews|
+----------------------------------------+-------------+
|Instagram                               |266241989    |
|WhatsApp Messenger                      |207348304    |
|Clash of Clans                          |179558781    |
|Messenger – Text and Video Chat for Free|169932272    |
|Subway Surfers                          |166331958    |
|Candy Crush Saga                        |156993136    |
|Facebook                                |156286514    |
|8 Ball Pool                             |99386198     |
|Clash Royale                            |92530298     |
|Snapchat                                |68045010     |
+----------------------------------------+-------------+



                                                                                

In [14]:
# Question 2 : Top 10 installed apps distribution of type (free or paid)
from pyspark.sql import functions as func

Top_Installed_App = (
    df.withColumn("Installs", func.regexp_replace(func.col("Installs"), "[^0-9]", ""))
    .withColumn("Installs", func.col("Installs").cast("int"))
    .groupBy(func.col("App"))
    .agg(func.sum(func.col("Installs")).alias("Total_No_Of_Installation"))
    .orderBy(func.desc(func.col("Total_No_Of_Installation")))
    .limit(10)
)

Top_Installed_App.show(truncate=False)

[Stage 10:>                                                         (0 + 1) / 1]

+----------------------------------------+------------------------+
|App                                     |Total_No_Of_Installation|
+----------------------------------------+------------------------+
|Subway Surfers                          |6000000000              |
|Instagram                               |4000000000              |
|Hangouts                                |4000000000              |
|Google Drive                            |4000000000              |
|Google News                             |4000000000              |
|Google Photos                           |4000000000              |
|Candy Crush Saga                        |3500000000              |
|WhatsApp Messenger                      |3000000000              |
|Messenger – Text and Video Chat for Free|3000000000              |
|Google Chrome: Fast & Secure            |3000000000              |
+----------------------------------------+------------------------+



                                                                                

In [15]:
# Question 3 : Category-wise distribution of installed Apps

All_Category_Of_Apps = df.select("Category").distinct()

All_Category_Of_Apps.show(truncate=False)

+-------------------+
|Category           |
+-------------------+
|EVENTS             |
|COMICS             |
|SPORTS             |
|WEATHER            |
|VIDEO_PLAYERS      |
|AUTO_AND_VEHICLES  |
|PARENTING          |
|ENTERTAINMENT      |
|PERSONALIZATION    |
|HEALTH_AND_FITNESS |
|TRAVEL_AND_LOCAL   |
|BOOKS_AND_REFERENCE|
|FOOD_AND_DRINK     |
|PHOTOGRAPHY        |
|BUSINESS           |
|FAMILY             |
|SHOPPING           |
|HOUSE_AND_HOME     |
|GAME               |
|Face               |
+-------------------+
only showing top 20 rows



In [23]:
# Question 4 : Top paid Apps

Top_Paid_Apps = (
    df.filter(func.col("Type") == "Paid")
    .withColumn("Installs", func.regexp_replace(func.col("Installs"), "[^0-9]", ""))
    .withColumn("Installs", func.col("Installs").cast("int"))
    .groupBy(func.col("App"), func.col("Type"))
    .agg(func.sum(func.col("Installs")).alias("Total_No_Of_Installation"))
    .orderBy(func.desc(func.col("Total_No_Of_Installation")))
    .limit(10)
)

Top_Paid_Apps.show(truncate=False)

+--------------------------+----+------------------------+
|App                       |Type|Total_No_Of_Installation|
+--------------------------+----+------------------------+
|Minecraft                 |Paid|20000000                |
|Hitman Sniper             |Paid|10000000                |
|Facetune - For Free       |Paid|3000000                 |
|Beautiful Widgets Pro     |Paid|2000000                 |
|HD Widgets                |Paid|2000000                 |
|True Skate                |Paid|1000000                 |
|Tasker                    |Paid|1000000                 |
|Card Wars - Adventure Time|Paid|1000000                 |
|Toca Life: City           |Paid|1000000                 |
|Bloons TD 5               |Paid|1000000                 |
+--------------------------+----+------------------------+



In [26]:
# Question 5 : Top paid rating apps

Top_Rated_Paid_Apps = (
    df.filter(func.col("Type") == "Paid")
    .withColumn("Rating", func.col("Rating").cast("int"))
    .groupBy(func.col("App"), func.col("Type"))
    .agg(func.avg(func.col("Rating")).alias("Avg_Rating"))
    .orderBy(func.desc("Avg_Rating"))
    .limit(10)
)

Top_Rated_Paid_Apps.show(truncate=False)

+----------------------------------------+----+----------+
|App                                     |Type|Avg_Rating|
+----------------------------------------+----+----------+
|P-Home for KLWP                         |Paid|5.0       |
|Chronolink DX                           |Paid|5.0       |
|USMLE Step 2 CK Flashcards              |Paid|5.0       |
|FHR 5-Tier 2.0                          |Paid|5.0       |
|AP Art History Flashcards               |Paid|5.0       |
|Mu.F.O.                                 |Paid|5.0       |
|FO Bixby                                |Paid|5.0       |
|Morse Player                            |Paid|5.0       |
|Super Hearing Secret Voices Recorder PRO|Paid|5.0       |
|Android P Style Icon Pack               |Paid|5.0       |
+----------------------------------------+----+----------+

