# Imports

In [1]:
import sys
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

In [2]:
import os
print(os.environ['SPARK_CLASSPATH'])

/Applications/google-cloud-sdk/gcs-connector-hadoop2-1.9.17-javadoc.jar


In [3]:
app_name="Case Study 1"

conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf = conf)
spark = SparkSession(sc)

In [4]:
sc.applicationId

'local-1571526239445'

In [5]:
sc.setLogLevel("ERROR")
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("pyspark script logger initialized")

# Load csv into spark

In [10]:
def get_hdfs_filepath(file_name, on_cloud=True):
    # path to folder containing this code
    prefix = '/data/spark/5_cs1_dataset/'
    if on_cloud:
        bucket  = os.environ['BUCKET']
        file_path = bucket + prefix + file_name
    else:
        file_path = '/Users/val' + prefix + file_name

    return file_path

In [11]:
APP_STORE = get_hdfs_filepath('AppleStore.csv')
DESCRIPTIONS = get_hdfs_filepath('appleStore_description.csv')

In [12]:
!file 'data/appleStore_description.csv'

data/appleStore_description.csv: cannot open `data/appleStore_description.csv' (No such file or directory)


In [13]:
df_store = spark.read.csv(APP_STORE, multiLine=True, header="true",encoding='utf-8', escape= "\"",inferSchema =True)
df_desc = spark.read.csv(DESCRIPTIONS, multiLine=True, header="true",encoding='utf-8', escape= "\"",inferSchema =True)

df_store.count(), df_desc.count()

(7197, 7197)

# Parse the data as csv

In [14]:
df_store = df_store.drop('_c0')

In [15]:
cols = ['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices_num',
 'screenshots_num',
 'lang_num',
 'vpp_lic']

#rename columns containing period character
df_store = df_store.toDF(*cols)

In [16]:
df_store.schema.names

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices_num',
 'screenshots_num',
 'lang_num',
 'vpp_lic']

# Convert bytes to MB and GB in a new column

In [17]:
df_store = df_store.withColumn("MB", df_store['size_bytes']/1024).withColumn("GB", df_store['size_bytes']/1024/1024)

In [18]:
df_store.schema.names

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices_num',
 'screenshots_num',
 'lang_num',
 'vpp_lic',
 'MB',
 'GB']

In [19]:
print("\n3.\tConvert bytes to MB and GB in a new column\tDone!\n\n{}".format(df_store.toPandas().head(5)))


3.	Convert bytes to MB and GB in a new column	Done!

          id                                         track_name  size_bytes  \
0  281656475                                    PAC-MAN Premium   100788224   
1  281796108                          Evernote - stay organized   158578688   
2  281940292    WeatherBug - Local Weather, Radar, Maps, Alerts   100524032   
3  282614216  eBay: Best App to Buy, Sell, Save! Online Shop...   128512000   
4  282935706                                              Bible    92774400   

  currency  price  rating_count_tot  rating_count_ver  user_rating  \
0      USD   3.99             21292                26          4.0   
1      USD   0.00            161065                26          4.0   
2      USD   0.00            188583              2822          3.5   
3      USD   0.00            262241               649          4.0   
4      USD   0.00            985920              5320          4.5   

   user_rating_ver     ver cont_rating   prime_gen

In [20]:
df_store.toPandas().head(5).iloc[:,[0,1,2,-2,-1]]

Unnamed: 0,id,track_name,size_bytes,MB,GB
0,281656475,PAC-MAN Premium,100788224,98426.0,96.119141
1,281796108,Evernote - stay organized,158578688,154862.0,151.232422
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,98168.0,95.867188
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,125500.0,122.558594
4,282935706,Bible,92774400,90600.0,88.476562


# List top 10 trending apps

In [21]:
df_store.sort(df_store.rating_count_tot.desc()).limit(10).toPandas()

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices_num,screenshots_num,lang_num,vpp_lic,MB,GB
0,284882215,Facebook,389879808,USD,0.0,2974676,212,3.5,3.5,95.0,4+,Social Networking,37,1,29,1,380742.0,371.818359
1,389801252,Instagram,113954816,USD,0.0,2161558,1289,4.5,4.0,10.23,12+,Photo & Video,37,0,29,1,111284.0,108.675781
2,529479190,Clash of Clans,116476928,USD,0.0,2130805,579,4.5,4.5,9.24.12,9+,Games,38,5,18,1,113747.0,111.081055
3,420009108,Temple Run,65921024,USD,0.0,1724546,3842,4.5,4.0,1.6.2,9+,Games,40,5,1,1,64376.0,62.867188
4,284035177,Pandora - Music & Radio,130242560,USD,0.0,1126879,3594,4.0,4.5,8.4.1,12+,Music,37,4,1,1,127190.0,124.208984
5,429047995,Pinterest,74778624,USD,0.0,1061624,1814,4.5,4.0,6.26,12+,Social Networking,37,5,27,1,73026.0,71.314453
6,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1,90600.0,88.476562
7,553834731,Candy Crush Saga,222846976,USD,0.0,961794,2453,4.5,4.5,1.101.0,4+,Games,43,5,24,1,217624.0,212.523438
8,324684580,Spotify Music,132510720,USD,0.0,878563,8253,4.5,4.5,8.4.3,12+,Music,37,5,18,1,129405.0,126.37207
9,343200656,Angry Birds,175966208,USD,0.0,824451,107,4.5,3.0,7.4.0,4+,Games,38,0,10,1,171842.0,167.814453


# The difference in the average number of screenshots displayed of highest and lowest rating apps

In [22]:
import pyspark.sql.functions as F 

min_rating, max_rating = df_store.agg(F.min(df_store.user_rating),F.max(df_store.user_rating)).collect()[0]
min_rating, max_rating


(0.0, 5.0)

In [23]:
df1 = df_store

In [24]:
a = df1.filter(df1.user_rating == max_rating).agg(F.avg(df1.screenshots_num))
a.show()

+--------------------+
|avg(screenshots_num)|
+--------------------+
|  3.7134146341463414|
+--------------------+



In [25]:
b = df1.filter(df1.user_rating == min_rating).agg(F.avg(df1.screenshots_num))
b.show()

+--------------------+
|avg(screenshots_num)|
+--------------------+
|   2.582346609257266|
+--------------------+



In [26]:
diff = a.first()[0] - b.first()[0]
diff

1.1310680248890757

# What percentage of high rated apps support multiplelanguages

In [27]:
df1.filter(df1.lang_num > 1).filter(df1.user_rating == max_rating).count() * 100 / df1.filter(
    df1.user_rating == max_rating).count()

47.5609756097561

# How does app details contribute to user ratings?

In [28]:
# get percentiles of ratings
percentiles = df1.stat.approxQuantile("user_rating",[0.25,0.50,0.75],0.0)
percentiles

[3.5, 4.0, 4.5]

In [29]:
# get the datasets with different percentiles
df_25 = df1.filter(df1.user_rating<percentiles[0])
df_50 = df1.filter((df1.user_rating>=percentiles[0]) & (df1.user_rating<percentiles[1]))
df_75 = df1.filter((df1.user_rating>=percentiles[1]) & (df1.user_rating<percentiles[2]))
df_100 = df1.filter(df1.user_rating>=percentiles[2])

In [30]:
# compare statistics
q1 = df_25.agg(F.avg(df_25.lang_num))
q1.show()

+-----------------+
|    avg(lang_num)|
+-----------------+
|3.030338389731622|
+-----------------+



In [31]:
q2 = df_50.agg(F.avg(df_50.lang_num))
q2.show()

+-----------------+
|    avg(lang_num)|
+-----------------+
|5.777777777777778|
+-----------------+



In [32]:
q3 = df_75.agg(F.avg(df_75.lang_num))
q3.show()

+-----------------+
|    avg(lang_num)|
+-----------------+
|5.911439114391144|
+-----------------+



In [33]:
q4 = df_100.agg(F.avg(df_100.lang_num))
q4.show()

+----------------+
|   avg(lang_num)|
+----------------+
|6.41933438985737|
+----------------+



In [34]:
import pandas as pd
data = [q1.first()[0], q2.first()[0], q3.first()[0], q4.first()[0]]
pd.DataFrame(zip([25, 50, 75, 100],data), columns = ['percentile','avg_lang_num']).set_index('percentile')

Unnamed: 0_level_0,avg_lang_num
percentile,Unnamed: 1_level_1
25,3.030338
50,5.777778
75,5.911439
100,6.419334


# Compare the statistics of different app groups/genres

In [35]:
df1.groupBy("prime_genre").agg(F.avg(df1.lang_num),F.avg(df1.screenshots_num),F.avg(df1.rating_count_tot),F.avg(df1.MB)).toPandas()

Unnamed: 0,prime_genre,avg(lang_num),avg(screenshots_num),avg(rating_count_tot),avg(MB)
0,Education,7.205298,4.421634,2239.229581,176195.528652
1,Navigation,6.0,2.565217,11853.956522,100932.114534
2,Entertainment,4.528972,3.050467,7533.678505,99100.313497
3,Sports,4.140351,3.0,14026.929825,77024.973684
4,Food & Drink,3.571429,2.063492,13938.619048,75776.365079
5,Photo & Video,9.664756,2.95702,14352.280802,66915.922636
6,Travel,7.592593,2.592593,14129.444444,80768.469136
7,Finance,2.25,1.721154,11047.653846,76402.206336
8,Social Networking,9.125749,1.742515,45498.898204,77512.148011
9,Book,2.375,2.991071,5125.4375,174629.517857


# Does length of app description contribute to the ratings?

In [36]:
df2 = df_desc
df2.schema.names

['id', 'track_name', 'size_bytes', 'app_desc']

In [37]:
# Create new column for length of description
df2 = df2.withColumn("desc_len", F.length(df2.app_desc))


In [38]:
inner_join = df1.join(df2.select([df2.id, df2.app_desc, df2.desc_len]), 'id','outer')
inner_join.schema.names

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices_num',
 'screenshots_num',
 'lang_num',
 'vpp_lic',
 'MB',
 'GB',
 'app_desc',
 'desc_len']

In [39]:
df3 = inner_join
# Get percentiles of ratings
percentiles = df3.stat.approxQuantile("rating_count_tot",[0.25,0.50,0.75],0.0)

In [40]:
# Get the datasets with different percentiles
df_25 = df3.filter(df3.rating_count_tot < percentiles[0])
df_50 = df3.filter((df3.rating_count_tot >= percentiles[0]) & (df3.rating_count_tot < percentiles[1]))
df_75 = df3.filter((df3.rating_count_tot >= percentiles[1]) & (df3.rating_count_tot < percentiles[2]))
df_100 = df3.filter(df3.rating_count_tot >= percentiles[2])

In [41]:
# Compare the statistics
q1 = df_25.agg(F.avg(df_25.desc_len))
q1.show()
q2 = df_50.agg(F.avg(df_50.desc_len))
q2.show() 
q3 = df_75.agg(F.avg(df_75.desc_len))
q3.show()
q4 = df_100.agg(F.avg(df_100.desc_len))
q4.show()

+-----------------+
|    avg(desc_len)|
+-----------------+
|1057.452169076752|
+-----------------+

+------------------+
|     avg(desc_len)|
+------------------+
|1523.2256809338521|
+------------------+

+------------------+
|     avg(desc_len)|
+------------------+
|1727.3083333333334|
+------------------+

+------------------+
|     avg(desc_len)|
+------------------+
|1906.3022222222223|
+------------------+



In [42]:
percentiles

[28.0, 300.0, 2793.0]

In [43]:
import pandas as pd
data = [q1.first()[0], q2.first()[0], q3.first()[0], q4.first()[0]]
pd.DataFrame(zip([25, 50, 75, 100],data), columns = ['rating_percentile','avg_desc_len']).set_index('rating_percentile')

Unnamed: 0_level_0,avg_desc_len
rating_percentile,Unnamed: 1_level_1
25,1057.452169
50,1523.225681
75,1727.308333
100,1906.302222


In [44]:
df3 = inner_join
# Get percentiles of ratings
percentiles = df3.stat.approxQuantile("desc_len",[0.25,0.50,0.75],0.0)
# Get the datasets with different percentiles
df_25 = df3.filter(df3.desc_len < percentiles[0])
df_50 = df3.filter((df3.desc_len >= percentiles[0]) & (df3.desc_len < percentiles[1]))
df_75 = df3.filter((df3.desc_len >= percentiles[1]) & (df3.desc_len < percentiles[2]))
df_100 = df3.filter(df3.desc_len >= percentiles[2])
# Compare the statistics
q1 = df_25.agg(F.avg(df_25.rating_count_tot))
q2 = df_50.agg(F.avg(df_50.rating_count_tot))
q3 = df_75.agg(F.avg(df_75.rating_count_tot))
q4 = df_100.agg(F.avg(df_100.rating_count_tot))
# prnt results
data = [q1.first()[0], q2.first()[0], q3.first()[0], q4.first()[0]]
pd.DataFrame(zip([25, 50, 75, 100],data), columns = ['desc_len_percentile','avg_rating_count_tot']).set_index('desc_len_percentile')

Unnamed: 0_level_0,avg_rating_count_tot
desc_len_percentile,Unnamed: 1_level_1
25,7580.242762
50,9895.601888
75,16092.82657
100,17991.761799


In [45]:
percentiles

[737.0, 1366.0, 2190.0]