In [37]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import split,regexp_extract,col,when,desc
import logging

In [38]:
logger=logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter=logging.Formatter('%(asctime)s:%(created)f:%(filename)s:%(message)s:%(message)s')
file_handler=logging.FileHandler('EU5_case1.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

In [41]:
conf=SparkConf().setAppName('apple')
sc=SparkContext(conf=conf)
sql=SQLContext(sc)

# Read csv file into sparkdataframe

In [42]:
two=sql.read.options(header=True,inferschema=True).csv('hdfs://nameservice1/user/edureka_37986/AppleStore.csv')

# Convert mb into GB

In [43]:
three=two.withColumn('GB',(two.size_bytes/1000))

# rename columns to remove dot

In [44]:
three=three.withColumnRenamed('lang.num','language_number').withColumnRenamed('sup_devices.num','sup_devices_number').withColumnRenamed('ipadSc_urls.num','ipadSc_urls_num')

In [54]:
three=three.withColumn('lang_flag',when(col('language_number') >=1,1).otherwise(0))
three=three.withColumn('user_flag',when(col('user_rating')>=3,'High rated').otherwise('Low rated'))

# List top 10 apps

In [58]:
three.registerTempTable('t')

In [46]:
t2=sql.sql('select track_name,user_rating,sup_devices_number from t order by user_rating desc')

In [35]:
t2.show(10,truncate=False)

+----------------------------------------------------------------------------------+-----------+------------------+
|track_name                                                                        |user_rating|sup_devices_number|
+----------------------------------------------------------------------------------+-----------+------------------+
|King of Dragon Pass                                                               |5.0        |43                |
|Learn English quickly with MosaLingua                                             |5.0        |38                |
|TurboScan™ Pro - document & receipt scanner: scan multiple pages and photos to PDF|5.0        |38                |
|The Photographer's Ephemeris                                                      |5.0        |37                |
|▻Sudoku +                                                                         |5.0        |40                |
|:) Sudoku +                                                            

# The difference in the average number of screenshots displayed of highest and lowest rating apps - ## i have counted the instances as i didnt find the variable for screenshots

In [59]:
t3=sql.sql('select user_flag,count(*)as count from t group by user_flag')

In [60]:
t3.show(5)

+----------+-----+
| user_flag|count|
+----------+-----+
|High rated| 5866|
| Low rated| 1331|
+----------+-----+



# What percentage of high rated apps support multiple languages

In [74]:
t4=sql.sql('select track_name,(sum(lang_flag)/count(user_flag)*100) as percent from t group by track_name order by percent desc')

In [75]:
t4.show(10,truncate=False)

+--------------------------------------------------------------------+-------+
|track_name                                                          |percent|
+--------------------------------------------------------------------+-------+
|Talking Tom Cat for iPad                                            |100.0  |
|Catan HD                                                            |100.0  |
|Racing Penguin Free - Top Flying and Diving Game                    |100.0  |
|Poshmark: Buy & Sell Fashion                                        |100.0  |
|Mystery Case Files: 13th Skull HD - A Hidden Object Adventure (Full)|100.0  |
|Offroad Legends                                                     |100.0  |
|聚美优品-新用户专享160元现金券                                                   |100.0  |
|Stick Texting Emoji Emoticons Killer                                |100.0  |
|100 PICS Quiz - guess the picture trivia games                      |100.0  |
|Red－shop the world                                 