In [None]:
import os

import findspark

# findspark has to be initialised before pyspark is imported.
# Honour SPARK_HOME when the environment sets it; otherwise fall back to the
# original install location so existing setups keep working unchanged.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'))

import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise create one named 'basics'.
spark = SparkSession.builder.appName('basics').getOrCreate()

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the App Store CSV: the first row is the header, column types are
# inferred, and empty strings are read as nulls.
df = (
    spark.read
    .options(header=True, inferSchema=True, nullValue='')
    .csv('AppleStore.csv')
)

### 2.2 Describe the data

In [None]:
# Print a tabular preview of the loaded DataFrame to stdout
# (show() is called with its default arguments).
df.show()

In [None]:
# 1) Count the number of rows.
df.count()

In [None]:
# Count the number of columns.
len(df.columns)

In [None]:
# 2) Value types: list each column as a (name, type) pair.
df.dtypes

In [None]:
# Print the schema of the loaded DataFrame in a tree format.
df.printSchema()

In [None]:
# 3) Coding scheme: replace the dot in the column name with an underscore.
df = df.withColumnRenamed(existing='sup_devices.num', new='sup_devices_num')

In [None]:
# Same renaming for the iPad screenshot count column: dot -> underscore.
df = df.withColumnRenamed(existing='ipadSc_urls.num', new='ipadSc_urls_num')

In [None]:
# Same renaming for the language count column: dot -> underscore.
df = df.withColumnRenamed(existing='lang.num', new='lang_num')

In [None]:
# Import pandas, then convert the Spark DataFrame to a pandas DataFrame.
import pandas as pd
# NOTE(review): this rebinds `pd` from the pandas module to the converted
# DataFrame, so pandas functions are no longer reachable through `pd` after
# this line. Later cells index `pd` as the DataFrame, so renaming it means
# updating those cells as well.
pd = df.toPandas()
# Show the first rows of the converted data.
pd.head()

In [None]:
# Summary statistics of the pandas copy (`pd` is the DataFrame here, not the
# pandas module).
pd.describe()

### 2.3 Explore the data

In [None]:
# Average user_rating for each distinct size_bytes value, then a histogram of
# those averages.
grouped = pd.groupby('size_bytes')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct price, then a histogram of those
# averages.
grouped = pd.groupby('price')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct rating_count_tot value, then a
# histogram of those averages.
grouped = pd.groupby('rating_count_tot')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct rating_count_ver value, then a
# histogram of those averages.
grouped = pd.groupby('rating_count_ver')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct user_rating_ver value, then a
# histogram of those averages.
grouped = pd.groupby('user_rating_ver')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct ver value, then a histogram of those
# averages.
grouped = pd.groupby('ver')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct ipadSc_urls_num value, then a
# histogram of those averages.
grouped = pd.groupby('ipadSc_urls_num')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct lang_num value, then a histogram of
# those averages.
grouped = pd.groupby('lang_num')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct vpp_lic value, then a histogram of
# those averages.
grouped = pd.groupby('vpp_lic')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Average user_rating for each distinct currency, then a histogram of those
# averages.
grouped = pd.groupby('currency')[['user_rating']].mean()
grouped.plot.hist()

In [None]:
# Distribution plots for user_rating and lang_num on a 2x2 grid of axes;
# only the top row is drawn on.
f, ax = plt.subplots(nrows=2, ncols=2, figsize=(8, 4))
vis1 = sns.distplot(pd["user_rating"], bins=10, ax=ax[0, 0])
vis2 = sns.distplot(pd["lang_num"], bins=10, ax=ax[0, 1])
# Disabled bottom-row panels (note they reference `df`, not the pandas copy):
#vis3 = sns.distplot(df["price"],bins=10, ax=ax[1][0])
#vis4 = sns.distplot(df["rating_count_tot"],bins=10, ax=ax[1][1])

In [None]:
# Box plot of user_rating broken down by prime_genre.
vis5 = sns.boxplot(x="prime_genre", y="user_rating", data=pd)

In [None]:
# Default seaborn pair plot of the pandas copy.
# Reference: https://towardsdatascience.com/visualizing-data-with-pair-plots-in-python-f228cf529166
sns.pairplot(data=pd)

### 2.4 Verify the data quality

In [None]:
#Exploring Null Values
def count_nulls(df, skip_types=()):
    """Count the null values in each column of a Spark DataFrame.

    Args:
        df: DataFrame exposing ``dtypes`` (a list of ``(name, type)`` pairs),
            column lookup via ``df[name]`` and ``where(...).count()``.
        skip_types: Column type names to leave out of the result, e.g.
            ``('string',)``. Defaults to an empty tuple, so every column is
            checked.

    Returns:
        A list of ``(column name, null count)`` tuples in column order.
    """
    # String columns are checked as well: they can hold nulls (the CSV is
    # loaded with nullValue='', which turns empty strings into nulls), so
    # skipping them would hide missing data.
    return [
        (cname, df.where(df[cname].isNull()).count())
        for cname, ctype in df.dtypes
        if ctype not in skip_types
    ]

# Run the null check on the Spark DataFrame and keep the per-column counts.
null_counts = count_nulls(df)

In [None]:
# Display the (column name, null count) pairs computed by count_nulls.
null_counts

## Step 3. Data Preparation

### 3.1 Select the data

In [None]:
# Exclude the first column, _c0. drop() returns a new DataFrame rather than
# modifying df, so the result has to be assigned back; a bare df.drop(...)
# call on its own line has no effect.
df = df.drop('_c0')
df.show()

In [None]:
# Exclude the id column, then preview the result.
df = df.drop('id')
df.show()

### 3.2 Clean the data


In [None]:
# 1) Rows with missing data would be removed here; none are, because the
#    null check above is reported to have found no missing data.
# 2) Preview the data with duplicate rows removed (the result is only shown
#    here, not assigned back to df).
df.drop_duplicates().show()

### 3.3 Construct the data


In [None]:
# Drop _c0 and remove duplicate rows, this time keeping the result in df.
df = df.drop('_c0').drop_duplicates()

In [None]:
# Refresh the pandas copy from the current Spark DataFrame.
# NOTE(review): `pd` names the DataFrame here, not the pandas module.
pd = df.toPandas()

## Step 4. Data Transformation

### 4.1 Reduce the data (correlation,select)

##### 1) Correlation

In [None]:
# Pairwise correlation of the numeric columns. Non-numeric columns are
# filtered out explicitly because recent pandas versions raise on them
# instead of silently skipping them.
pd.select_dtypes(include=['number', 'bool']).corr()

##### 2) Select relevant features

In [None]:
#According to the correlation above, select the features which are related to the target variable.
#The variables "id" and "sup_devices_num" are dropped because their correlation is negative.
#df=df.select('size_bytes','price','rating_count_tot','rating_count_ver','user_rating','user_rating_ver','sup_device_num','ipadSc_urls_num','lang_num','vpp_lic')


### 4.2 Project the data

## Step 6. Data Mining Algorithm Selection

### 6.3 Build/Select appropriate model(s) and choose relevant parameter(s) 

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Imported here because the import cell above only brings in LogisticRegression.
from pyspark.ml.feature import StringIndexer, VectorAssembler

SPLIT_SEED = 42  # fixed so the train/test split is reproducible

# LogisticRegression reads a vector column named 'features' and a class-index
# column named 'label' by default. The CSV-derived df has neither, so fitting
# it directly fails; both columns are built first.
# NOTE(review): user_rating is taken as the target because the exploration
# cells analyse every other column against it -- confirm this is intended.
numeric_types = ('int', 'bigint', 'float', 'double')
feature_cols = [
    name for name, dtype in df.dtypes
    if dtype in numeric_types and name != 'user_rating'
]

# VectorAssembler cannot assemble null values, so drop incomplete rows first.
lr_data = df.dropna(subset=feature_cols + ['user_rating'])
lr_data = VectorAssembler(inputCols=feature_cols, outputCol='features').transform(lr_data)
# StringIndexer maps each distinct rating to a class index (0.0, 1.0, ...);
# the indices are ordered by frequency, not by rating value.
label_indexer = StringIndexer(inputCol='user_rating', outputCol='label')
lr_data = label_indexer.fit(lr_data).transform(lr_data)

lr_train, lr_test = lr_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
final_model = LogisticRegression()
fit_final = final_model.fit(lr_train)