<h5> Only required for Jupyter </h5>

In [2]:
import findspark
findspark.init()

import pyspark
# Reuse the already-running SparkContext when this cell is executed again;
# calling SparkContext() a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point bound to the context above.
spark = pyspark.sql.SparkSession(sc)

In [3]:
# General imports
import matplotlib.pyplot as pyplt
import numpy as np
from ggplot import *
from datetime import datetime

import pyspark.sql.functions as sf


<h4> Helper functions </h4>

In [5]:
def getInt(x):
    """Convert ``x`` to an ``int``, falling back to 0 when it cannot be converted.

    Args:
        x: Value to convert, typically a raw text field.

    Returns:
        ``int(x)``, or 0 when the conversion raises ``ValueError`` (e.g. empty
        or non-numeric text), ``TypeError`` (e.g. ``None``) or ``OverflowError``
        (e.g. an infinite float).
    """
    # Only conversion failures are swallowed; a bare ``except`` would also
    # hide KeyboardInterrupt and SystemExit.
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return 0


def getDate(x):
    """Parse a ``YYYYMMDD`` string into a ``datetime.date``.

    Args:
        x: Text in ``%Y%m%d`` format.

    Returns:
        The parsed date, or the sentinel date 1901-01-01 when ``x`` is not a
        string (``TypeError``) or does not match the format (``ValueError``).
    """
    # Only parse failures are swallowed; a bare ``except`` would also hide
    # KeyboardInterrupt and SystemExit.
    try:
        return datetime.strptime(x, "%Y%m%d").date()
    except (TypeError, ValueError):
        # Sentinel marking "no valid date".
        return datetime.strptime("19010101", "%Y%m%d").date()

def nothing(x):
    """Identity extractor: return ``x`` unchanged (used for columns kept as-is)."""
    return x

def getFloat(x):
    """Convert ``x`` to a ``float``, falling back to 0.0 when it cannot be converted.

    Args:
        x: Value to convert, typically a raw text field.

    Returns:
        ``float(x)``, or 0.0 when the conversion raises ``ValueError``,
        ``TypeError`` or ``OverflowError``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Fall back to 0.0 (a float, not the int 0) so every value this
        # extractor yields has the same type.
        return 0.0


def renameCols(df,newNames):
    """Return ``df`` with its columns renamed, by position, to ``newNames``.

    Args:
        df: DataFrame to rename.
        newNames: Sequence of new column names, one per existing column, in
            column order.

    Returns:
        A DataFrame with the same data and the new column names.

    Raises:
        AssertionError: If ``newNames`` does not have one entry per column.
    """
    assert len(df.schema.names) == len(newNames), \
        "expected %d column names, got %d" % (len(df.schema.names), len(newNames))
    # Rename every column in a single positional step. Renaming one column at a
    # time by its current name goes wrong when a new name equals the name of a
    # column that has not been renamed yet: the later rename hits both columns.
    return df.toDF(*newNames)

# Applies a list of extractor functions to a row, one function per field.
# input: list of strings (the fields) plus a list of callables of equal length
# output: tuple (element types are determined by what the extractors return)
def getTupleFromSchema(l,colExtractors):
    assert(len(colExtractors) == len(l))
    converted = []
    for position, rawValue in enumerate(l):
        # The extractor at a given position handles the field at that position.
        converted.append(colExtractors[position](rawValue))
    return tuple(converted)


In [6]:
def getDF(fname,colExtractors):
    """Load a comma-separated text file into a DataFrame with converted columns.

    The first line of the file supplies the column names. Every later line has
    all spaces removed, is split on commas, and is converted field by field
    with ``colExtractors``.

    Args:
        fname: Path handed to ``sc.textFile``.
        colExtractors: One callable per column, applied in column order.

    Returns:
        A DataFrame whose columns are named after the header fields.
    """
    lines = sc.textFile(fname)

    # Column names come from the first line, split on commas as-is.
    header = ''.join(lines.take(1)).split(',')

    def toTypedRow(line):
        # Remove every space, split into fields, then convert each field.
        fields = line.replace(' ','').split(',')
        return getTupleFromSchema(fields, colExtractors)

    # Index 0 is the header line, so keep only lines with a positive index.
    indexedLines = lines.zipWithIndex()
    dataLines = indexedLines.filter(lambda pair: pair[1] > 0).map(lambda pair: pair[0])

    typedRows = dataLines.map(toTypedRow)

    # Name the DataFrame's columns after the header fields.
    return renameCols(typedRows.toDF(), header)

<h1> Create dataframes here </h1>

<h4> Setup data sources </h4>
<h6> Steps to follow </h6>
<ul> 
  <li> Upload files to S3 bucket
  <li> Specify the configuration below and mount it
</ul>

In [9]:
# Fill in your own values below.
#
# NOTE: Restrict access to this notebook so your keys stay protected,
# or delete this cell once the mount command below has succeeded.
AWS_BUCKET_NAME = "REPLACE_WITH_YOUR_S3_BUCKET"
MOUNT_NAME = "REPLACE_WITH_YOUR_MOUNT_NAME"
ACCESS_KEY = "REPLACE_WITH_YOUR_ACCESS_KEY"
SECRET_KEY = "REPLACE_WITH_YOUR_SECRET_KEY"
# "/" is percent-encoded before the secret is embedded in the s3a URI.
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")

# Mount s3a://<access>:<encoded secret>@<bucket> at /mnt/<mount name>.
dbutils.fs.mount(
    "s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME),
    "/mnt/%s" % MOUNT_NAME,
)

In [10]:
# Data files.
# NOTE(review): these paths hard-code the mount name "kkboxmount" instead of
# using MOUNT_NAME, so they only resolve if the bucket was mounted under that name.
members_data           = '/mnt/kkboxmount/members.csv'
sample_submission_zero_data = '/mnt/kkboxmount/sample_submission_zero.csv'
train_data             = '/mnt/kkboxmount/train.csv'
transactions_data      = '/mnt/kkboxmount/transactions.csv'
user_logs_part_data    = '/mnt/kkboxmount/user_logs_part.csv'
user_logs_data         = '/mnt/kkboxmount/user_logs.csv.gz'

<h5> Get a clean memDF ("DF of members") </h5>

In [12]:
# One extractor per CSV column, in column order: columns 1 and 4 are kept as
# raw strings, columns 2, 3 and 5 are parsed as ints, columns 6 and 7 as dates.
memDF = getDF(members_data,[nothing,getInt,getInt,nothing,getInt,getDate,getDate])
memDF.show(5)

<h5> Get a clean sampleSubZeroDF ("DF of Sample Submission") </h5>

In [14]:
# Two columns: the first is kept as a raw string, the second is parsed as an int.
sampleSubZeroDF = getDF(sample_submission_zero_data,[nothing,getInt])
sampleSubZeroDF.show(5)

<h5> Get a clean trainDF ("DF of train") </h5>

In [16]:
# Two columns: the first is kept as a raw string, the second is parsed as an int.
trainDF = getDF(train_data,[nothing,getInt])
trainDF.show(5)

<h5> Get a clean transactionsDF ("DF of transactions") </h5>

In [18]:
# Nine columns: column 1 is kept as a raw string, columns 2-6 and 9 are parsed
# as ints, columns 7 and 8 as dates.
transactionsDF = getDF(transactions_data,[nothing,getInt,getInt,getInt,getInt,getInt,getDate,getDate,getInt])
transactionsDF.show(3)

In [19]:
# Nine columns: column 1 is kept as a raw string, column 2 is parsed as a date,
# columns 3-8 as ints and column 9 as a float.
userLogPartDF = getDF(user_logs_part_data,[nothing,getDate,getInt,getInt,getInt,getInt,getInt,getInt,getFloat])
userLogPartDF.show(3)

In [20]:
# Same column layout as userLogPartDF, read from the gzip-compressed log file.
userLogAllDF = getDF(user_logs_data,[nothing,getDate,getInt,getInt,getInt,getInt,getInt,getInt,getFloat])
userLogAllDF.show(3)

<h3> Let's understand each of the data sets </h3>

<h4> Let's start with members </h4>

In [23]:
# Print summary statistics for the members DataFrame.
memDF.describe().show()

In [24]:
# Let's create a ~5% sample (drawn with replacement) and evaluate it.
# The seed is fixed so that re-running the notebook draws the same sample and
# the findings written up below stay reproducible.
memDFsamp = memDF.sample(True,0.05,42)

In [25]:
# Frequent values of the listed columns in the sample (support threshold 0.2).
memDFsamp.freqItems(['city','bd','gender','registered_via'],0.2).show()

In [26]:
# Show the distinct values of each listed column in the sample.
# show() prints the table itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line after each table).
for i in ['gender']:
    memDFsamp.select(memDFsamp[i]).distinct().show()

<h5> We find </h5>
<ul> 
  <li> 22 possible Cities. Most of them are one of 5 (5,22,4,13,1)
  <li> bd (age) has an implausibly wide spread, from -69988 to 2015
  <li> gender has blank fields
</ul>

<h4> Let's look at transactions </h4>

In [29]:
# Print summary statistics for the transactions DataFrame.
transactionsDF.describe().show()

In [30]:
# ~5% sample drawn with replacement; the seed is fixed so the sampled
# frequencies are reproducible when the notebook is re-run.
transactionsDFsamp = transactionsDF.sample(True,0.05,42)
# Frequent values of the pricing columns in the sample (support threshold 0.2).
transactionsDFsamp.freqItems(['payment_plan_days','plan_list_price','actual_amount_paid'],0.2).show()

<h5> We find </h5>
<ul> 
  <li> Transaction data seems relatively clean
  <li> All values seem within bounds
</ul>

<h4> Let's look at user logs </h4>

In [33]:
# `userLogDF` is never defined in this notebook (NameError on a fresh kernel);
# the user-log frames loaded above are userLogPartDF and userLogAllDF.
# NOTE(review): the partial log is used here - switch to userLogAllDF if the
# statistics were meant to cover the full log.
userLogPartDF.describe().show()

In [34]:
# `userLogDF` is never defined in this notebook (NameError on a fresh kernel);
# the user-log frames loaded above are userLogPartDF and userLogAllDF.
# NOTE(review): the partial log is used here - switch to userLogAllDF if the
# full log was intended.
userLogPartDF.show(10)

<h5> We find </h5>
<ul> 
  <li> User logs data seems relatively clean
  <li> Looks like a sparse matrix
  <li> The 100% played column has better mean and std deviation
  <li> There seem to be some major min & max values
</ul>

<h4> Let's plot some training data </h4>

In [37]:
# Number of training rows per is_churn value.
d = trainDF.groupby(['is_churn']).count()
d.show()

In [38]:
# Bar chart of is_churn; toPandas() brings the whole training frame to the driver.
ggplot(aes(x='is_churn'),data=trainDF.toPandas()) + geom_bar() + ggtitle("Training Data - churn")

In [39]:
# Number of members per gender value. Note that this rebinds `d`, which held
# the churn counts from the earlier cell.
d = memDF.groupby(['gender']).count()
d.show()

In [40]:
# matplotlib.pyplot is imported as `pyplt`; the name `pyplot` is undefined.
# Collect once so labels and counts come from the same rows in the same order,
# and plot plain values rather than Row objects.
genderCountRows = d.collect()
pyplt.plot([row['gender'] for row in genderCountRows],
           [row['count'] for row in genderCountRows])

In [41]:
# Collect the gender column of the grouped counts to the driver (a list of Row objects).
genderData = d.select('gender').collect()

In [42]:
# Collect the count column of the grouped counts to the driver (a list of Row objects).
countData = d.select('count').collect()

In [43]:
# Inspect the type of the elements returned by collect().
type(genderData[0])

In [44]:
# Create figure 1 at 9x3 inches. The bar call below is commented out, so as
# written this cell displays an empty figure.
pyplt.figure(1, figsize=(9, 3))
#pyplt.bar(genderData,countData)
pyplt.show()