<h5> Description </h5>
<ol>
 <li> Reads the parquet files created earlier and presents some basic statistics
</ol>

In [2]:
#uncomment for jupyter notebook
#import findspark
#findspark.init()

#import pyspark
#sc = pyspark.SparkContext()
#spark = pyspark.sql.SparkSession(sc)



# General imports
import matplotlib.pyplot as pyplt
import numpy as np
from ggplot import *
from datetime import datetime
import pyspark.sql.functions as sf
from pyspark.sql import SQLContext

# Names of the tables queried by the cells below.
members_table = 'unmgmt_members_table'
sample_submission_zero_table = 'unmgmt_sample_submission_zero_table'
train_table = 'unmgmt_train_table'
transactions_table = 'unmgmt_transactions_table'
user_logs_part_table = 'unmgmt_user_logs_part_table'
user_logs_all_table = 'unmgmt_user_logs_all_table'

# SQL entry point built on the SparkContext `sc`.
# NOTE(review): `sc` is only created by the commented-out lines at the top of
# this cell, so it is presumably injected by the hosting environment - confirm.
s = SQLContext(sc)

<h3> Read optimized dataframes </h3>

In [4]:
def _read_table(table_name):
    """Return every row and column of `table_name` as a Spark DataFrame."""
    return s.sql("select * from " + table_name)


# One DataFrame per source table.
memDF = _read_table(members_table)
sampleSubZeroDF = _read_table(sample_submission_zero_table)
trainDF = _read_table(train_table)
transactionsDF = _read_table(transactions_table)
userLogPartDF = _read_table(user_logs_part_table)
userLogAllDF = _read_table(user_logs_all_table)

<h3> Let's understand each of the data sets </h3>

<h4> Let's start with members </h4>

In [7]:
# Print summary statistics for the members table.
(
    memDF
    .describe()
    .show()
)

In [8]:
# Let's create a ~5% random sample of the members table and evaluate it.
# Sampling is with replacement, so a row may be drawn more than once. The fixed
# seed keeps the sample - and the findings derived from it below - reproducible
# from one run of the notebook to the next.
memDFsamp = memDF.sample(withReplacement=True, fraction=0.05, seed=42)

In [9]:
# Frequent values of the categorical-looking member columns, computed on the
# sample with a support threshold of 0.2.
memDFsamp.freqItems(
    ['city', 'bd', 'gender', 'registered_via'],
    support=0.2,
).show()

In [10]:
# List the distinct values of each column of interest in the members sample.
# `show()` prints the table itself and returns None, so it must not be wrapped
# in `print()` - doing so only appends a stray "None" line to the cell output.
for column in ['gender']:
    memDFsamp.select(memDFsamp[column]).distinct().show()

<h5> We find </h5>
<ul> 
  <li> 22 possible cities; most records fall into one of five of them (5, 22, 4, 13, 1)
  <li> bd (age) has an implausibly wide spread, from -69988 to 2015, so it contains invalid values
  <li> gender has blank fields
</ul>

<h4> Let's look at transactions </h4>

In [13]:
# Peek at a single transaction row.
transactionsDF.show(n=1)

In [14]:
# Print summary statistics for the transactions table.
(
    transactionsDF
    .describe()
    .show()
)

In [15]:
# Frequent-item analysis of the pricing columns on a ~5% random sample of the
# transactions (drawn with replacement). The fixed seed keeps the sample, and
# therefore the reported frequent values, reproducible across notebook runs.
transactionsDFsamp = transactionsDF.sample(withReplacement=True, fraction=0.05, seed=42)
transactionsDFsamp.freqItems(
    ['payment_plan_days', 'plan_list_price', 'actual_amount_paid'],
    0.2,
).show()

In [16]:
# Group the transactions by user (the `msno` column) for per-user aggregation.
transByUser = transactionsDF.groupBy('msno')

In [17]:
# Number of transactions per user, largest counts first.
(
    transByUser
    .count()
    .orderBy('count', ascending=False)
    .show()
)

<h5> We find </h5>
<ul> 
  <li> Transaction data seems relatively clean
  <li> All values seem within bounds
</ul>

<h4> Let's look at user logs </h4>

In [20]:
# Print summary statistics for the partial user-logs table.
(
    userLogPartDF
    .describe()
    .show()
)

In [21]:
# Peek at a single row of the full user-logs table.
userLogAllDF.show(n=1)

In [22]:
# Show the first ten rows of the partial user-logs table.
userLogPartDF.show(n=10)

<h5> Find number of users with more than one occurrence on a particular day </h5>

In [24]:
# Group the partial user logs by (user, day) so rows per pair can be counted.
userLogins = userLogPartDF.groupBy('msno', 'date')

In [25]:
# Count the log rows for every (msno, date) pair and list the largest counts
# first: any user with more than one entry on the same day then appears at the
# top of the output. An ascending sort would only surface the smallest counts
# and so could never reveal duplicates.
userLogins.count().sort(['count'], ascending=False).show()

<h5> We find </h5>
<ul> 
  <li> User logs data seems relatively clean
  <li> There are no duplicate entries - it seems each user has only one log entry per day
  <li> Looks like a sparse matrix
  <li> The 100% played column has better mean and standard deviation
  <li> There seem to be some extreme min and max values
</ul>

<h4> Let's plot some training data </h4>

In [28]:
# Row count per `is_churn` value in the training table.
d = trainDF.groupBy('is_churn').count()
d.show()

In [29]:
# Bar chart of the churn label distribution in the training data.
# Only `is_churn` is mapped to an aesthetic, so just that column is collected to
# the driver rather than converting the entire training table to pandas.
churn_pdf = trainDF.select('is_churn').toPandas()
display(ggplot(aes(x='is_churn'), data=churn_pdf) + geom_bar() + ggtitle("Training Data - churn"))

In [30]:
# Row count per `gender` value in the members table.
d = memDF.groupBy('gender').count()
d.show()