# Welcome to Week 4

##### This document provides a running example of completing the Week 4 assignment.
##### A shorter version with fewer comments is available as a script: sparkMLlibClustering.py
##### To run these commands in Cloudera VM: first run the setup script: setupWeek4.sh 
##### You can then copy paste these commands in pySpark. To open pySpark, refer to Slides: https://www.coursera.org/learn/machinelearningwithbigdata/supplement/GTFQ0/slides-module-2-lesson-3

In [4]:
import pandas as pd
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array

#### Read ad-clicks.csv file (Note: change the path to ad-clicks.csv to the location on your machine)

In [5]:
# Load the ad-click log (edit the path to wherever ad-clicks.csv lives locally).
adclicksDF = pd.read_csv(
    '/Users/aloksingh/Downloads/capstone/second/courseraDataSimulation/ad-clicks.csv'
)
# Header names may carry surrounding whitespace; strip it from every column label.
adclicksDF = adclicksDF.rename(columns=str.strip)
# Every row is one click, so give each row a count of 1 that can be summed later.
adclicksDF = adclicksDF.assign(adCount=1)

In [6]:
adclicksDF.head(5)  # preview the first five ad-click rows

Unnamed: 0,timestamp,txID,userSessionid,teamid,userid,adID,adCategory,adCount
0,2016-05-23 15:52:24,6045,6020,81,1362,10,fashion,1
1,2016-05-23 15:56:20,6043,6029,71,38,19,movies,1
2,2016-05-23 15:58:11,6038,5721,23,766,19,movies,1
3,2016-05-23 16:00:45,6046,5877,60,999,18,movies,1
4,2016-05-23 16:04:28,6037,5891,111,1674,1,automotive,1


#### Read buy-clicks.csv file (Note: change the path to buy-clicks.csv to the location on your machine)

In [7]:
# Load the purchase log (edit the path to wherever buy-clicks.csv lives locally).
buyClicksDF = pd.read_csv(
    '/Users/aloksingh/Downloads/capstone/second/courseraDataSimulation/buy-clicks.csv'
)
# Header names may carry surrounding whitespace; strip it from every column label.
buyClicksDF = buyClicksDF.rename(columns=str.strip)

In [8]:
buyClicksDF.head(5)  # preview the first five purchase rows

Unnamed: 0,timestamp,txID,userSessionid,team,userid,buyid,price
0,2016-05-23 16:19:14,6086,5692,115,2472,3,4.99
1,2016-05-23 16:19:14,6087,5690,33,2391,1,1.99
2,2016-05-23 16:49:14,6120,5693,6,2156,4,9.99
3,2016-05-23 16:49:14,6121,5689,69,502,0,0.99
4,2016-05-23 16:49:14,6122,5691,97,2198,2,2.99


#### Select 'userid' and 'price' 

In [9]:
# Keep only the two columns needed to compute revenue per user.
userPurchases = buyClicksDF.loc[:, ['userid', 'price']]
userPurchases.head(5)

Unnamed: 0,userid,price
0,2472,4.99
1,2391,1.99
2,2156,9.99
3,502,0.99
4,2198,2.99


#### Select  'userid' and 'adCount' 

In [10]:
# Keep only the two columns needed to count ad clicks per user.
useradClicks = adclicksDF.loc[:, ['userid', 'adCount']]
useradClicks.head(5)

Unnamed: 0,userid,adCount
0,1362,1
1,38,1
2,766,1
3,999,1
4,1674,1


#### Perform aggregation to get total ad-clicks per user (as  a feature / attribute)

In [11]:
# Sum the per-row click counts for each user, then turn the userid index back
# into an ordinary column and give the summed column a descriptive name.
adsPerUser = (
    useradClicks.groupby('userid')
    .sum()
    .reset_index()
    .rename(columns={'adCount': 'totalAdClicks'})
)
adsPerUser.head(5)

Unnamed: 0,userid,totalAdClicks
0,3,33
1,4,44
2,9,43
3,17,32
4,20,40


#### Perform aggregation to get total revenue generated per user (as a feature / attribute)

In [12]:
# Sum the purchase prices for each user, then turn the userid index back into
# an ordinary column and give the summed column a descriptive name.
revenuePerUser = (
    userPurchases.groupby('userid')
    .sum()
    .reset_index()
    .rename(columns={'price': 'revenue'})
)
revenuePerUser.head(5)

Unnamed: 0,userid,revenue
0,3,11.97
1,4,27.95
2,9,10.94
3,17,5.97
4,20,69.92


#### Let's merge these two tables to get one single table we can use for training

In [13]:
# Join the two per-user tables on userid; the result has the columns
# userid, totalAdClicks, revenue. The join is an inner join (pandas' default),
# so only users that appear in both tables are kept.
combinedDF = pd.merge(adsPerUser, revenuePerUser, how='inner', on='userid')
combinedDF.head(5)  # show what the merged table looks like

Unnamed: 0,userid,totalAdClicks,revenue
0,3,33,11.97
1,4,44,27.95
2,9,43,10.94
3,17,32,5.97
4,20,40,69.92


In [15]:
# Drop userid and keep only the two numeric features used for clustering.
trainingDF = combinedDF.loc[:, ['totalAdClicks', 'revenue']]
trainingDF.head(5)

Unnamed: 0,totalAdClicks,revenue
0,33,11.97
1,44,27.95
2,43,10.94
3,32,5.97
4,40,69.92


#### Display the dimension of the training data set

In [16]:
# (number of rows, number of columns) of the training table
trainingDF.shape

(604, 2)

#### Convert the training data (userid already removed above, the other two attributes kept) to a Spark DataFrame

In [17]:
# Convert the pandas training table into a Spark DataFrame.
# NOTE(review): sqlContext is not defined anywhere in this file; presumably it
# is the object the pySpark shell pre-creates — confirm when running elsewhere.
pDF = sqlContext.createDataFrame(trainingDF)
# Fetch the first 5 rows back to the driver as a list of Row objects.
pDF.head(n=5)

[Row(totalAdClicks=33, revenue=11.97),
 Row(totalAdClicks=44, revenue=27.95),
 Row(totalAdClicks=43, revenue=10.940000000000001),
 Row(totalAdClicks=32, revenue=5.970000000000001),
 Row(totalAdClicks=40, revenue=69.92)]

In [20]:
def _to_feature_vector(row):
    """Return the row's first two fields (totalAdClicks, revenue) as a NumPy array."""
    return array([row[0], row[1]])


# Turn every Spark Row into the numeric vector handed to KMeans.train.
parsedData = pDF.rdd.map(_to_feature_vector)

#### Train KMeans model to create two clusters

In [22]:
# Fit k-means with k=2 on the (totalAdClicks, revenue) vectors, using at most
# 10 iterations and randomly chosen initial centers.
# NOTE(review): no seed is passed, so the centers (and their order) may differ
# from the printed output on another execution — consider fixing a seed.
# NOTE(review): `runs` is deprecated in newer Spark releases and, I believe, no
# longer part of the PySpark 3.x signature — confirm against the Spark version
# in use before running this outside the course VM.
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10, initializationMode="random")

#### Display the centers of two clusters

In [24]:
# clusterCenters is the model's public accessor for the list of center arrays.
print(clusters.clusterCenters)

[array([ 41.28834356,  62.24104294]), array([ 25.22902494,  15.58024943])]


#### Analyze the cluster centers: 
#### One cluster is centered at [ 41.3, 62.2 ]   ... (the 1st array: totalAdClicks, revenue)
#### The other cluster is centered at [ 25.2, 15.6 ]   ... (the 2nd array)
#### In the first cluster, players click on ads much more often and spend more money on in-app purchases