# Music Box Churn Prediction and Recommendation using Spark

# Using Spark to generate features

## 1. Load data into Spark dataframe

In [None]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Reuse the active SparkSession if one exists, otherwise create a new one.
# The two commented-out lines are the older route through an explicit SparkContext.
# sc = SparkContext('local')
# spark = SparkSession(sc)
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the event log; header=True takes the column names from the first row.
# No schema is supplied and inferSchema is not set, so every column is read as a string.
# cache() marks the DataFrame for reuse, since many actions below scan it.
df = spark.read.csv('data/events.csv', header=True).cache()
df # Show column names and types of dataframe

In [None]:
df.show() # default show 20

#### Convert type of date column from `string` to `date`

In [None]:
df = df.withColumn('date', F.col('date').cast('date'))
df # notice the type of date column is date now

In [None]:
df.show(5)

## 2. Exploratory data analysis

#### Count number of rows

```SQL
SELECT COUNT(*)
FROM event_ds_table;
```

In [None]:
df.count()

There are 11,264,316 rows in the table

#### Count distinct `uid`

```SQL
SELECT COUNT(DISTINCT(uid))
FROM event_ds_table;
```

In [None]:
df.select('uid').distinct().count()

There are 51,637 users.

#### Group by

```SQL
SELECT COUNT(*)
FROM event_ds_table
GROUP BY event;
```

In [None]:
df.groupBy('event').count().show()

```SQL
SELECT COUNT(uid) as count, MAX(uid) as max_uid
FROM event_ds_table
GROUP BY event;
```

In [None]:
# Per-event aggregates: number of rows with a non-null uid, and the largest uid value.
uid_count = F.count(F.col('uid')).alias('count')
uid_max = F.max(F.col('uid')).alias('max_uid')
df.groupBy('event').agg(uid_count, uid_max).show()

#### Filter
```SQL
SELECT date, event, count(*)
FROM event_ds_table
WHERE date > '2017-04-01' AND date < '2017-04-10'
GROUP BY date, event
ORDER BY date, event
```

In [None]:
# Daily event counts for dates strictly between 2017-04-01 and 2017-04-10.
# Both bounds are exclusive so that the result matches the WHERE clause of the
# SQL statement this cell translates (the previous `<=` also included 04-10).
in_range = (F.col('date') > '2017-04-01') & (F.col('date') < '2017-04-10')
df.filter(in_range).groupBy('date', 'event').count().orderBy('date', 'event').show()

#### Export to pandas

In [None]:
date_count = df.groupBy('date').count().toPandas()

In [None]:
date_count.head()

In [None]:
date_count.info()

In [None]:
date_count['date'].max()

In [None]:
date_count['date'].min()

Because the data is grouped with `.groupBy('date')`, there is one row per day. The data spans 44 days, from 2017/3/30 to 2017/5/12, so there are 44 entries.

In [None]:
# Bar chart of the number of events per day, in chronological order.
daily_counts = date_count.set_index('date').sort_values(by='date', ascending=True)
fig, ax = plt.subplots(figsize=(20,10))
daily_counts.plot.bar(ax=ax)
plt.show()

## 3. Define labels

The data runs from 2017/3/30 to 2017/5/12, 44 days in total. Use the first 30 days to build features and the last 14 days to define labels.
* Feature window: 2017/3/30 ~ 2017/4/28
* Label window: 2017/4/29 ~ 2017/5/12

In [None]:
import datetime
from dateutil import parser

# The label window is the last `label_window_size` days of the data,
# inclusive on both ends, hence the "- 1" when stepping back from the end date.
label_window_size = 14
label_window_end_date = parser.parse('2017-05-12').date()
label_window_span = datetime.timedelta(days=label_window_size - 1)
label_window_start_date = label_window_end_date - label_window_span
print('label window: ', label_window_start_date, '~', label_window_end_date, ' days:', label_window_size)

In [None]:
# The feature window is the `feature_window_size` days that end on the day
# before the label window starts (inclusive on both ends).
feature_window_size = 30
one_day = datetime.timedelta(days=1)
feature_window_end_date = label_window_start_date - one_day
feature_window_start_date = feature_window_end_date - (feature_window_size - 1) * one_day
print('Feature window: ', feature_window_start_date, '~', feature_window_end_date, ' days:', feature_window_size)

Select unique `uid` in feature window (2017/3/30 ~ 2017/4/28).

In [None]:
# Modelling population: every uid with at least one event inside the feature window.
in_feature_window = (F.col('date') >= feature_window_start_date) & (F.col('date') <= feature_window_end_date)
df_model_uid = df.filter(in_feature_window).select('uid').distinct()

In [None]:
df_model_uid.show(5)

In [None]:
df_model_uid.count()

There are 51,637 users in the full 2017/3/30 ~ 2017/5/12 period, but only 50,230 of them appear in the 2017/3/30 ~ 2017/4/28 feature window.

#### Set active user labels

A user is defined as active if they have at least one event in the label window (2017/4/29 ~ 2017/5/12); otherwise the user is labeled as churned.
* Active label = 0
* Churn label = 1

In [None]:
# Every uid with at least one event inside the label window is active (label 0).
in_label_window = (F.col('date') >= label_window_start_date) & (F.col('date') <= label_window_end_date)
df_active_uid_in_label_window = (
    df.filter(in_label_window)
      .select('uid')
      .distinct()
      .withColumn('label', F.lit(0))
)

In [None]:
# The left join keeps every modelling user; users with no activity in the
# label window come back with a null label.
df_label = df_model_uid.join(df_active_uid_in_label_window, on='uid', how='left')
# A null label means the user churned (1). The fill is restricted to the label
# column so that no other numeric column can be overwritten by accident.
df_label = df_label.fillna(1, subset=['label'])

In [None]:
df_label.groupBy('label').count().show()

In [None]:
df_label.show()

In [None]:
df_label.count()

In [None]:
df_label.distinct().count()

## 4. Feature generation

### Events in feature window

Feature window: 2017/3/30 ~ 2017/4/28

In [None]:
# Keep only the events dated inside the feature window (between() is inclusive on both ends).
feature_window_filter = F.col('date').between(feature_window_start_date, feature_window_end_date)
df_feature_window = df.filter(feature_window_filter)

### Frequency features

#### Method1

#### Define function to generate frequency features

In [None]:
def frequency_feature_generation(df, event, time_window, snapshot_date):
    '''
    Count the occurrences of one event type per user in one trailing time window.

    The window covers the `time_window` days that end on `snapshot_date`
    (both ends inclusive). The result has the columns `uid` and
    `freq_<event>_last_<time_window>`; users without a matching row are absent.
    '''
    window_start = snapshot_date - datetime.timedelta(time_window - 1)
    feature_name = 'freq_' + event + '_last_' + str(time_window)

    is_event = F.col('event') == event
    in_window = (F.col('date') >= window_start) & (F.col('date') <= snapshot_date)

    events_in_window = df.filter(is_event).filter(in_window)
    return events_in_window.groupBy('uid').agg(F.count(F.col('uid')).alias(feature_name))

#### Generate one feature

Select `uid` in 2017/4/19 ~ 2017/4/28 with `event='S'` and count.

In [None]:
# Example: count 'S' events per user over the last 10 days of the feature window.
event = 'S'
time_window = 10
snapshot_date = feature_window_end_date  # the window is counted backwards from this date

df_feature = frequency_feature_generation(df_feature_window, event, time_window, snapshot_date)

In [None]:
df_feature.show(5)

This table counts the number of searches by each user in the last 10 days of the feature window (2017/4/19 to 2017/4/28).

#### Generate frequency features

Select `uid` and count.

|Time window | Date       |
|-----------:|-----------:|
|      1 day |        4/28|
|     3 days | 4/26 ~ 4/28|
|     7 days | 4/22 ~ 4/28|
|    14 days | 4/15 ~ 4/28|
|    30 days | 3/30 ~ 4/28|

In [None]:
event_list = ['P', 'S', 'D']
time_window_list = [1, 3, 7, 14, 30]
df_feature_list = []

# One DataFrame per (event, time window) pair: 3 events x 5 windows = 15 DataFrames.
# snapshot_date is not set in this cell; it keeps the value assigned in an earlier cell.
for event in event_list:
    for time_window in time_window_list:
        df_feature_list.append(frequency_feature_generation(df_feature_window, event, time_window, snapshot_date))

In [None]:
df_feature_list

In [None]:
for feature_list in df_feature_list:
    feature_list.show(5)
    print('\n')

Using `frequency_feature_generation()`, I get 15 tables (3 events × 5 time windows). However, I would like fewer, wider tables instead of 15 separate ones. Method 2 produces 3 tables, one per event, each containing all the time windows.

#### Method 2

Use `when().otherwise()`

`*[...]` unpacks the list so that each element is passed to `agg()` as a separate argument.

In [None]:
def frequency_feature_generation_time_window(df, event, time_window_list, snapshot_date):
    '''
    Generate frequency features for one event type and a list of time windows.

    For each `time_window` in `time_window_list` the result has a column
    `freq_<event>_last_<time_window>` holding, per `uid`, the number of `event`
    rows dated within the `time_window` days that end on `snapshot_date`
    (both ends inclusive).

    Parameters:
        df: Spark DataFrame with at least the columns `uid`, `event` and `date`.
        event: event type to keep (compared with the `event` column).
        time_window_list: iterable of window lengths in days.
        snapshot_date: `datetime.date` on which every window ends.

    Returns:
        Spark DataFrame with one row per `uid` that has at least one `event`
        row in `df`; a window without matching rows gets 0.
    '''
    def count_in_window(time_window):
        window_start = snapshot_date - datetime.timedelta(time_window - 1)
        # The upper bound has to be inclusive: with `<` the snapshot day itself is
        # dropped, so the 1-day window is always 0 and every other window is a day short.
        in_window = (F.col('date') >= window_start) & (F.col('date') <= snapshot_date)
        return F.sum(F.when(in_window, 1).otherwise(0))\
                .alias('freq_' + event + '_last_' + str(time_window))

    df_feature = df.filter(F.col('event') == event)\
                   .groupBy('uid')\
                   .agg(*[count_in_window(time_window) for time_window in time_window_list])
    return df_feature

#### Generate one event type for all time window

In [None]:
event = 'S'
time_window_list = [1, 3, 7, 14, 30]
snapshot_date = feature_window_end_date

df_feature = frequency_feature_generation_time_window(df_feature_window, event, time_window_list, snapshot_date)
df_feature.show(5)

#### Generate frequency features for all event_list and time_window_list

In [None]:
event_list = ['P', 'S', 'D']
time_window_list = [1, 3, 7, 14, 30]
df_feature_list = []

# One wide DataFrame per event type, each holding every time-window column.
# This rebinds df_feature_list, so the list built with Method 1 is no longer referenced.
for event in event_list:
    df_feature_list.append(frequency_feature_generation_time_window(df_feature_window, 
                                                                    event, 
                                                                    time_window_list, 
                                                                    snapshot_date))

In [None]:
df_feature_list

In [None]:
for feature_list in df_feature_list:
    feature_list.show(5)
    print('\n')

### Recency features

Find the last date of each user for each event. And count the number of days from last event date to 2017/4/28.

In [None]:
# Recency is defined as the number of days since the user's last event.
# One feature can be generated for each type of event.

def days_from_last_event(df, event, snapshot_date):
    '''
    Compute, per user, the number of days between the last `event` and `snapshot_date`.

    The result has the columns `uid` and `days_from_last_<event>_evnet`;
    users without any `event` row in `df` are absent.
    '''
    # The 'evnet' spelling is part of the output column name and is reproduced as-is.
    recency_col = 'days_from_last_' + event + '_evnet'

    # Step 1: find the latest date on which each user triggered this event.
    last_seen = (
        df.filter(F.col('event') == event)
          .groupBy('uid')
          .agg(F.max('date').alias('last_date'))
    )

    # Step 2: compare that date with the snapshot date and keep the difference in days.
    # (Both steps could be folded into a single agg() call, but that is harder to read.)
    days_since = F.datediff(F.lit(snapshot_date), F.col('last_date'))
    return last_seen.withColumn(recency_col, days_since).select('uid', recency_col)

In [None]:
event_list = ['P', 'S', 'D']
snapshot_date = feature_window_end_date
df_days_list = []

for event in event_list:
    df_days_list.append(days_from_last_event(df_feature_window, event, snapshot_date))
    
for df_days in df_days_list:
    df_days.show()
    print('\n')

### Profile features

In [None]:
df_play = spark.read.csv('data/play.csv', header=True)

In [None]:
df_play.show()

In [None]:
df_play.select('uid').distinct().count()

There are 51635 users in the entire period.

In [None]:
# Restrict the play log to the feature window (both ends inclusive).
# NOTE(review): unlike df, df_play's date column is not cast to date in the visible
# cells, so this comparison relies on Spark's implicit casting - confirm the CSV dates
# are in yyyy-MM-dd format.
df_play_feature_window = df_play.filter((F.col('date') >= feature_window_start_date) &
                                        (F.col('date') <= feature_window_end_date))
# One row per distinct (uid, device) pair seen in the feature window.
df_profile_tmp = df_play_feature_window.select('uid', 'device').distinct()

In [None]:
df_play_feature_window.select('uid').distinct().count()

There are 49,856 out of 51,635 users in the feature window.

In [None]:
df_profile_tmp.show(5)

In [None]:
df_profile_tmp.groupBy('device').count().show()

In [None]:
# check if one user has two devices
df_profile_tmp.count()

In [None]:
# Count the distinct users. df_profile_tmp is already distinct on (uid, device), so
# distinct().count() on the whole row would only repeat the row count; selecting uid
# first is what reveals users that appear with more than one device.
df_profile_tmp.select('uid').distinct().count()

There are 49,856 distinct users but 49,866 rows, so 10 users have two devices.

Now divide users in two groups, iPhone user and non-iPhone user
* iPhone user: `device_type=1`
* Non-iPhone user: `device_type=2`

In [None]:
# Encode the device as a two-valued feature: 'ip' -> 1, everything else -> 2.
is_iphone = F.col('device') == 'ip'
df_profile_tmp = df_profile_tmp.withColumn('device_type', F.when(is_iphone, 1).otherwise(2))
df_profile_tmp.groupBy('device_type').count().show()

In [None]:
# Attach device_type to every labelled user; how='left' keeps users that have no play
# record in the feature window (their device_type is null).
# NOTE(review): df_profile_tmp has one row per (uid, device), so a user with two devices
# produces two rows here, and those duplicates carry through any later join on uid.
df_profile = df_label.select('uid').join(df_profile_tmp.select('uid', 'device_type'), on='uid', how='left')
df_profile.groupBy('device_type').count().show()

In [None]:
df_profile.show()

In [None]:
df_profile.count()

There are only 50,230 users in `df_label`. However, there are 10 users having two devices so there are 50,240 rows in `df_profile`.

### Total play time features

In [None]:
# Can you generate total song play time features (using play_ds data) for different time window
# using play data (need to clean play time first, play time may be negative in data)

# Convert play_time and song_length from string to integer
# (values that cannot be parsed as integers become null).
df_play = df_play.withColumn('play_time', F.col('play_time').cast('integer'))
df_play = df_play.withColumn('song_length', F.col('song_length').cast('integer'))
df_play

In [None]:
df_play.show()

Only select those `play_time` > 0 and `song_length` > 0

In [None]:
df_play_new = df_play.filter((F.col('play_time') > 0) & (F.col('song_length') > 0))

In [None]:
df_play_new.summary().show()

In [None]:
# df_play_new.agg({'play_time':'max'}).collect()[0]

In [None]:
# df_play_new.agg({'play_time':'min'}).collect()[0]

In [None]:
# df_play_new.agg({'song_length':'max'}).collect()[0]

In [None]:
# df_play_new.agg({'song_length':'min'}).collect()[0]

Need to remove the outlier in `play_time` and `song_length`. The outliers are defined as > mean + 3$\sigma$.

In [None]:
# df_play_new.select(F.mean('play_time'), F.mean('song_length'), F.stddev('play_time'), F.stddev('song_length')).show()

`play_time_mean = df_play_new.agg({'play_time': 'mean'}).collect()[0]` returns `Row(avg(play_time)=1448.893665371415)`.

However, I only want to get the value. So I need to add `asDict()` to convert row to dictionary. 
Then I can access the value using key.

`play_time_mean = df_play_new.agg({'play_time': 'mean'}).collect()[0].asDict()` returns `{'avg(play_time)': 1448.893665371415}`

In [None]:
# Compute all four statistics in a single aggregation, i.e. one Spark job instead of
# four separate passes over df_play_new.
# The aliases reproduce the column names generated by agg({'col': 'mean'}) and
# agg({'col': 'stddev'}), so the dictionary keys stay the same as before.
play_stats = df_play_new.agg(
    F.mean('play_time').alias('avg(play_time)'),
    F.stddev('play_time').alias('stddev(play_time)'),
    F.mean('song_length').alias('avg(song_length)'),
    F.stddev('song_length').alias('stddev(song_length)'),
).collect()[0].asDict()

# Keep the one-key-dict shape of each variable for the cells that read them.
play_time_mean = {'avg(play_time)': play_stats['avg(play_time)']}
play_time_std = {'stddev(play_time)': play_stats['stddev(play_time)']}
song_length_mean = {'avg(song_length)': play_stats['avg(song_length)']}
song_length_std = {'stddev(song_length)': play_stats['stddev(song_length)']}

In [None]:
# print(type(play_time_mean))
# print(play_time_mean, play_time_std, song_length_mean, song_length_std)

In [None]:
# Upper cut-offs: anything above mean + 3 standard deviations is treated as an outlier.
play_time_cap = play_time_mean['avg(play_time)'] + 3 * play_time_std['stddev(play_time)']
song_length_cap = song_length_mean['avg(song_length)'] + 3 * song_length_std['stddev(song_length)']

# Keep only the rows where both values are at or below their cut-off.
df_play_outliers_removed = df_play_new.filter(
    (F.col('play_time') <= play_time_cap) & (F.col('song_length') <= song_length_cap)
)

In [None]:
df_play_outliers_removed.count()

In [None]:
df_play_outliers_removed.summary().show()

In [None]:
df_play_outliers_removed.show()

#### Generate total play time features for all windows in time_window_list

In [None]:
def total_play_time_generation_time_window(df, time_window_list, snapshot_date):
    '''
    Generate total play time features for a list of time windows.

    For each `time_window` in `time_window_list` the result has a column
    `total_play_time_<time_window>` holding, per `uid`, the sum of `play_time`
    over the rows dated within the `time_window` days that end on
    `snapshot_date` (both ends inclusive).

    Parameters:
        df: Spark DataFrame with at least the columns `uid`, `date` and `play_time`.
        time_window_list: iterable of window lengths in days.
        snapshot_date: `datetime.date` on which every window ends.

    Returns:
        Spark DataFrame with one row per `uid` in `df`; a window without
        matching rows gets 0.
    '''
    def play_time_in_window(time_window):
        window_start = snapshot_date - datetime.timedelta(time_window - 1)
        # The upper bound has to be inclusive: with `<` the snapshot day itself is
        # dropped, so the 1-day window is always 0 and every other window is a day short.
        in_window = (F.col('date') >= window_start) & (F.col('date') <= snapshot_date)
        return F.sum(F.when(in_window, F.col('play_time')).otherwise(0))\
                .alias('total_play_time_' + str(time_window))

    df_feature = df.groupBy('uid')\
                   .agg(*[play_time_in_window(time_window) for time_window in time_window_list])

    return df_feature

In [None]:
time_window_list = [1, 3, 7, 14, 30]
snapshot_date = feature_window_end_date

df_total_play_time = total_play_time_generation_time_window(df_play_outliers_removed, time_window_list, snapshot_date)

In [None]:
df_total_play_time.show(5)

### Fancier frequency features

The units of `play_time` and `song_length` are seconds, so $\frac{play\_time}{song\_length} > 0.8$ means that more than 80% of the song was played. However, both columns are integers, so instead of dividing them I use the equivalent comparison `play_time` > 0.8 * `song_length`.

In [None]:
# Can you generate counts of songs play 80% of their song length (using play_ds data) for different time window
# using play data (need to clean play time and song length first, play time may be negative in data, song length may be zeros)

In [None]:
# Flag each play that covered more than 80% of the song length (1) or not (0).
# NOTE(review): in the visible cells df_play_eighty is only displayed; the windowed
# feature further down is computed from df_play_outliers_removed instead.
df_play_eighty = df_play_outliers_removed.withColumn('eighty', F.when(F.col('play_time') > 0.8 * F.col('song_length'), 1).otherwise(0))
df_play_eighty = df_play_eighty.select('uid', 'date', 'eighty')

In [None]:
df_play_eighty.show()

Generate counts of plays with `play_time` / `song_length` > 0.8 for all windows in time_window_list

In [None]:
def eighty_generation_time_window(df, time_window_list, snapshot_date):
    '''
    Count plays that covered more than 80% of the song, for a list of time windows.

    For each `time_window` in `time_window_list` the result has a column
    `eighty_<time_window>` holding, per `uid`, the number of rows with
    `play_time > 0.8 * song_length` dated within the `time_window` days that
    end on `snapshot_date` (both ends inclusive).

    Parameters:
        df: Spark DataFrame with at least the columns `uid`, `date`,
            `play_time` and `song_length`.
        time_window_list: iterable of window lengths in days.
        snapshot_date: `datetime.date` on which every window ends.

    Returns:
        Spark DataFrame with one row per `uid` in `df`; a window without
        matching rows gets 0.
    '''
    played_eighty = F.col('play_time') > 0.8 * F.col('song_length')

    def eighty_in_window(time_window):
        window_start = snapshot_date - datetime.timedelta(time_window - 1)
        # The upper bound has to be inclusive: with `<` the snapshot day itself is
        # dropped, so the 1-day window is always 0 and every other window is a day short.
        in_window = (F.col('date') >= window_start) & (F.col('date') <= snapshot_date)
        # A row counts only when it is inside the window AND passes the 80% test.
        return F.sum(F.when(in_window & played_eighty, 1).otherwise(0))\
                .alias('eighty_' + str(time_window))

    df_feature = df.groupBy('uid')\
                   .agg(*[eighty_in_window(time_window) for time_window in time_window_list])
    return df_feature

In [None]:
time_window_list = [1, 3, 7, 14, 30]
snapshot_date = feature_window_end_date

df_eighty = eighty_generation_time_window(df_play_outliers_removed, time_window_list, snapshot_date)

In [None]:
df_eighty.show()

## 5. Form training data

In [None]:
def join_feature_data(df_master, df_feature_list):
    '''
    Left-join every feature table onto the master table by `uid`.

    The tables are joined one after another in list order, so every row of
    `df_master` is kept. With an empty list `df_master` is returned unchanged.
    '''
    joined = df_master
    for feature_table in df_feature_list:
        # joined.persist() could be added here if the chain of joins gets too long
        joined = joined.join(feature_table, on='uid', how='left')
    return joined

#### Join all behavior features

In [None]:
df_model_final = join_feature_data(df_label, df_feature_list)

#### Join all profile features

In [None]:
df_model_final = join_feature_data(df_model_final, [df_profile])

#### Join recency, total play time, and fancier frequency features

In [None]:
df_model_final = join_feature_data(df_model_final, df_days_list)

In [None]:
df_model_final = join_feature_data(df_model_final, [df_total_play_time, df_eighty])

In [None]:
# df_model_final.schema.names

In [None]:
# df_model_final.printSchema()

In [None]:
# Replace the remaining nulls with 0, convert to a pandas DataFrame, and save it as CSV.
# NOTE(review): this also turns a missing recency value (user never had that event)
# into 0 days since the last event - confirm that this is intended.
model_final_pdf = df_model_final.fillna(0).toPandas()
model_final_pdf.to_csv('data/model_final.csv', index=False)