In [23]:
import os
import pandas as pd
import datetime

In [2]:
# Read the label table from the current working directory.
whole_file = '/label.csv'
cwd = os.getcwd()
label_path = cwd + whole_file
label = pd.read_csv(
    label_path,
    header=0,
    encoding='utf-8',
)

In [24]:
# Balance check: print how many uids carry each target value (0.0 first, then 1.0).
for target_value in (0.0, 1.0):
    print(label.loc[label['target'] == target_value, 'uid'].count())

199799
199799


In [26]:
# Summary statistics (count, mean, std, quartiles) for the label table.
label.describe()

Unnamed: 0,uid,target
count,399598.0,399598.0
mean,168172800.0,0.5
std,2841814.0,0.500001
min,100071800.0,0.0
25%,168021500.0,0.0
50%,168458000.0,0.5
75%,168764400.0,1.0
max,169262300.0,1.0


In [3]:
# Read the cleaned play log from the current working directory.
whole_file = '/all_clean_data.csv'
all_data_path = cwd + whole_file
all_data = pd.read_csv(
    all_data_path,
    header=0,
    encoding='utf-8',
)

In [4]:
# Summary statistics for the play log; useful for spotting outliers such as the very large song_length / song_id maxima.
all_data.describe()

Unnamed: 0,uid,play_time,song_length,song_id
count,68126320.0,68126320.0,68126320.0,68126320.0
mean,168024000.0,177.3799,265.922,203315400000000.0
std,3407224.0,187.675,7415.941,4.643687e+16
min,100071800.0,0.000146052,1.0,257.0
25%,167962000.0,53.0,207.0,928493.0
50%,168340400.0,197.0,244.0,4841591.0
75%,168700600.0,253.0,280.0,7145831.0
max,169262300.0,21280.0,61166590.0,1.844674e+19


In [5]:
# Convert the 'date' column to pandas datetimes so the date-window filters and the
# recency subtraction further down operate on real timestamps.
all_data['date'] = pd.to_datetime(all_data['date'])

In [6]:
# Per-row ratio of play_time to song_length. Despite the column name this is a plain
# ratio; it is not multiplied by 100.
all_data['play time percentage of song length'] = all_data['play_time'] / all_data['song_length']

In [7]:
#(snapshot_date - all_data['date']).astype('timedelta64[h]')/24.0
#all_data

## Generate Features 

### 1. Frequency 

In [8]:
# Accumulates one per-uid feature DataFrame for every feature computed below.
feature_list = []
# Reference date for all features; rows dated on or after it are excluded by the feature functions.
snapshot_date = pd.to_datetime('2017-04-29')
# Look-back window lengths, in days.
window_list = [1,3,7,14,30]

def compute_freq(df, snapshot_date, window):
    """Count play-log rows per uid in the `window` days before `snapshot_date`.

    A row is kept when its 'date' lies in
    [snapshot_date - window days, snapshot_date), with both bounds reduced to
    their calendar date ('%Y-%m-%d').

    Returns a DataFrame with the columns 'uid' and
    'last <window> days frequency on play log'. Uids without any row in the
    window do not appear. The first five result rows are printed.
    """
    day_format = '%Y-%m-%d'
    window_end = snapshot_date.strftime(day_format)
    window_start = (snapshot_date - datetime.timedelta(days=window)).strftime(day_format)

    in_window = (df['date'] >= window_start) & (df['date'] < window_end)
    counts = df.loc[in_window, :].groupby('uid', as_index = False)['date'].count()

    column_name = 'last ' + str(window) +' days frequency on play log'
    counts = counts.rename(columns = {'date': column_name})
    print(counts.head(5))
    return counts
# One frequency feature table per look-back window, collected in feature_list.
for window in window_list:
    feature = compute_freq(df=all_data, snapshot_date=snapshot_date, window=window)
    feature_list += [feature]

           uid  last 1 days frequency on play log
0  100549339.0                                  2
1  100722761.0                                  7
2  101206434.0                                  8
3  101231687.0                                 56
4  101481979.0                                120
           uid  last 3 days frequency on play log
0  100245413.0                                  6
1  100415077.0                                 36
2  100474444.0                                 33
3  100549339.0                                 75
4  100596698.0                                 37
           uid  last 7 days frequency on play log
0  100202712.0                                  9
1  100245413.0                                  6
2  100415077.0                                 36
3  100474444.0                                 72
4  100549339.0                                145
           uid  last 14 days frequency on play log
0  100087237.0                                   

### 2. Recency

In [9]:
def compute_recency(df, snapshot_date):
    """Compute, per uid, the time since the latest play before `snapshot_date`.

    Only rows whose 'date' is strictly earlier than the calendar date of
    `snapshot_date` are considered. For every uid the most recent such date is
    taken, and the gap to `snapshot_date` is truncated to whole hours and
    expressed in days (hours / 24).

    Returns a DataFrame with the columns 'uid' and 'recency on play log'.
    Uids without any row before the snapshot do not appear. The first five
    result rows are printed.
    """
    before_snapshot = df['date'] < snapshot_date.strftime('%Y-%m-%d')
    history = df.loc[before_snapshot, ['uid', 'date']]
    last_play = history.groupby('uid', as_index = False).max()

    elapsed = snapshot_date - last_play['date']
    # Floor to whole hours, then convert to days. Floor division by a Timedelta is used
    # instead of .astype('timedelta64[h]'), which pandas 2.x rejects (it only supports
    # the s/ms/us/ns resolutions); the numeric result is the same.
    last_play['recency on play log'] = (elapsed // pd.Timedelta(hours=1)) / 24.0

    feature = last_play[['uid', 'recency on play log']]
    print(feature.head(5))
    return feature

In [10]:
# Recency feature for every uid in the play log, stored with the other feature tables.
# Note: running this cell again appends the same table a second time.
feature = compute_recency(df=all_data, snapshot_date=snapshot_date)
feature_list += [feature]

           uid  recency on play log
0  100071797.0                 30.0
1  100087237.0                 14.0
2  100139083.0                 19.0
3  100157378.0                 11.0
4  100202448.0                 30.0


### 3. Play time percentage of song length 

In [11]:
def compute_play_time_percentage(df, snapshot_date, window):
    """Average 'play time percentage of song length' per uid over a look-back window.

    A row is kept when its 'date' lies in
    [snapshot_date - window days, snapshot_date), with both bounds reduced to
    their calendar date ('%Y-%m-%d').

    Returns a DataFrame with the columns 'uid' and
    'last <window> days play time percentage of song length' (the per-uid mean).
    Uids without any row in the window do not appear. The first five result
    rows are printed.
    """
    source_column = 'play time percentage of song length'
    day_format = '%Y-%m-%d'
    window_end = snapshot_date.strftime(day_format)
    window_start = (snapshot_date - datetime.timedelta(days=window)).strftime(day_format)

    in_window = (df['date'] >= window_start) & (df['date'] < window_end)
    averages = df.loc[in_window, :].groupby('uid', as_index = False)[source_column].mean()

    target_column = 'last ' + str(window) +' days play time percentage of song length'
    averages = averages.rename(columns = {source_column: target_column})
    print(averages.head(5))
    return averages

In [12]:
# One mean play-ratio feature table per look-back window, collected in feature_list.
# Note: running this cell again appends the same tables a second time.
for window in window_list:
    feature = compute_play_time_percentage(df=all_data, snapshot_date=snapshot_date, window=window)
    feature_list += [feature]

           uid  last 1 days play time percentage of song length
0  100549339.0                                         0.536364
1  100722761.0                                         0.974304
2  101206434.0                                         0.774230
3  101231687.0                                         0.797692
4  101481979.0                                         0.986748
           uid  last 3 days play time percentage of song length
0  100245413.0                                         0.460945
1  100415077.0                                         0.998473
2  100474444.0                                         0.648643
3  100549339.0                                         0.602365
4  100596698.0                                         0.669612
           uid  last 7 days play time percentage of song length
0  100202712.0                                         0.998084
1  100245413.0                                         0.460945
2  100415077.0                          

## Combine all the features

In [27]:
# Start the feature table from the labels. This only binds a second name to the same
# DataFrame object; no copy is made here.
churn = label

In [28]:
# Print how many uids carry each target value (0.0 first, then 1.0) before merging features.
for target_value in (0.0, 1.0):
    print(churn.loc[churn['target'] == target_value, 'uid'].count())

199799
199799


In [29]:
# Left-join each feature table onto the labels by uid: every labelled uid keeps its row
# and gets NaN wherever a feature table has no entry for it.
for feature in feature_list:
    churn = churn.merge(feature, how = 'left', on = 'uid')

In [30]:
# Preview the merged table (NaN marks uids missing from a feature table); show only
# the first rows instead of rendering the whole frame.
churn.head(10)

Unnamed: 0,uid,target,last 1 days frequency on play log,last 3 days frequency on play log,last 7 days frequency on play log,last 14 days frequency on play log,last 30 days frequency on play log,recency on play log,last 1 days play time percentage of song length,last 3 days play time percentage of song length,last 7 days play time percentage of song length,last 14 days play time percentage of song length,last 30 days play time percentage of song length
0,167772160.0,0.0,4.0,27.0,28.0,39.0,137,1.0,0.473935,0.760951,0.767341,0.807052,0.816434
1,167772162.0,0.0,,3.0,3.0,7.0,48,2.0,,0.819936,0.819936,0.822981,0.659536
2,167772163.0,0.0,,,,11.0,11,8.0,,,,0.452746,0.452746
3,168296454.0,0.0,,,1.0,1.0,25,7.0,,,0.570796,0.570796,0.283139
4,167772170.0,0.0,,,,18.0,58,9.0,,,,0.868650,0.854653
5,168296459.0,0.0,,,,3.0,3,13.0,,,,0.482646,0.482646
6,167772174.0,0.0,19.0,108.0,146.0,175.0,189,1.0,0.689460,0.640444,0.583213,0.554027,0.546446
7,167772175.0,0.0,,,,7.0,8,8.0,,,,0.243859,0.338377
8,168296464.0,0.0,4.0,55.0,135.0,301.0,512,1.0,0.505666,0.464087,0.773845,0.803577,0.854933
9,167772176.0,0.0,5.0,44.0,51.0,119.0,406,1.0,0.946374,0.802178,0.828527,0.881447,0.927680


In [31]:
# Replace every NaN in the merged table with 0.0.
# NOTE(review): this also covers 'recency on play log' and the mean play-ratio columns,
# where 0 reads as "played at the snapshot" / "played none of the song" rather than
# "no data in this window" — confirm that this is the intended encoding.
churn = churn.fillna(0.0)

In [32]:
# Preview the table after NaN replacement; show only the first rows instead of
# rendering the whole frame.
churn.head(10)

Unnamed: 0,uid,target,last 1 days frequency on play log,last 3 days frequency on play log,last 7 days frequency on play log,last 14 days frequency on play log,last 30 days frequency on play log,recency on play log,last 1 days play time percentage of song length,last 3 days play time percentage of song length,last 7 days play time percentage of song length,last 14 days play time percentage of song length,last 30 days play time percentage of song length
0,167772160.0,0.0,4.0,27.0,28.0,39.0,137,1.0,0.473935,0.760951,0.767341,0.807052,0.816434
1,167772162.0,0.0,0.0,3.0,3.0,7.0,48,2.0,0.000000,0.819936,0.819936,0.822981,0.659536
2,167772163.0,0.0,0.0,0.0,0.0,11.0,11,8.0,0.000000,0.000000,0.000000,0.452746,0.452746
3,168296454.0,0.0,0.0,0.0,1.0,1.0,25,7.0,0.000000,0.000000,0.570796,0.570796,0.283139
4,167772170.0,0.0,0.0,0.0,0.0,18.0,58,9.0,0.000000,0.000000,0.000000,0.868650,0.854653
5,168296459.0,0.0,0.0,0.0,0.0,3.0,3,13.0,0.000000,0.000000,0.000000,0.482646,0.482646
6,167772174.0,0.0,19.0,108.0,146.0,175.0,189,1.0,0.689460,0.640444,0.583213,0.554027,0.546446
7,167772175.0,0.0,0.0,0.0,0.0,7.0,8,8.0,0.000000,0.000000,0.000000,0.243859,0.338377
8,168296464.0,0.0,4.0,55.0,135.0,301.0,512,1.0,0.505666,0.464087,0.773845,0.803577,0.854933
9,167772176.0,0.0,5.0,44.0,51.0,119.0,406,1.0,0.946374,0.802178,0.828527,0.881447,0.927680


In [33]:
# Summary statistics of the final feature table (labels plus all merged features).
churn.describe()

Unnamed: 0,uid,target,last 1 days frequency on play log,last 3 days frequency on play log,last 7 days frequency on play log,last 14 days frequency on play log,last 30 days frequency on play log,recency on play log,last 1 days play time percentage of song length,last 3 days play time percentage of song length,last 7 days play time percentage of song length,last 14 days play time percentage of song length,last 30 days play time percentage of song length
count,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0,399598.0
mean,168172800.0,0.5,3.594845,11.142431,24.344814,51.633737,130.165359,14.219681,0.130197,0.211607,0.286117,0.362373,0.625767
std,2841814.0,0.500001,16.121792,39.603312,77.491364,145.367558,293.423647,11.510612,0.293819,0.344733,0.366814,0.369368,0.240023
min,100071800.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.00016
25%,168021500.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,0.0,0.0,0.0,0.0,0.463027
50%,168458000.0,0.5,0.0,0.0,0.0,2.0,36.0,12.0,0.0,0.0,0.0,0.315091,0.650924
75%,168764400.0,1.0,0.0,4.0,16.0,43.0,135.0,27.0,0.0,0.463005,0.651066,0.721854,0.819095
max,169262300.0,1.0,3804.0,8204.0,17086.0,26141.0,50096.0,30.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Parent directory of the current working directory; the feature file is saved beneath it.
parent_dir = os.path.dirname(os.getcwd())

In [22]:
# save feature file
whole_file = '/Modeling/churn.csv'
churn.to_csv(parent_dir + whole_file, encoding='utf-8', index=False)