## Data Preparation: Churn labeling and Downsampling

#### 1. Read the *play.log files line by line, and write only the user ID, device and date of log into a separate file.

#### 2. Label churn users: those who played more than three times before the cutoff day but had no acitivity after the cutoff.

#### 3. Down sampling is necessary. There are more than 50,000 users and 15 GB log data, which is not necessary for the churn prediction and which costs too much time to process. I used  a down sampling ratio of 1/10, to only include 1/10 users from the active and churn users for the churn prediction model.

### 1. Churn labeling


In [1]:
import glob


In [2]:
filepath = '/Volumes/SHARE/Bittiger/Capstone/data/play/*play.log'
files = glob.glob(filepath)
# amount of files
len(files)

138

In [7]:
# take a look at one of the files
files[0]

'/Volumes/SHARE/Bittiger/Capstone/data/play/20170302_1_play.log'

In [8]:
# get an idea how many lines are in one .log file
with open(files[0],'r') as f:
    lines = f.readlines()
    log_lines = len(lines)
log_lines

2452954

In [13]:
# Check one line
lines[3]

'154431666 \tar \t6651913 \t0 \t\xe5\x86\x8d\xe8\xa7\x81\xe5\x8f\xaa\xe6\x98\xaf\xe9\x99\x8c\xe7\x94\x9f\xe4\xba\xba \t\xe5\xba\x84\xe5\xbf\x83\xe5\xa6\x8d \t252 \t251 \t0\n'

In [15]:
test_list = lines[3].strip('\n').split('\t')
test_list.append(files[0].split('/')[-1])
test_list

['154431666 ',
 'ar ',
 '6651913 ',
 '0 ',
 '\xe5\x86\x8d\xe8\xa7\x81\xe5\x8f\xaa\xe6\x98\xaf\xe9\x99\x8c\xe7\x94\x9f\xe4\xba\xba ',
 '\xe5\xba\x84\xe5\xbf\x83\xe5\xa6\x8d ',
 '252 ',
 '251 ',
 '0',
 '20170302_1_play.log']

In [16]:
import pandas as pd

schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name']
df = pd.read_csv(files[0], delimiter='\t',header=None,index_col=None,names=schema)
df.head()
# Note the file_name will be added later

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,uid,device,song_id,song_type,song_name,singer,play_time,song_length,paid_flag,file_name
0,264715.0,,,,,,,,,
1,292363.0,,,,,,,,,
2,154825000.0,ar,3300104.0,0.0,Not A Love Song,Bean,22.0,178.0,0.0,
3,154432000.0,ar,6651913.0,0.0,再见只是陌生人,庄心妍,252.0,251.0,0.0,
4,154439000.0,ar,20870987.0,0.0,哥哥,张杰,358.0,358.0,0.0,


### Save reduced play logs to two log files.
Only the first two items of each line, user id and device, and the date of the log are saved, so it's called reduced play logs.

In [18]:
# 04/22 is the cutoff date for labeling churns
cutoff = '20170422'

In [2]:
# destination file names to save the reduced logs.
first_period_log = '/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/play_till_cutoff.log'
second_period_log = '/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/play_after_cutoff.log'

In [25]:
import time

for each_file in files:
    current_time = time.clock()

    with open(each_file, 'r') as f:
        lines = f.readlines()
        filename = f.name.split('/')[-1]
        print('processing file: %s' % filename)
        #choose the output path
        if filename < cutoff:
            output_path = first_period_log
        else:
            output_path = second_period_log
        # write to the output file
        with open(output_path, 'a') as output:
            for line in lines:
                fields_to_keep = line.strip('\n').split('\t')[:2]
                fields_to_keep.append(filename)
                output.write('\t'.join(fields_to_keep)+'\n')
    print('...costs %.2f seconds' % (time.clock()-current_time))

processing file: 20170302_1_play.log
...costs 9.31 seconds
processing file: 20170303_1_play.log
...costs 5.91 seconds
processing file: 20170304_1_play.log
...costs 5.31 seconds
processing file: 20170305_1_play.log
...costs 4.67 seconds
processing file: 20170306_1_play.log
...costs 3.98 seconds
processing file: 20170307_1_play.log
...costs 3.81 seconds
processing file: 20170308_1_play.log
...costs 3.80 seconds
processing file: 20170309_1_play.log
...costs 3.52 seconds
processing file: 20170330_3_play.log
...costs 14.03 seconds
processing file: 20170331_1_play.log
...costs 5.85 seconds
processing file: 20170331_2_play.log
...costs 5.98 seconds
processing file: 20170331_3_play.log
...costs 8.49 seconds
processing file: 20170339_1_play.log
...costs 6.55 seconds
processing file: 20170401_1_play.log
...costs 4.85 seconds
processing file: 20170401_2_play.log
...costs 5.56 seconds
processing file: 20170401_3_play.log
...costs 6.17 seconds
processing file: 20170402_1_play.log
...costs 5.00 seco

In [3]:
import pandas as pd

schema = ['uid','device','file_name']
df_1 = pd.read_csv(first_period_log,delimiter='\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})
df_1.head()

Unnamed: 0,uid,device,file_name
0,264715,20170302_1_play.log,
1,292363,20170302_1_play.log,
2,154824972,ar,20170302_1_play.log
3,154431666,ar,20170302_1_play.log
4,154439430,ar,20170302_1_play.log


In [80]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107151848 entries, 0 to 107151847
Data columns (total 3 columns):
uid          object
device       object
file_name    object
dtypes: object(3)
memory usage: 2.4+ GB


In [35]:
# change file_name to date
def get_date(file_name):
    tmp_list = str(file_name).split('_')
    return tmp_list[0]
df_1['date'] = df_1['file_name'].map(get_date)

In [39]:
df_1 = df_1.drop(['file_name'], axis = 1)

### Explore the data

In [81]:
len(df_1['uid'].unique())

847330

In [7]:
df_1['device'].unique()

array(['20170302_1_play.log', 'ar ', 'ip ', '20170303_1_play.log', 'ar',
       'ip', 'mc', 'wp', '20170339_1_play.log', '168589573', nan,
       '20170301_play.log'], dtype=object)

In [8]:
len(df_1['file_name'].unique())

76

In [4]:
df_1.uid.value_counts()

1685126       3926060
37025504      2845076
751824        2265356
1791497       1995972
497685        1506558
1062806       1258860
736305         943003
1685126        628327
0              605211
37025504       460375
1749320        402496
1679121        261558
46532274       252227
28638487       211480
637650         116820
26036032       114823
155948236      106490
533817          82783
32166203        77942
1749320         72429
64268006        70534
398309          56301
28638487        53793
1883192         42583
0               40719
736305          39938
154539052       37003
168127634       35009
22730453        34422
16517426        34282
               ...   
168735668           1
154524886           1
168825011           1
167991919           1
168590429           1
154673690           1
154702268           1
168661192           1
169012773           1
168652788           1
168925553           1
168640371           1
158236195           1
168819795           1
168708431 

Looks like uid = 0 should be testing id. And those uid with log numbers larger than that of uid = 0 may be robot. Check the device type of these ids. These user id will be deleted later.


### Criteria of active user: number of activities before cutoff date >= 3
### Criteria of churn user: active users that have no activity after cutoff date
### Criteria of loyal user: a user has>= 3 activities before cutoff date and has recent activity after cutoff date

In [5]:
# total number of active users and inactive users before the cutoff date
active = df_1.uid.value_counts()>=3
sum(active),sum(active==0)

(735197, 112132)

In [6]:
active_users = [active.index[i] for i in xrange(len(active)) if active[i]]

In [7]:
len(active_users)

735197

In [8]:
active_set = set(active_users)

In [9]:
len(active_set)

735197

In [10]:
# Now process the recent play.log file to get recent users.
df_2 = pd.read_csv(second_period_log,delimiter='\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})
df_2.head()

Unnamed: 0,uid,device,file_name
0,751824,ar,20170422_1_play.log
1,167806444,ar,20170422_1_play.log
2,28638487,ar,20170422_1_play.log
3,167854540,ar,20170422_1_play.log
4,1685126,ar,20170422_1_play.log


In [11]:
df_2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57469042 entries, 0 to 57469041
Data columns (total 3 columns):
uid          object
device       object
file_name    object
dtypes: object(3)
memory usage: 1.3+ GB


In [12]:
active_recent = df_2.uid.value_counts()

In [13]:
len(active_recent) 

273222

In [14]:
type(active_recent[0])

numpy.int64

In [15]:
active_set_recent = set(active_recent.index)
len(active_set_recent)

273222

In [16]:
# Churn user set:
churn_set = active_set - active_set_recent
len(churn_set)

495972

In [17]:
# Loyal user set:
loyal_set = active_set & active_set_recent
len(loyal_set)

239225

### Down sample and save reduced dataframe

In [18]:
import random

random.seed(42)

In [19]:
loyal_sample = random.sample(loyal_set,len(loyal_set)/10)
len(loyal_sample)

23922

In [20]:
churn_sample = random.sample(churn_set,len(churn_set)/10)
len(churn_sample)

49597

In [30]:
churn_sample_list = list(churn_sample)

In [31]:
loyal_sample_list = list(loyal_sample)

In [32]:
outfile = open("/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/churn_sample_list.pkl","w") 

In [35]:
import numpy as np
np.save("/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/churn_sample_list",churn_sample_list)

In [39]:
np.save("/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/loyal_sample_list",loyal_sample_list)

In [97]:
df_churn = df_1.loc[df_1.uid.isin(churn_sample),:]

KeyboardInterrupt: 

In [75]:
df_churn.shape

(3289903, 3)

In [76]:
df_loyal_log = df_1.loc[df_1.uid.isin(loyal_sample),:]

In [78]:
df_loyal_log.shape

(11388612, 3)

In [80]:
df_churn.to_csv('/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/churn_df_sample.csv',sep='\t', encoding='utf-8')
df_loyal_log.to_csv('/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/loyal_df_sample.csv',sep='\t', encoding='utf-8')

In [22]:
import glob

In [23]:
filepath = '/Volumes/SHARE/Bittiger/Capstone/data/play/*play.log'
files = glob.glob(filepath)
# amount of files
len(files)

138

In [None]:
schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name','label']

In [28]:
output = open('/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/user_sample_play.log','a')

In [29]:
import time


for the_file in files:
    current_time = time.clock()

    with open(the_file, 'r') as f:
        lines = f.readlines()
        file_name = f.name.split('/')[-1]
        print('processing file: %s' % file_name)
        for line in lines:
            user_id = line.strip('\n').split('\t')[0]
            if user_id in churn_sample:
                contents_to_wirte = line.strip('\n').split('\t')
                contents_to_wirte.extend((file_name, '1'))
            elif user_id in loyal_sample:
                contents_to_wirte = line.strip('\n').split('\t')
                contents_to_wirte.extend((file_name, '0'))
            else:
                continue  
            output.write('\t'.join(contents_to_wirte)+'\n')
    print('...costs %.2f seconds' % (time.clock()-current_time))

processing file: 20170302_1_play.log


KeyboardInterrupt: 

In [None]:
output.close()

In [None]:
df_play = pd.read_csv('/Users/Xiaoxi/Desktop/BitTiger/Capstone/data/output/user_sample_play.log',
                 delimiter='\t',header=None,index_col=None,names = schema,
                 dtype = {'uid':'str', 'song_id':'str','song_type' : 'str'})
df_play.head()

In [None]:
df_play.info()