## Data Preparation: Churn labeling and Downsampling

#### 1. Read the *play.log files line by line, and write only the user ID, device and date of log into a separate file.

#### 2. Label churn users: those who played more than three times before the cutoff day but had no acitivity after the cutoff.

#### 3. Down sampling is necessary. There are more than 50,000 users and 15 GB log data, which is not necessary for the churn prediction and which costs too much time to process. I used  a down sampling ratio of 1/10, to only include 1/10 users from the active and churn users for the churn prediction model.

### 1. Churn labeling


In [1]:
import glob


In [2]:
filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'
files = glob.glob(filepath)
# amount of files
len(files)

138

In [3]:
# take a look at one of the files
files[0]

'/Users/ZhijingYe/Desktop/data/play/20170410_2_play.log'

In [4]:
# get an idea how many lines are in one .log file
with open(files[0],'r') as f:
    lines = f.readlines()
    log_lines = len(lines)
log_lines

1149628

In [5]:
# Check one line
lines[3]

'168071768\tar\t1248464\t0\t\xe6\x88\x90\xe7\x8e\x8b\xe8\xb4\xa5\xe5\xaf\x87\t\xe9\x99\x88\xe5\xb0\x8f\xe6\x98\xa5\t187\t187\t0\n'

In [6]:
test_list = lines[3].strip('\n').split('\t')
test_list.append(files[0].split('/')[-1])
test_list

['168071768',
 'ar',
 '1248464',
 '0',
 '\xe6\x88\x90\xe7\x8e\x8b\xe8\xb4\xa5\xe5\xaf\x87',
 '\xe9\x99\x88\xe5\xb0\x8f\xe6\x98\xa5',
 '187',
 '187',
 '0',
 '20170410_2_play.log']

In [7]:
import pandas as pd
import numpy as np

schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name']
df = pd.read_csv(files[0], sep='\t',header=None,index_col=None,names=schema )
df.head()
# Note the file_name will be added later

  data = self._reader.read(nrows)


Unnamed: 0,uid,device,song_id,song_type,song_name,singer,play_time,song_length,paid_flag,file_name
0,168335200.0,ar,6429024,0,相对湿度,郑希怡,238,238,0,
1,168310500.0,ar,3348254,0,曾经心痛,袁娅维,21,312,0,
2,168308200.0,ar,5436214,0,Dream A Little Dream,Robbie Williams,246,247,0,
3,168071800.0,ar,1248464,0,成王败寇,陈小春,187,187,0,
4,168480800.0,ar,317412,0,Kissy Kissy,Smile.DK,188,189,0,


### Save reduced play logs to two log files.
Only the first two items of each line, user id and device, and the date of the log are saved, so it's called reduced play logs.

In [8]:
# 04/22 is the cutoff date for labeling churns
cutoff = '20170422'

In [9]:
# destination file names to save the reduced logs.
first_period_log = '/Users/ZhijingYe/Desktop/data/output/play_till_cutoff.log'
second_period_log = '/Users/ZhijingYe/Desktop/data/output/play_after_cutoff.log'

In [10]:
import time

for each_file in files:
    current_time = time.clock()

    with open(each_file, 'r') as f:
        lines = f.readlines()
        filename = f.name.split('/')[-1]
        print('processing file: %s' % filename)
        #choose the output path
        if filename < cutoff:
            output_path = first_period_log
        else:
            output_path = second_period_log
        # write to the output file
        with open(output_path, 'a') as output:
            for line in lines:
                fields_to_keep = line.strip('\n').split('\t')[:2]
                fields_to_keep.append(filename)
                output.write('\t'.join(fields_to_keep)+'\n')
    print('...costs %.2f seconds' % (time.clock()-current_time))

processing file: 20170410_2_play.log
...costs 3.02 seconds
processing file: 20170410_3_play.log
...costs 2.89 seconds
processing file: 20170427_3_play.log
...costs 2.90 seconds
processing file: 20170427_2_play.log
...costs 2.27 seconds
processing file: 20170504_3_play.log
...costs 2.05 seconds
processing file: 20170504_2_play.log
...costs 1.93 seconds
processing file: 20170508_1_play.log
...costs 2.01 seconds
processing file: 20170505_1_play.log
...costs 2.07 seconds
processing file: 20170411_1_play.log
...costs 2.74 seconds
processing file: 20170426_1_play.log
...costs 2.23 seconds
processing file: 20170509_3_play.log
...costs 1.81 seconds
processing file: 20170509_2_play.log
...costs 1.93 seconds
processing file: 20170401_2_play.log
...costs 4.37 seconds
processing file: 20170401_3_play.log
...costs 5.01 seconds
processing file: 20170423_1_play.log
...costs 2.54 seconds
processing file: 20170414_1_play.log
...costs 1.88 seconds
processing file: 20170418_2_play.log
...costs 2.39 secon

In [11]:
import pandas as pd

schema = ['uid','device','file_name']
df_1 = pd.read_csv(first_period_log,delimiter='\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})
df_1.head()

Unnamed: 0,uid,device,file_name
0,168335198,ar,20170410_2_play.log
1,168310452,ar,20170410_2_play.log
2,168308159,ar,20170410_2_play.log
3,168071768,ar,20170410_2_play.log
4,168480816,ar,20170410_2_play.log


In [12]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 321455544 entries, 0 to 321455543
Data columns (total 3 columns):
uid          object
device       object
file_name    object
dtypes: object(3)
memory usage: 9.6+ GB


In [13]:
# change file_name to date
def get_date(file_name):
    tmp_list = str(file_name).split('_')
    return tmp_list[0]
df_1['date'] = df_1['file_name'].map(get_date)

In [14]:
df_1 = df_1.drop(['file_name'], axis = 1)

### Explore the data

In [18]:
len(df_1['uid'].unique())

847330

In [19]:
df_1['device'].unique()

array(['ar', 'ip', 'mc', 'wp', 'ar ', 'ip ', '20170302_1_play.log',
       '168589573', '20170301_play.log', nan, '20170303_1_play.log',
       '20170339_1_play.log'], dtype=object)

In [21]:
#  len(df_1['file_name'].unique())

In [22]:
df_1.uid.value_counts()

1685126      11778180
37025504      8535228
751824        6796068
1791497       5987916
497685        4519674
1062806       3776580
736305        2829009
1685126       1884981
0             1815633
37025504      1381125
1749320       1207488
1679121        784674
46532274       756681
28638487       634440
637650         350460
...
167679654     3
168963526     3
168327556     3
154699061     3
168963528     3
168761341     3
154652167     3
168686496     3
154828622     3
154494259     3
154502301     3
168280933     3
154426629     3
167932419     3
168891406     3
Length: 847329, dtype: int64

Looks like uid = 0 should be testing id. And those uid with log numbers larger than that of uid = 0 may be robot. Check the device type of these ids. These user id will be deleted later.


### Criteria of active user: number of activities before cutoff date >= 3
### Criteria of churn user: active users that have no activity after cutoff date
### Criteria of loyal user: a user has>= 3 activities before cutoff date and has recent activity after cutoff date

In [23]:
# total number of active users and inactive users before the cutoff date
active = df_1.uid.value_counts()>=3
sum(active),sum(active==0)

(847329, 0)

In [24]:
active_users = [active.index[i] for i in xrange(len(active)) if active[i]]

In [25]:
len(active_users)

847329

In [26]:
active_set = set(active_users)

In [27]:
len(active_set)

847329

In [28]:
# Now process the recent play.log file to get recent users.
df_2 = pd.read_csv(second_period_log,delimiter='\t',header=None,index_col=None,names=schema, dtype = {'uid':'str'})
df_2.head()

Unnamed: 0,uid,device,file_name
0,169026646,ar,20170427_3_play.log
1,168553991,ar,20170427_3_play.log
2,1685126,ar,20170427_3_play.log
3,168845172,ar,20170427_3_play.log
4,168538454,ar,20170427_3_play.log


In [29]:
df_2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 172407126 entries, 0 to 172407125
Data columns (total 3 columns):
uid          object
device       object
file_name    object
dtypes: object(3)
memory usage: 5.1+ GB


In [30]:
active_recent = df_2.uid.value_counts()

In [31]:
len(active_recent) 

273222

In [32]:
type(active_recent[0])

numpy.int64

In [33]:
active_set_recent = set(active_recent.index)
len(active_set_recent)

273222

In [34]:
# Churn user set:
churn_set = active_set - active_set_recent
len(churn_set)

598465

In [35]:
# Loyal user set:
loyal_set = active_set & active_set_recent
len(loyal_set)

248864

### Down sample and save reduced dataframe

In [36]:
import random

random.seed(42)

In [37]:
loyal_sample = random.sample(loyal_set,len(loyal_set)/10)
len(loyal_sample)

24886

In [38]:
churn_sample = random.sample(churn_set,len(churn_set)/10)
len(churn_sample)

59846

In [39]:
churn_sample_list = list(churn_sample)

In [40]:
loyal_sample_list = list(loyal_sample)

In [41]:
outfile = open("/Users/ZhijingYe/Desktop/data/output/churn_sample_list.pkl","w") 

In [42]:
import numpy as np
np.save("/Users/ZhijingYe/Desktop/data/output/churn_sample_list",churn_sample_list)

In [43]:
np.save("/Users/ZhijingYe/Desktop/data/output/loyal_sample_list",loyal_sample_list)

In [44]:
df_churn = df_1.loc[df_1.uid.isin(churn_sample),:]

In [45]:
df_churn.shape

(9814302, 3)

In [46]:
df_loyal_log = df_1.loc[df_1.uid.isin(loyal_sample),:]

In [47]:
df_loyal_log.shape

(35070042, 3)

In [48]:
df_churn.to_csv('/Users/ZhijingYe/Desktop/data/output/churn_df_sample.csv',sep='\t', encoding='utf-8')
df_loyal_log.to_csv('/Users/ZhijingYe/Desktop/data/output/loyal_df_sample.csv',sep='\t', encoding='utf-8')

In [49]:
import glob

In [50]:
filepath = '/Users/ZhijingYe/Desktop/data/play/*play.log'
files = glob.glob(filepath)
# amount of files
len(files)

138

In [51]:
schema = ['uid','device','song_id','song_type','song_name','singer','play_time','song_length','paid_flag','file_name','label']

In [52]:
output = open('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log','a')

In [None]:
import time


for the_file in files:
    current_time = time.clock()

    with open(the_file, 'r') as f:
        lines = f.readlines()
        file_name = f.name.split('/')[-1]
        print('processing file: %s' % file_name)
        for line in lines:
            user_id = line.strip('\n').split('\t')[0]
            if user_id in churn_sample:
                contents_to_wirte = line.strip('\n').split('\t')
                contents_to_wirte.extend((file_name, '1'))
            elif user_id in loyal_sample:
                contents_to_wirte = line.strip('\n').split('\t')
                contents_to_wirte.extend((file_name, '0'))
            else:
                continue  
            output.write('\t'.join(contents_to_wirte)+'\n')
    print('...costs %.2f seconds' % (time.clock()-current_time))

processing file: 20170410_2_play.log


In [None]:
output.close()

In [None]:
df_play = pd.read_csv('/Users/ZhijingYe/Desktop/data/output/user_sample_play.log',
                 delimiter='\t',header=None,index_col=None,names = schema,
                 dtype = {'uid':'str', 'song_id':'str','song_type' : 'str'})
df_play.head()

In [None]:
df_play.info()