# Music Box Churn Prediction and Recommendation using Spark

# Data mining

The data mining includes:
1. downloading the data, which are gzipped tarballs
2. unzipping the tarballs to get the log files
3. combining the log files into a single log file


## 1. Create directories

In [1]:
import os

# Everything is stored under ./data/ relative to the current working
# directory. `data_dir` keeps its trailing slash because later cells build
# paths with `data_dir + subdir`.
path = os.getcwd()
data_dir = path + '/data/'

# exist_ok=True makes this cell safe to re-run and removes the
# check-then-create race of `if not isdir: mkdir`.
os.makedirs(data_dir, exist_ok=True)

# One sub-directory per log type.
for subdir in ['play', 'down', 'search']:
    os.makedirs(data_dir + subdir, exist_ok=True)

## 2. Download data

* Download data from [https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/list.html](https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/list.html).
* However, this page is generated by JavaScript, so I cannot use `BeautifulSoup(r.text, 'html.parser')`.
  I need to go to [https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com](https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com) and use `BeautifulSoup(r.text, 'lxml')`
* If I call `list.remove()` on the list I am looping over, the element after the removed one shifts into the removed position and the `for` loop then moves on to the next index, so the shifted element is skipped. This is not what I want.
  * Instead, create an empty list and append every element that passes the selection conditions.
* `20170422_3_play.log (1).tar.gz` causes a problem because there is a space in the filename.
  * Using `urllib.parse.unquote()` keeps the space.
  * This still fails, so skip this file and download it manually.

In [2]:
import requests
from bs4 import BeautifulSoup
import re # regular expension
import urllib

# url = 'https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/list.html'
url = 'https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com'

def download_files(url):
    """Download the play/down/search tarballs listed at ``url``.

    The listing at ``url`` is fetched and parsed; every text node ending in
    ``.gz`` that mentions ``search``, ``play`` or ``down`` and whose first
    ``_``-separated token is ``'20170330'`` or later (string comparison) is
    selected. Each selected file is stored as ``data_dir/<subdir>/<name>``,
    where ``<subdir>`` is the last ``_``-separated token of the part of the
    name before its first dot. ``data_dir`` is the module-level data
    directory.

    Files that already exist locally are not downloaded again, files whose
    target sub-directory does not exist are skipped, and
    ``20170422_3_play.log (1).tar.gz`` is always skipped.

    Args:
        url: Base URL of the listing; file names are appended to it.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
    """
    # `import urllib` alone does not load these submodules; without the
    # explicit imports the calls below only work if some other library
    # happened to import them first.
    import urllib.parse
    import urllib.request

    r = requests.get(url)
    # Fail loudly instead of parsing an error page as if it were the listing.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    # Build a new list rather than removing from the result while iterating.
    # The dot is escaped so that only a literal ".gz" suffix matches.
    gz_files = [
        str(gz)
        for gz in soup.find_all(text=re.compile(r'\.gz$'))
        if ('search' in gz or 'play' in gz or 'down' in gz)
        and gz.split('_')[0] >= '20170330'
    ]

    for gz in gz_files:
        subdir = gz.split('.')[0].split('_')[-1]
        target_dir = os.path.join(data_dir, subdir)
        local_file = os.path.join(target_dir, gz)
        download_file = url + '/' + gz
        print(download_file)
        if gz == '20170422_3_play.log (1).tar.gz':
            print('%s need to be download manually.' % gz)
            continue
        if not os.path.isdir(target_dir):
            continue
        if os.path.isfile(local_file):
            print('File already exists')
        else:
            print('Download %s' % gz)
            # Percent-encode the file name so characters that are not valid
            # in a URL path are escaped; plain names pass through unchanged.
            urllib.request.urlretrieve(url + '/' + urllib.parse.quote(gz), local_file)

In [3]:
# Fetch the listing at `url` and download every selected tarball that is not
# already present under `data_dir`.
download_files(url)

https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_1_down.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_1_search.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_2_down.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_2_search.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_3_down.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_3_play.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170330_3_search.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170331_1_down.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-west-2.amazonaws.com/20170331_1_play.log.tar.gz
File already exists
https://bittigermusicplayerdata.s3-us-we

## 3. Unzip downloaded files

* See [tarfile](https://docs.python.org/3/library/tarfile.html) module.
  * `'r:gz'`: Open for reading with gzip compression.
* Skip `20170424_1_play.log.tar.gz` and `20170424_2_play.log.tar.gz` because they are problematic files.
  * I tried downloading these two files again and untarring and unzipping them from the command line, but that didn't work either.

In [4]:
import tarfile
# import shutil

def untar_file(tarball_path, tar_to_path):
    """Extract a gzip-compressed tarball into a directory.

    Args:
        tarball_path: Path of the ``.tar.gz`` archive to read.
        tar_to_path: Directory that receives the extracted members.

    Raises:
        tarfile.TarError: If the archive cannot be read or, when extraction
            filters are available, a member is rejected by the filter.
        OSError: If the archive cannot be opened or a member cannot be
            written.
    """
    # `with` closes the TarFile object automatically.
    with tarfile.open(tarball_path, 'r:gz') as tar_obj:
        if hasattr(tarfile, 'data_filter'):
            # The archives come from the network: where the interpreter
            # supports extraction filters, refuse members that would land
            # outside `tar_to_path` (absolute paths, `..`, unsafe links).
            tar_obj.extractall(tar_to_path, filter='data')
        else:
            tar_obj.extractall(tar_to_path)

In [5]:
# For every log type: extract each tarball into a scratch `untar` directory,
# then move the extracted *.log file next to the tarball, named after the
# tarball with its `.tar.gz` suffix removed.
for subdir in ['play', 'down', 'search']:
    current_dir = data_dir + subdir
    untar_dir = '%s/untar' % current_dir
    if not os.path.isdir(untar_dir):
        os.mkdir(untar_dir)

    # The listing is taken once, before anything is extracted.
    for tar in os.listdir(current_dir):
        if not tar.endswith('.tar.gz'):
            continue

        print('Extract %s' % tar)
        output = tar.replace('.tar.gz', '')

        # These two archives cannot be extracted; leave them alone.
        if tar in ('20170424_1_play.log.tar.gz', '20170424_2_play.log.tar.gz'):
            print('%s has problem, skip it' % tar)
            continue

        untar_file('%s/%s' % (current_dir, tar), untar_dir)

        extracted = [name for name in os.listdir(untar_dir) if name.endswith('.log')]
        for log in extracted:
            os.rename('%s/%s' % (untar_dir, log), '%s/%s' % (current_dir, output))

Extract 20170405_3_play.log.tar.gz
Extract 20170414_1_play.log.tar.gz
Extract 20170508_1_play.log.tar.gz
Extract 20170406_2_play.log.tar.gz
Extract 20170505_3_play.log.tar.gz
Extract 20170420_1_play.log.tar.gz
Extract 20170419_3_play.log.tar.gz
Extract 20170408_1_play.log.tar.gz
Extract 20170506_2_play.log.tar.gz
Extract 20170411_1_play.log.tar.gz
Extract 20170403_2_play.log.tar.gz
Extract 20170428_3_play.log.tar.gz
Extract 20170425_1_play.log.tar.gz
Extract 20170511_1_play.log.tar.gz
Extract 20170330_3_play.log.tar.gz
Extract 20170503_2_play.log.tar.gz
Extract 20170415_1_play.log.tar.gz
Extract 20170404_3_play.log.tar.gz
Extract 20170407_2_play.log.tar.gz
Extract 20170509_1_play.log.tar.gz
Extract 20170504_3_play.log.tar.gz
Extract 20170430_3_play.log.tar.gz
Extract 20170421_1_play.log.tar.gz
Extract 20170507_2_play.log.tar.gz
Extract 20170409_1_play.log.tar.gz
Extract 20170418_3_play.log.tar.gz
Extract 20170401_3_play.log.tar.gz
Extract 20170410_1_play.log.tar.gz
Extract 20170402_2_p

Extract 20170406_3_down.log.tar.gz
Extract 20170405_2_down.log.tar.gz
Extract 20170512_1_down.log.tar.gz
Extract 20170503_3_down.log.tar.gz
Extract 20170426_1_down.log.tar.gz
Extract 20170330_2_down.log.tar.gz
Extract 20170428_2_down.log.tar.gz
Extract 20170403_3_down.log.tar.gz
Extract 20170412_1_down.log.tar.gz
Extract 20170422_1_down.log.tar.gz
Extract 20170507_3_down.log.tar.gz
Extract 20170418_2_down.log.tar.gz
Extract 20170430_2_down.log.tar.gz
Extract 20170504_2_down.log.tar.gz
Extract 20170407_3_down.log.tar.gz
Extract 20170416_1_down.log.tar.gz
Extract 20170404_2_down.log.tar.gz
Extract 20170502_3_down.log.tar.gz
Extract 20170427_1_down.log.tar.gz
Extract 20170501_2_down.log.tar.gz
Extract 20170331_2_down.log.tar.gz
Extract 20170429_2_down.log.tar.gz
Extract 20170413_1_down.log.tar.gz
Extract 20170402_3_down.log.tar.gz
Extract 20170401_2_down.log.tar.gz
Extract 20170421_2_search.log.tar.gz
Extract 20170509_1_search.log.tar.gz
Extract 20170507_2_search.log.tar.gz
Extract 201704

Check the number of tar.gz files and log files. They should be the same. If the numbers differ, some log files have the same filename.

In the `play/` directory, the number of log files must be less than the number of tar.gz files because there are two problematic tar.gz files.

In [6]:
# Compare the number of tarballs with the number of extracted logs in each
# log directory, removing the scratch `untar` directory along the way.
for subdir in ['play', 'down', 'search']:
    current_dir = data_dir + subdir

    tarballs = []
    logs = []
    for entry in os.listdir(current_dir):
        if entry.endswith('.tar.gz'):
            tarballs.append(entry)
            continue
        if entry.endswith('.log'):
            logs.append(entry)
            continue
        if entry == 'untar':
            # os.rmdir only removes an empty directory.
            os.rmdir('%s/%s' % (current_dir, entry))
            continue
        print('%s is not a tar.gz file and not a log file.' % entry)

    n_tarballs, n_logs = len(tarballs), len(logs)
    if n_tarballs == n_logs:
        continue
    print('In %s, number of files are different.' % current_dir)
    print('Number of tar.gz = %d' % n_tarballs)
    print('Number of log = %d' % n_logs)

In /Users/ytshen/Desktop/Machine_Learning/Music_Box/data/play, number of files are different.
Number of tar.gz = 128
Number of log = 126


## 4. Use shell script to combine log file

* `wc -l down/all_down_log play/all_play_log search/all_search_log`
```bash
7737424 down/all_down_log
145751036 play/all_play_log
8640336 search/all_search_log
```

#### play

In [None]:
# Bash script for the play logs: for each *.log file, append a tab and the
# first 8 characters of the file name to every line, then concatenate the
# results into all_play_log.
#
# In the script, `cd ... || exit 1` stops before any `rm` can run in the
# wrong directory, and "$f" / "${f}.fn" are quoted so file names containing
# spaces or glob characters are handled as single arguments.
play_contents = r'''#!/bin/bash
cd /Users/ytshen/Desktop/Machine_Learning/Music_Box/data/play || exit 1

for f in *.log
do
    echo "Processing $f"
    awk -v var="$f" '{print $0,"\t",substr(var,1,8)}' "$f" > "${f}.fn"
done

cat *.log.fn > all_play_log
#rm *.log
rm *.log.fn
'''

# Write the script to disk; it is run separately with `bash data/play.sh`.
with open('data/play.sh', 'w') as fout:
    fout.write(play_contents)

#### down

In [None]:
# Bash script for the down logs: for each *.log file, append a tab and the
# first 8 characters of the file name to every line, then concatenate the
# results into all_down_log.
#
# In the script, `cd ... || exit 1` stops before any `rm` can run in the
# wrong directory, and "$f" / "${f}.fn" are quoted so file names containing
# spaces or glob characters are handled as single arguments.
down_contents = r'''#!/bin/bash
cd /Users/ytshen/Desktop/Machine_Learning/Music_Box/data/down || exit 1

for f in *.log
do
    echo "Processing $f"
    awk -v var="$f" '{print $0,"\t",substr(var,1,8)}' "$f" > "${f}.fn"
done

# cat all log with filename to one file
cat *.log.fn > all_down_log
#rm *.log
rm *.log.fn
'''

# Write the script to disk; it is run separately with `bash data/down.sh`.
with open('data/down.sh', 'w') as fout:
    fout.write(down_contents)

#### search

In [None]:
# Bash script for the search logs: for each *.log file, append a tab and the
# first 8 characters of the file name to every line, then concatenate the
# results into all_search_log.
#
# In the script, `cd ... || exit 1` stops before any `rm` can run in the
# wrong directory, and "$f" / "${f}.fn" are quoted so file names containing
# spaces or glob characters are handled as single arguments.
search_contents = r'''#!/bin/bash
cd /Users/ytshen/Desktop/Machine_Learning/Music_Box/data/search/ || exit 1

for f in *.log
do
    echo "Processing $f"
    awk -v var="$f" '{print $0,"\t",substr(var,1,8)}' "$f" > "${f}.fn"
done

# cat all log with filename to one file
cat *.log.fn > all_search_log
#rm *.log
rm *.log.fn
'''

# Write the script to disk; it is run separately with `bash data/search.sh`.
with open('data/search.sh', 'w') as fout:
    fout.write(search_contents)

Execute shell script

In [None]:
import os
# os.system('bash data/play.sh')
# os.system('bash data/down.sh')
# os.system('bash data/search.sh')

## 5. Remove carriage return (^M)

* Some lines contain ^M (carriage return), which must be removed.
  * See [How to remove ^M from a text file and replace it with the next line](https://stackoverflow.com/questions/11755208/how-to-remove-m-from-a-text-file-and-replace-it-with-the-next-line).
  * The above doesn't work, so I use `sed -e 's/\r//g' oldfile > newfile` to remove ^M.

In [7]:
import subprocess

# Use sed to remove ^M (carriage returns) in the log files.
for subdir in ['play', 'down', 'search']:
    current_dir = data_dir + subdir
    logs = [
        os.path.join(current_dir, f)
        for f in os.listdir(current_dir)
        if f.endswith('.log')
    ]

    for log in logs:
        cleaned = log + '.new'
        with open(cleaned, 'wb') as fout:
            # Pass the arguments as a list (no shell) so paths containing
            # spaces or shell metacharacters stay single arguments. Python
            # turns '\r' into a real carriage-return character, so sed is
            # given a literal CR to delete.
            # check=True raises if sed fails, so a partial `.new` file never
            # replaces the original log.
            subprocess.run(['sed', '-e', 's/\r//g', log], stdout=fout, check=True)
        os.replace(cleaned, log)

## 6. Combine log files into one large log

* Combine all log files in the `play`, `down`, and `search` directories.
  * Add the date information into file.
* Because we have the combined log files `all_play.log`, `all_down.log`, and `all_search.log`, we can delete the other log files.
* `encoding='latin-1'` is needed when opening files for reading and writing; otherwise the Simplified Chinese characters cannot be displayed correctly.

In [8]:
def combine_log_files(current_dir):
    """Concatenate every ``*.log`` file in a directory into one dated log.

    The result is written to ``<current_dir>/all_<name>.log``, where
    ``<name>`` is the last component of ``current_dir``. An existing output
    file is deleted first, so it is never read back as input. Each input
    line is written with a tab and the file's date appended; the date is the
    part of the file name before its first underscore.

    Input files are processed in sorted file-name order. Carriage returns
    are removed from every line, and a final line without a trailing newline
    still gets its date and its own line ending.

    Args:
        current_dir: Directory containing the ``*.log`` files to combine.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            opened, read, written or removed.
    """
    subdir = os.path.basename(os.path.normpath(current_dir))
    output = os.path.join(current_dir, 'all_' + subdir + '.log')
    # Remove a previous result before listing the inputs.
    if os.path.isfile(output):
        os.remove(output)

    # Sorted so the combined file does not depend on os.listdir() order.
    logs = sorted(
        os.path.join(current_dir, f)
        for f in os.listdir(current_dir)
        if f.endswith('.log')
    )

    # newline='\n' turns off newline translation: when reading, lines are
    # split only at '\n' (a lone carriage return does not start a new line);
    # when writing, '\n' is written unchanged.
    with open(output, 'w', encoding='latin-1', newline='\n') as fout:
        for logfile in logs:
            date = os.path.basename(logfile).split('_')[0]
            suffix = '\t' + date + '\n'
            with open(logfile, 'r', encoding='latin-1', newline='\n') as fin:
                for line in fin:
                    # '\r' is the carriage-return character; the raw string
                    # r'\r' would instead match a backslash followed by "r".
                    fout.write(line.rstrip('\n').replace('\r', '') + suffix)

In [9]:
# Build the combined all_<type>.log file inside each log directory.
for log_type in ('play', 'down', 'search'):
    combine_log_files(data_dir + log_type)

## 7. Remove log files

In [10]:
for subdir in ['play', 'down', 'search']:
    current_dir = data_dir + subdir
    all_log = 'all_' + subdir + '.log'

    # Move the combined log file to the parent directory first: its name
    # also ends in `.log`, so the cleanup below would delete it otherwise.
    # If the combined file is missing, os.rename raises and nothing is
    # deleted.
    os.rename(os.path.join(current_dir, all_log), os.path.join(data_dir, all_log))

    # Remove all the small log files.
    for f in os.listdir(current_dir):
        if f.endswith('.log'):
            os.remove(os.path.join(current_dir, f))