In [1]:
import os

# Bootstrap PySpark by executing Spark's own interactive shell script in this
# namespace; on success it reports "SparkSession available as 'spark'".
# SPARK_HOME is honoured when set; otherwise fall back to the Homebrew
# apache-spark 2.2.1 location this notebook was originally written against.
spark_home = os.environ.get('SPARK_HOME') or '/usr/local/Cellar/apache-spark/2.2.1/libexec'
filename = os.path.join(spark_home, 'python', 'pyspark', 'shell.py')

# Read through a context manager so the file handle is closed promptly, and
# compile with the real path so tracebacks point at shell.py, not "<string>".
with open(filename) as shell_file:
    exec(compile(shell_file.read(), filename, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.1
      /_/

Using Python version 3.6.1 (default, May 11 2017 13:04:09)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the session for this notebook, building one named "Feature" if needed
# (this can be slow on a local machine).
spark = (
    SparkSession.builder
    .appName("Feature")
    .getOrCreate()
)

In [4]:
# Load the tab-separated play log: the first row supplies the column names
# and Spark infers each column's type from the data.
df_whole = spark.read.csv(
    '../data/all_play.log.fn',
    header=True,
    inferSchema=True,
    sep='\t',
)

In [5]:
# Derive the required columns (date, uid) from the raw play log and keep only
# the columns used downstream.
# https://stackoverflow.com/questions/46410887/pyspark-string-matching-to-create-new-column

from pyspark.sql.functions import regexp_extract, col

# Use regular expressions for the string manipulation. The patterns are raw
# strings so that regex escapes such as \w are passed through verbatim and
# Python does not warn about an invalid string escape sequence.

# required feature 1 - date
# e.g. "20170301_play.log" -> "20170301"
df_whole = df_whole.withColumn('date', regexp_extract(col('file_name'), r'([0-9]{8})(_)(\w+)', 1))

# required feature 2 - uid
# e.g. "154422682" -> '154422682' (keep the first run of nine digits)
df_whole = df_whole.withColumn('uid', regexp_extract(col('uid'), r'([0-9]{9})', 1))

# optional features - play_time, song_length, song_id (maybe used for music recommendation)
# select all of these features (date, uid, play_time, song_length, song_id)
# into a new dataframe
df_select = df_whole.select(['date', 'uid', 'play_time', 'song_length', 'song_id'])

In [6]:
# Discard every row that has a missing value in at least one column
df_select = df_select.na.drop(how='any')

In [7]:
# Keep only the rows whose date string is not empty
df_select = df_select.filter(df_select['date'] != '')

In [8]:
# Correct the erroneous date value: 20170339 -> 20170329
# Only the function that is needed is imported: a wildcard import of
# pyspark.sql.functions would shadow Python built-ins such as sum, min, max,
# abs and round for the rest of the notebook.
from pyspark.sql.functions import regexp_replace
df_select_remove = df_select.withColumn('date', regexp_replace('date', '20170339', '20170329'))

In [9]:
# Print the row count for every distinct date, sorted by date
# (at most 90 rows, cell values not truncated)
(
    df_select_remove
    .groupBy('date')
    .count()
    .orderBy('date')
    .show(n=90, truncate=False)
)

+--------+-------+
|date    |count  |
+--------+-------+
|20170301|3421492|
|20170302|2452263|
|20170303|1851942|
|20170304|1709097|
|20170305|1607932|
|20170306|1351465|
|20170307|1288366|
|20170308|1230621|
|20170309|1172860|
|20170329|2193336|
|20170330|4755802|
|20170331|7033246|
|20170401|5792550|
|20170402|5699764|
|20170403|3588991|
|20170404|4941358|
|20170405|3850905|
|20170406|3881751|
|20170407|3807564|
|20170408|4053207|
|20170409|3945463|
|20170410|3435108|
|20170411|2332928|
|20170412|3457415|
|20170413|3380796|
|20170414|2278862|
|20170415|3591673|
|20170416|3558109|
|20170417|3112159|
|20170418|3013580|
|20170419|3021366|
|20170420|3008111|
|20170421|3038619|
|20170422|3247357|
|20170423|3223963|
|20170424|1506606|
|20170425|2819081|
|20170426|2789468|
|20170427|3127340|
|20170428|2952460|
|20170429|3337713|
|20170430|3153132|
|20170501|2953738|
|20170502|2486697|
|20170503|2555248|
|20170504|2516949|
|20170505|2571197|
|20170506|2874456|
|20170507|2767560|
|20170508|25

In [12]:
from datetime import datetime
from pyspark.sql.functions import col, to_date, udf
from pyspark.sql.types import DateType, DoubleType, IntegerType

# https://stackoverflow.com/questions/38080748/convert-pyspark-string-to-date-format


def func(date_column):
    """Convert a column of 'yyyyMMdd' strings into a DateType column.

    Delegates to Spark's built-in ``to_date`` (format argument available
    since Spark 2.2) rather than a row-by-row Python UDF built on
    ``datetime.strptime``.

    NOTE(review): strings that do not match the format are expected to come
    back as null instead of failing the job as ``strptime`` would -- confirm
    on the Spark version in use.
    """
    return to_date(date_column, 'yyyyMMdd')


df_select_use = df_select_remove.withColumn('date', func(col('date')))

In [11]:
# Restrict the data to dates on or after 2017-03-30, then preview the result
start_date = '2017-03-30'
on_or_after_start = df_select_use['date'] >= start_date
new_filtered_df_select_use = df_select_use.where(on_or_after_start)
new_filtered_df_select_use.show()

+----------+---------+---------+-----------+--------+
|      date|      uid|play_time|song_length| song_id|
+----------+---------+---------+-----------+--------+
|2017-03-30|168550892|      254|        254|23491655|
|2017-03-30|168540455|      189|        190|  298250|
|2017-03-30|168551247|       78|        149|11881432|
|2017-03-30|168549788|       16|        242|  295469|
|2017-03-30|168551248|       87|         87|21393368|
|2017-03-30|168550496|      369|       2747|12495422|
|2017-03-30|168551331|      231|        231|20671171|
|2017-03-30|168535490|      283|        283| 6616004|
|2017-03-30|168539760|      197|        198| 4732048|
|2017-03-30|168551373|       14|        212| 3378911|
|2017-03-30|168544926|        6|         28| 4403788|
|2017-03-30|168551042|      106|        277|  505355|
|2017-03-30|168551026|        2|         31|19477157|
|2017-03-30|168532580|       67|        137|21762903|
|2017-03-30|168551417|       27|        226|  727161|
|2017-03-30|168551430|      

In [13]:
# Persist the filtered rows as CSV with a header row.
# mode='overwrite' keeps this cell re-runnable: with the default save mode the
# write raises once the output path already exists from an earlier run.
new_filtered_df_select_use.write.csv('new_filtered_df_select_use.csv', header=True, mode='overwrite')