In [2]:
from pyspark.sql import SparkSession
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from pyspark.sql import functions as F
from pyspark.sql import Window
import pandas as py
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

spark = SparkSession.builder \
    .master('local') \
    .appName('first_model') \
    .getOrCreate()

In [3]:
trans=spark.read.csv('/Users/xuetong/customer_churn/data_partition/p0/transactions.csv',inferSchema=True, header=True)

In [4]:
trans=trans\
    .withColumn("transaction_date",trans["transaction_date"].cast('string'))\
    .withColumn("membership_expire_date",trans["membership_expire_date"].cast('string'))

trans = trans\
        .withColumn(
            'transaction_date',
                F.to_date(
                    F.unix_timestamp('transaction_date', 'yyyyMMdd').cast('timestamp')))\
        .withColumn(
            'membership_expire_date',
                F.to_date(
                    F.unix_timestamp('membership_expire_date', 'yyyyMMdd').cast('timestamp')))

## Labeling Data

In [5]:
def create_lable(df,churn_days):
    '''
    label whether a transaction of a customer will churn, churn is defined by without membership with more than 
    the parameter churn_days
    params
    -------
    df: spark dataframe, where transaction_date and membership_expire_date must be datetype 
    churn_days: days without membership to be a churned customer
    
    returns
    -------
    df:spark dataframe
    '''
    df = df.orderBy('msno','transaction_date','membership_expire_date')
    w = Window.partitionBy('msno').orderBy('transaction_date','membership_expire_date')
    df = df.withColumn('next_trans_date',F.lead(F.col('transaction_date')).over(w))
    df = df.withColumn('diff_time',F.datediff(df.next_trans_date,df.membership_expire_date))
    df = df.withColumn ('churn',df.diff_time>churn_days)
    df = df.withColumn('churn_date', F.when(df.churn == True, F.date_add(df.membership_expire_date,churn_days+1)))
    return df

In [6]:
trans=create_lable(trans,30)
trans.limit(5).toPandas()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,next_trans_date,diff_time,churn,churn_date
0,+uYayEzlryVEc1b148hc46DU6sr/YHnPE4OCgPZsQGw=,35,7,0,0,0,2015-06-13,2015-06-15,0,,,,
1,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,35,7,0,0,0,2016-11-20,2016-11-27,0,,,,
2,37rQxo+XZR9P0KnW4yVweOqJDpG3vdOX4sIfwCUGGUc=,23,0,0,149,1,2015-03-31,2015-04-30,0,2015-06-30,61.0,True,2015-05-31
3,37rQxo+XZR9P0KnW4yVweOqJDpG3vdOX4sIfwCUGGUc=,23,0,0,149,1,2015-06-30,2015-07-31,0,2015-07-31,0.0,False,
4,37rQxo+XZR9P0KnW4yVweOqJDpG3vdOX4sIfwCUGGUc=,23,0,0,149,1,2015-07-31,2015-08-31,0,2015-08-31,0.0,False,


In [7]:
trans=trans.withColumnRenamed('msno','msno1')

## Feature Engineering

In [10]:
############features from transaction dataset
def trans_feature(trans):
    '''
    create new feature for transaction dataframe
    params
    -------
    trans: spark dataframe, transaction
    
    returns
    -------
    trans: spark dataframe, transaction
    '''
    
    ## price difference
    trans = trans.withColumn('price_diff',trans.plan_list_price-trans.actual_amount_paid)
    ## price per day 
    trans = trans.withColumn('amt_per_day',trans.actual_amount_paid/trans.payment_plan_days)
    ## any discount 
    trans = trans.withColumn('is_discount',F.when(trans.price_diff>0,1).otherwise(0))
    ##previous transaction date 
    w2 = Window.partitionBy('msno1').orderBy('transaction_date','membership_expire_date')
    trans = trans.withColumn('prev_trans_date',F.lag(trans.transaction_date).over(w2))
    ## whether previous transaction is canceled
    trans = trans.withColumn('prev_canceled',F.lag(trans.is_cancel).over(w2))
    ## how long ago is previous transaction 
    trans = trans.withColumn('prev_tran_diff',F.datediff(trans.transaction_date,trans.prev_trans_date))
    return trans

In [11]:
trans = trans_feature(trans)

In [12]:
##########features from user logs dataset 
logs=spark.read.csv('/Users/xuetong/customer_churn/data_partition/p0/logs.csv',inferSchema=True,header=True)
### turn date from integer type to datetype
logs=logs.withColumn("date",logs.date.cast('string'))
logs=logs.withColumn("date",F.to_date(F.unix_timestamp('date','yyyyMMdd').cast('timestamp')))

def log_feature(logs):
    '''
    create temperal feature from logs data frame where column 'date' should be datetype 
    params
    -------
    logs: spark dataframe,logs
    
    returns
    -------
    logs: spark dataframe.logs
    '''
    
    #relative temporal data
    days = lambda i: i * 86400 
    ## two week window 
    window3 = Window.partitionBy(logs.msno).orderBy(logs.date.cast("timestamp").cast("long")).rangeBetween(-days(14),0)
    ## 1 month window
    window4 = Window.partitionBy(logs.msno).orderBy(logs.date.cast("timestamp").cast("long")).rangeBetween(-days(30),0)
    ## user history window 
    window5 = Window.partitionBy('msno').orderBy('date')
    ## previous 2 month to 1 month window
    window6 = Window.partitionBy(logs.msno).orderBy(logs.date.cast("timestamp").cast("long")).rangeBetween(-days(60),0)

    #######  unique songs played
    # sum of daily unique song over last two weeks 
    logs = logs.withColumn('fourtheen_sum_uniq', F.sum(logs.num_unq).over(window3))
    # max of daily unique song over last two weeks 
    logs = logs.withColumn('fourtheen_max_uniq', F.max(logs.num_unq).over(window3))
    # average of daily unique song over last two weeks 
    logs = logs.withColumn('fourtheen_avg_uniq', logs.fourtheen_sum_uniq/14)
    #standard deviation of daily unique song over last two weeks
    logs = logs.withColumn('std_uniq_foutheen_days', F.stddev(logs.num_unq).over(window3))


    # sum of daily unique song over last 30 days
    logs = logs.withColumn('thirty_sum_uniq', F.sum(logs.num_unq).over(window4))
    # max of daily unique song over last 30 days
    logs = logs.withColumn('thirty_max_uniq', F.max(logs.num_unq).over(window4))
    # average of daily unique song over 30 days  
    logs = logs.withColumn('thirty_avg_uniq', logs.thirty_sum_uniq/30)
    #standard deviation of daily unique song over last 30 days
    logs = logs.withColumn('std_uniq_thirty_days', F.stddev(logs.num_unq).over(window4))

    # sum of daily unique song over last 60 days
    logs = logs.withColumn('sixty_sum_uniq', F.sum(logs.num_unq).over(window6))
    # max of daily unique song over last 60 days
    logs = logs.withColumn('sixty_max_uniq', F.max(logs.num_unq).over(window6))
    # average of daily unique song over last 60 days
    logs = logs.withColumn('sixty_avg_uniq', logs.sixty_sum_uniq/30)
    #standard deviation of daily unique song over last 60 days
    logs = logs.withColumn('std_uniq_thirty_days', F.stddev(logs.num_unq).over(window6))

    # difference of average number of unique songs between previous 14 days and previous 30 days
    logs = logs.withColumn('diff_uniq_avg_14_30', logs.fourtheen_avg_uniq - logs.thirty_avg_uniq)
    logs = logs.withColumn('diff_uniq_avg_30_60', logs.thirty_avg_uniq - logs.sixty_avg_uniq)
    # cumulative sum of unique song in history
    logs = logs.withColumn('unique_song_cum_sum', F.sum(logs.num_unq).over(window5))


    #######  total senconds played 
    # daily total seconds over last 14 days
    logs = logs.withColumn('sum_secs_fourtheen_days',F.sum(logs.total_secs).over(window3))
    logs = logs.withColumn('max_secs_fourtheen_days',F.max(logs.total_secs).over(window3))
    logs = logs.withColumn('avg_secs_foutheen_days',logs.sum_secs_fourtheen_days/14)
    logs = logs.withColumn('std_sec_foutheen_days', F.stddev(logs.total_secs).over(window3))
    # daily total seconds over last 30 days
    logs = logs.withColumn('sum_secs_thirty_days',F.sum(logs.total_secs).over(window4))
    logs = logs.withColumn('max_secs_thirty_days',F.max(logs.total_secs).over(window4))
    logs = logs.withColumn('avg_secs_thirty_days',logs.sum_secs_thirty_days/30)
    logs = logs.withColumn('std_sec_thirty_days', F.stddev(logs.total_secs).over(window4))
    #daily total seconds over last 60 to last 30 days
    logs = logs.withColumn('total_secs_sixty_days',F.sum(logs.total_secs).over(window6))
    logs = logs.withColumn('max_secs_sixty_days',F.max(logs.total_secs).over(window6))
    logs = logs.withColumn('avg_secs_sixty_days',logs.total_secs_sixty_days/30)
    logs = logs.withColumn('std_sec_sixty_days', F.stddev(logs.total_secs).over(window6))
    # change in average daily total seconds between previous 30 days and previous 60 days 
    logs = logs.withColumn('diff_sec_avg_14_30', logs.avg_secs_foutheen_days - logs.avg_secs_thirty_days)
    logs = logs.withColumn('diff_sec_avg_30_60', logs.avg_secs_thirty_days - logs.avg_secs_sixty_days)

    # sum of seconds on prior prior month
    window7 = Window.partitionBy(logs.msno).orderBy(logs.date.cast("timestamp").cast("long")).rangeBetween(-days(60),-days(30))

    logs = logs.withColumn('sum_secs_tow_month',F.sum(logs.total_secs).over(window7))

    ## overall logged in day 
    logs = logs.withColumn('total_login_day',F.count(logs.date).over(window5))

    ##day since last login
    logs=logs.withColumn('last_login', F.lag(logs.date).over(window5))
    logs=logs.withColumn('days_since_last_login',F.datediff(logs.last_login,logs.date))
    
    return logs

In [9]:
# ##num 25 features 
# logs = logs.withColumn('fourtheen_sum_25', F.sum(logs.num_25).over(window3))
# logs = logs.withColumn('thirty_sum_25',F.sum(logs.num_25).over(window4))
# logs = logs.withColumn('fourtheen_avg_25', logs.fourtheen_sum_25/14)
# logs = logs.withColumn('thirty_avg_25', logs.thirty_sum_25/30)
# logs = logs.withColumn('diff_2week_25', logs.fourtheen_avg_25-logs.thirty_avg_25)
# logs = logs.withColumn('std_25_one_month', F.stddev(logs.num_25).over(window4))

# ##num 50 features 
# logs = logs.withColumn('fourtheen_sum_50', F.sum(logs.num_50).over(window3))
# logs = logs.withColumn('thirty_sum_50',F.sum(logs.num_50).over(window4))
# logs = logs.withColumn('fourtheen_avg_50', logs.fourtheen_sum_50/14)
# logs = logs.withColumn('thirty_avg_50', logs.thirty_sum_50/30)
# logs = logs.withColumn('diff_2week_50', logs.fourtheen_avg_50-logs.thirty_avg_50)
# logs = logs.withColumn('std_50_one_month', F.stddev(logs.num_50).over(window4))

# ##num 75 features 
# logs = logs.withColumn('fourtheen_sum_75', F.sum(logs.num_75).over(window3))
# logs = logs.withColumn('thirty_sum_75',F.sum(logs.num_75).over(window4))
# logs = logs.withColumn('fourtheen_avg_75', logs.fourtheen_sum_75/14)
# logs = logs.withColumn('thirty_avg_75', logs.thirty_sum_75/30)
# logs = logs.withColumn('diff_2week_75', logs.fourtheen_avg_75-logs.thirty_avg_75)
# logs = logs.withColumn('std_75_one_month', F.stddev(logs.num_75).over(window4))

# ##num 985 feature
# logs = logs.withColumn('fourtheen_sum_985', F.sum(logs.num_985).over(window3))
# logs = logs.withColumn('thirty_sum_985',F.sum(logs.num_985).over(window4))
# logs = logs.withColumn('fourtheen_avg_985', logs.fourtheen_sum_985/14)
# logs = logs.withColumn('thirty_avg_985', logs.thirty_sum_985/30)
# logs = logs.withColumn('diff_2week_985', logs.fourtheen_avg_985-logs.thirty_avg_985)
# logs = logs.withColumn('std_985_one_month', F.stddev(logs.num_985).over(window4))

# ##num 100 feature
# logs = logs.withColumn('fourtheen_sum_100', F.sum(logs.num_100).over(window3))
# logs = logs.withColumn('thirty_sum_100',F.sum(logs.num_100).over(window4))
# logs = logs.withColumn('fourtheen_avg_100', logs.fourtheen_sum_100/14)
# logs = logs.withColumn('thirty_avg_100', logs.thirty_sum_100/30)
# logs = logs.withColumn('diff_2week_100', logs.fourtheen_avg_100-logs.thirty_avg_100)
# logs = logs.withColumn('std_100_one_month', F.stddev(logs.num_100).over(window4))



In [14]:
logs = log_feature(logs)

In [15]:
logs.limit(5).toPandas()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,fourtheen_sum_uniq,...,total_secs_sixty_days,max_secs_sixty_days,avg_secs_sixty_days,std_sec_sixty_days,diff_sec_avg_14_30,diff_sec_avg_30_60,sum_secs_tow_month,total_login_day,last_login,days_since_last_login
0,+uYayEzlryVEc1b148hc46DU6sr/YHnPE4OCgPZsQGw=,2015-06-13,3,0,0,0,1,4,390.891,4,...,390.891,390.891,13.0297,,14.891086,0.0,,1,,
1,/nTKo6fPYX88w+22j72VcZvY0FRR6OqerS9JHcGCD9A=,2015-11-14,0,1,1,0,1,2,520.045,2,...,520.045,520.045,17.334833,,19.811238,0.0,,1,,
2,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,2016-06-16,1,0,0,0,17,17,3873.695,17,...,3873.695,3873.695,129.123167,,147.569333,0.0,,1,,
3,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,2016-06-23,0,0,0,1,1,2,489.802,19,...,4363.497,3873.695,145.4499,2392.773687,166.228457,0.0,,2,2016-06-16,-7.0
4,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,2016-11-20,2,2,0,0,0,2,150.792,2,...,150.792,150.792,5.0264,,5.744457,0.0,,3,2016-06-23,-150.0


In [16]:
## filtering out date with unknown label
trans = trans.filter(trans.transaction_date <= F.unix_timestamp(F.lit('2017-01-29 00:00:00')).cast('timestamp'))
## join logs to transaction 
trans_logs = trans.join(logs,(trans.msno1 == logs.msno)&(logs.date.\
                                between(trans.transaction_date,trans.membership_expire_date)), how = 'left')

In [17]:
trans_logs.limit(5).toPandas()

Unnamed: 0,msno1,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,next_trans_date,...,total_secs_sixty_days,max_secs_sixty_days,avg_secs_sixty_days,std_sec_sixty_days,diff_sec_avg_14_30,diff_sec_avg_30_60,sum_secs_tow_month,total_login_day,last_login,days_since_last_login
0,+uYayEzlryVEc1b148hc46DU6sr/YHnPE4OCgPZsQGw=,35,7,0,0,0,2015-06-13,2015-06-15,0,,...,390.891,390.891,13.0297,,14.891086,0.0,,1,,
1,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,35,7,0,0,0,2016-11-20,2016-11-27,0,,...,150.792,150.792,5.0264,,5.744457,0.0,,3,2016-06-23,-150.0
2,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,35,7,0,0,0,2016-11-20,2016-11-27,0,,...,863.432,712.64,28.781067,397.286531,32.892648,0.0,,4,2016-11-20,-1.0
3,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,35,7,0,0,0,2016-11-20,2016-11-27,0,,...,2030.138,1166.706,67.671267,508.909023,77.33859,0.0,,5,2016-11-21,-2.0
4,1vBzVoPyEXo5ehJkkb27ebI9zrgDwAt31cjFM2HY62k=,35,7,0,0,0,2016-11-20,2016-11-27,0,,...,2160.272,1166.706,72.009067,497.338908,82.296076,0.0,,6,2016-11-23,-1.0


In [18]:
### making aggregated a=feature 
agg_col=[
        'num_25','num_50','num_75','num_985','num_100','num_unq','total_secs','fourtheen_sum_uniq',
         'fourtheen_max_uniq','fourtheen_avg_uniq','std_uniq_foutheen_days','thirty_sum_uniq','thirty_max_uniq',
         'thirty_avg_uniq','std_uniq_thirty_days','sixty_sum_uniq','sixty_max_uniq','sixty_avg_uniq',
         'diff_uniq_avg_14_30','diff_uniq_avg_30_60','unique_song_cum_sum','sum_secs_fourtheen_days',
         'max_secs_fourtheen_days','avg_secs_foutheen_days','std_sec_foutheen_days','sum_secs_thirty_days',
         'max_secs_thirty_days','avg_secs_thirty_days','std_sec_thirty_days','total_secs_sixty_days',
         'max_secs_sixty_days','avg_secs_sixty_days','std_sec_sixty_days','diff_sec_avg_14_30','diff_sec_avg_30_60',
         'sum_secs_tow_month','total_login_day','days_since_last_login'
        ]
funcs=[F.mean,F.stddev,F.max,F.min]
exprs = [f(F.col(c)) for f in funcs for c in agg_col]+[F.count(F.col('date')),F.max(F.col('date'))]

trans_logs_agg=trans_logs.groupby('msno1','transaction_date','membership_expire_date').agg(*exprs)

trans_logs_agg=trans_logs_agg.withColumnRenamed('msno1','msno2')

trans_logs_agg=trans_logs_agg.withColumnRenamed('transaction_date','transaction_date2')
trans_logs_agg=trans_logs_agg.withColumnRenamed('membership_expire_date','membership_expire_date2')
##join aggragated feature back to transaction dataframe
training = trans.join(trans_logs_agg,(trans.msno1 == trans_logs_agg.msno2)&(trans.transaction_date==trans_logs_agg.transaction_date2)&\
                    (trans.membership_expire_date == trans_logs_agg.membership_expire_date2),how='left')

In [19]:
training.limit(5).toPandas()

Unnamed: 0,msno1,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,next_trans_date,...,min(max_secs_sixty_days),min(avg_secs_sixty_days),min(std_sec_sixty_days),min(diff_sec_avg_14_30),min(diff_sec_avg_30_60),min(sum_secs_tow_month),min(total_login_day),min(days_since_last_login),count(date),max(date)
0,0AML3+272gUMWxJ4Zxp9fVLSuGqBtA9SaCweNvDOM6k=,41,30,149,149,1,2015-04-20,2015-05-20,0,2015-05-07,...,5712.15,1760.6564,1501.232273,-155.027629,-1164.413633,9696.336,49,-5,21,2015-05-19
1,3wcJrYsgwMYxwUyD98Z/hRsVYMFv+frQHv4IewDzGK4=,41,30,149,149,1,2015-02-14,2015-03-14,0,2015-03-14,...,6969.925,485.389967,2010.920022,-291.131443,-443.275,6328.325,5,-12,6,2015-03-12
2,8Vj8EFTpFQjsYkdBMiuMIB4jRwVBwndmHQwKHRe/kxo=,41,30,129,129,1,2015-02-27,2015-03-28,0,2015-03-27,...,1160270.002,41186.260633,190361.677607,-38872.045286,-39896.082067,47486.474,35,-7,15,2015-03-28
3,2Nkp4ci2p4kZAqzU9Lov4DCuuK9vh/VCRZ+Z2LLmvjI=,40,30,149,149,1,2015-05-28,2015-07-01,0,2015-06-28,...,7180.539,2304.7575,1967.998932,-120.94891,-3132.7737,18166.569,82,-6,20,2015-06-27
4,7F7UmjPDMmTPFtV5NS3uzJjEH8X4j+Eyh8gmoe6zISA=,39,30,149,149,1,2016-07-31,2016-09-15,0,2016-08-31,...,33408.487,8470.1919,7657.232839,-3181.754848,-18498.9425,183529.241,101,-9,24,2016-09-15


In [20]:
from pyspark.ml.feature import VectorAssembler,StandardScaler

In [21]:
featureCol = [
 'payment_method_id',
 'payment_plan_days',
 'plan_list_price',
 'actual_amount_paid',
 'is_auto_renew',
 'is_cancel',
 'price_diff',
 'amt_per_day',
 'is_discount',
 'prev_canceled',
 'prev_tran_diff',
 'avg(num_25)',
 'avg(num_50)',
 'avg(num_75)',
 'avg(num_985)',
 'avg(num_100)',
 'avg(num_unq)',
 'avg(total_secs)',
 'avg(fourtheen_sum_uniq)',
 'avg(fourtheen_max_uniq)',
 'avg(fourtheen_avg_uniq)',
 'avg(std_uniq_foutheen_days)',
 'avg(thirty_sum_uniq)',
 'avg(thirty_max_uniq)',
 'avg(thirty_avg_uniq)',
 'avg(std_uniq_thirty_days)',
 'avg(sixty_sum_uniq)',
 'avg(sixty_max_uniq)',
 'avg(sixty_avg_uniq)',
 'avg(diff_uniq_avg_14_30)',
 'avg(diff_uniq_avg_30_60)',
 'avg(unique_song_cum_sum)',
 'avg(sum_secs_fourtheen_days)',
 'avg(max_secs_fourtheen_days)',
 'avg(avg_secs_foutheen_days)',
 'avg(std_sec_foutheen_days)',
 'avg(sum_secs_thirty_days)',
 'avg(max_secs_thirty_days)',
 'avg(avg_secs_thirty_days)',
 'avg(std_sec_thirty_days)',
 'avg(total_secs_sixty_days)',
 'avg(max_secs_sixty_days)',
 'avg(avg_secs_sixty_days)',
 'avg(std_sec_sixty_days)',
 'avg(diff_sec_avg_14_30)',
 'avg(diff_sec_avg_30_60)',
 'avg(sum_secs_tow_month)',
 'avg(total_login_day)',
 'avg(days_since_last_login)',
 'stddev_samp(num_25)',
 'stddev_samp(num_50)',
 'stddev_samp(num_75)',
 'stddev_samp(num_985)',
 'stddev_samp(num_100)',
 'stddev_samp(num_unq)',
 'stddev_samp(total_secs)',
 'stddev_samp(fourtheen_sum_uniq)',
 'stddev_samp(fourtheen_max_uniq)',
 'stddev_samp(fourtheen_avg_uniq)',
 'stddev_samp(std_uniq_foutheen_days)',
 'stddev_samp(thirty_sum_uniq)',
 'stddev_samp(thirty_max_uniq)',
 'stddev_samp(thirty_avg_uniq)',
 'stddev_samp(std_uniq_thirty_days)',
 'stddev_samp(sixty_sum_uniq)',
 'stddev_samp(sixty_max_uniq)',
 'stddev_samp(sixty_avg_uniq)',
 'stddev_samp(diff_uniq_avg_14_30)',
 'stddev_samp(diff_uniq_avg_30_60)',
 'stddev_samp(unique_song_cum_sum)',
 'stddev_samp(sum_secs_fourtheen_days)',
 'stddev_samp(max_secs_fourtheen_days)',
 'stddev_samp(avg_secs_foutheen_days)',
 'stddev_samp(std_sec_foutheen_days)',
 'stddev_samp(sum_secs_thirty_days)',
 'stddev_samp(max_secs_thirty_days)',
 'stddev_samp(avg_secs_thirty_days)',
 'stddev_samp(std_sec_thirty_days)',
 'stddev_samp(total_secs_sixty_days)',
 'stddev_samp(max_secs_sixty_days)',
 'stddev_samp(avg_secs_sixty_days)',
 'stddev_samp(std_sec_sixty_days)',
 'stddev_samp(diff_sec_avg_14_30)',
 'stddev_samp(diff_sec_avg_30_60)',
 'stddev_samp(sum_secs_tow_month)',
 'stddev_samp(total_login_day)',
 'stddev_samp(days_since_last_login)',
 'max(num_25)',
 'max(num_50)',
 'max(num_75)',
 'max(num_985)',
 'max(num_100)',
 'max(num_unq)',
 'max(total_secs)',
 'max(fourtheen_sum_uniq)',
 'max(fourtheen_max_uniq)',
 'max(fourtheen_avg_uniq)',
 'max(std_uniq_foutheen_days)',
 'max(thirty_sum_uniq)',
 'max(thirty_max_uniq)',
 'max(thirty_avg_uniq)',
 'max(std_uniq_thirty_days)',
 'max(sixty_sum_uniq)',
 'max(sixty_max_uniq)',
 'max(sixty_avg_uniq)',
 'max(diff_uniq_avg_14_30)',
 'max(diff_uniq_avg_30_60)',
 'max(unique_song_cum_sum)',
 'max(sum_secs_fourtheen_days)',
 'max(max_secs_fourtheen_days)',
 'max(avg_secs_foutheen_days)',
 'max(std_sec_foutheen_days)',
 'max(sum_secs_thirty_days)',
 'max(max_secs_thirty_days)',
 'max(avg_secs_thirty_days)',
 'max(std_sec_thirty_days)',
 'max(total_secs_sixty_days)',
 'max(max_secs_sixty_days)',
 'max(avg_secs_sixty_days)',
 'max(std_sec_sixty_days)',
 'max(diff_sec_avg_14_30)',
 'max(diff_sec_avg_30_60)',
 'max(sum_secs_tow_month)',
 'max(total_login_day)',
 'max(days_since_last_login)',
 'min(num_25)',
 'min(num_50)',
 'min(num_75)',
 'min(num_985)',
 'min(num_100)',
 'min(num_unq)',
 'min(total_secs)',
 'min(fourtheen_sum_uniq)',
 'min(fourtheen_max_uniq)',
 'min(fourtheen_avg_uniq)',
 'min(std_uniq_foutheen_days)',
 'min(thirty_sum_uniq)',
 'min(thirty_max_uniq)',
 'min(thirty_avg_uniq)',
 'min(std_uniq_thirty_days)',
 'min(sixty_sum_uniq)',
 'min(sixty_max_uniq)',
 'min(sixty_avg_uniq)',
 'min(diff_uniq_avg_14_30)',
 'min(diff_uniq_avg_30_60)',
 'min(unique_song_cum_sum)',
 'min(sum_secs_fourtheen_days)',
 'min(max_secs_fourtheen_days)',
 'min(avg_secs_foutheen_days)',
 'min(std_sec_foutheen_days)',
 'min(sum_secs_thirty_days)',
 'min(max_secs_thirty_days)',
 'min(avg_secs_thirty_days)',
 'min(std_sec_thirty_days)',
 'min(total_secs_sixty_days)',
 'min(max_secs_sixty_days)',
 'min(avg_secs_sixty_days)',
 'min(std_sec_sixty_days)',
 'min(diff_sec_avg_14_30)',
 'min(diff_sec_avg_30_60)',
 'min(sum_secs_tow_month)',
 'min(total_login_day)',
 'min(days_since_last_login)',
 'count(date)',
  ]

In [22]:
training=training.na.fill({'churn':True})
df_feature=training.select(*featureCol+['churn'])
df_feature=df_feature.na.fill({'prev_tran_diff':999}).na.fill(0)

## Building model

In [23]:
assembler = VectorAssembler(inputCols=featureCol, outputCol="features") 
assembled_df = assembler.transform(df_feature)
assembled_df= assembled_df.withColumn('churn',assembled_df.churn.cast('integer'))

In [24]:
SEED=2019
train_data, test_data = assembled_df.randomSplit([.8,.2], seed=SEED)

In [25]:
rfClassifer = RandomForestClassifier(labelCol = "churn",featuresCol='features')
model=rfClassifer.fit(train_data)

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
predictions = model.transform(test_data)

In [27]:
predictions.select('rawPrediction','probability','prediction').limit(5).toPandas()

Unnamed: 0,rawPrediction,probability,prediction
0,"[18.727442105697357, 1.272557894302645]","[0.9363721052848678, 0.06362789471513225]",0.0
1,"[19.276270493700856, 0.7237295062991448]","[0.9638135246850428, 0.03618647531495724]",0.0
2,"[19.276270493700856, 0.7237295062991448]","[0.9638135246850428, 0.03618647531495724]",0.0
3,"[19.32825544749436, 0.6717445525056416]","[0.966412772374718, 0.03358722762528208]",0.0
4,"[18.555044049252096, 1.4449559507479068]","[0.9277522024626047, 0.07224779753739533]",0.0
