In this notebook we explore how to define "churn" and how we will predict churn under that definition.

In [2]:
import configparser 
from datetime import datetime 
import os 
from pyspark.sql import SparkSession 
from pyspark.sql.functions import udf, col 
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
# Parser for an INI-style configuration file.
# NOTE(review): no visible cell calls config.read(...), so this object appears to be
# unused in this notebook — confirm before relying on or removing it.
config = configparser.ConfigParser() 

In [4]:
# Export the AWS key pair and a region name as process environment variables.
# NOTE(review): aws_access_key_id / aws_secret_access_key are not defined in any visible
# cell — presumably they were set by a cell removed before sharing; confirm they are
# loaded from a secure source and never hard-coded in the notebook.
os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id 
os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
# NOTE(review): the AWS SDKs conventionally read AWS_REGION / AWS_DEFAULT_REGION;
# confirm that something actually consumes AWS_REGION_NAME.
os.environ['AWS_REGION_NAME']='us-west-1'
def create_spark_session():
    """Build (or reuse) a SparkSession that can read the event data from S3.

    The hadoop-aws package is requested and the AWS key pair is registered for
    both S3 connectors:

    * ``fs.s3n.*`` -- the settings this notebook has always supplied (kept as-is);
    * ``fs.s3a.*`` -- the settings the ``s3a://`` scheme consults. The event data
      is read through an ``s3a://`` URL, so without these the key pair was never
      offered to the connector that performs the read.

    Reads the notebook-level names ``aws_access_key_id`` and
    ``aws_secret_access_key``; both must be defined before this is called.

    Returns:
        SparkSession: the session returned by ``getOrCreate()``.
    """
    builder = (
        SparkSession.builder
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
        # Original s3n credentials, preserved for backward compatibility.
        .config("fs.s3n.awsAccessKeyId", aws_access_key_id)
        .config("fs.s3n.awsSecretAccessKey", aws_secret_access_key)
        # s3a credentials; the "spark.hadoop." prefix forwards them to the
        # Hadoop configuration used by the S3A filesystem.
        .config("spark.hadoop.fs.s3a.access.key", aws_access_key_id)
        .config("spark.hadoop.fs.s3a.secret.key", aws_secret_access_key)
    )
    return builder.getOrCreate()
# Start (or attach to) the Spark session used by the cells below.
spark = create_spark_session() 
# S3 location of the Sparkify event data (JSON).
event_data = "s3a://udacity-dsnd/sparkify/sparkify_event_data.json"
print("reading data")
# Load the raw events; no schema is passed, so the JSON reader infers it.
df = spark.read.json(event_data)

In [5]:
%run Users/tw2567@columbia.edu/Udacity/churn_prediction/util

In [6]:
# Wrap the raw event DataFrame in the DataPreparation helper.
# DataPreparation is not defined in this notebook — presumably it comes from the
# `util` notebook executed by the %run cell above; verify.
dp = DataPreparation(df)

In [7]:
# Activity table exposed by the helper; the cells below rely on its
# userId, page, time and days_before_today columns.
user_activities = dp.user_activities

In [8]:
# Largest days_before_today value in the activity table, i.e. how far back the data reaches.
display(user_activities.agg({'days_before_today': 'max'}))

max(days_before_today)
61


Unfortunately we only have customers' activity data for 2 months...

# Definition of "being churned"
**Churn:** either downgrade from paid level to free level or cancel the service
<br>**Data used to predict:** Data till today
<br>**Customers qualified for prediction:** Customers who are still active today

Ideally we can predict churn activities that will happen for a longer time period (e.g. 14 days after today ~ 30 days after today). But because we only have data for 2 months, we have to cut down the time period a little bit....

We would like to predict the churn activities for a "further" future, not for tomorrow, because we want to have some time to act (for example by running retention campaigns).

When we train the model, imagine that we collect the customer activity data today. We will use the data from the past 7 days to calculate the training label (the "training y"), and the data from day 1 up to 14 days before today to compute the features. If, in real life, we have data for a longer period, we can roll the performance-measuring window to get more training data points.

In [11]:
# Label each user from their most recent event: 1 when that event's page is
# 'Cancel' or 'Submit Downgrade', 0 otherwise.
last_status = (
    user_activities
    .withColumn(
        'rownum',
        F.row_number().over(Window.partitionBy("userId").orderBy(F.desc('time'))),
    )
    .filter('rownum==1')  # keep only the newest event per user
    .select(
        'userId',
        F.when(F.col('page').isin('Cancel', 'Submit Downgrade'), 1)
         .otherwise(0)
         .alias('last_status_cancelled'),
    )
)

In [12]:
# Sanity check: the two counts match only if last_status holds exactly one row per userId.
print(last_status.count())
print(last_status.select('userId').distinct().count())

In [13]:
# Look for 'Cancel' events that are NOT a user's most recent event.
display(
    user_activities
    .withColumn(
        'rownum',
        F.row_number().over(Window.partitionBy("userId").orderBy(F.desc('time'))),
    )
    .filter("rownum!=1 and page='Cancel'")
)

userId,registration,registration_ts,level,time,timestamp,time_in_the_day,sessionId,itemInSession,page,artist,singer-song,length,days_since_registration,days_before_today,rownum


In [14]:
# Users whose most recent event was flagged (last_status_cancelled = 1) in last_status.
churned = last_status.filter("last_status_cancelled = 1")

In [15]:
# Share of the distinct users in user_activities that ended up in `churned`, printed as a percentage.
print("Churn rate: {:.2%}".format(churned.count()/user_activities.select('userId').distinct().count()))

# Prepare training data

In this section we will prepare the data for modeling.

In [17]:
%run Users/tw2567@columbia.edu/Udacity/churn_prediction/util

In [18]:
# Rebuild the helper from the raw events after the util notebook was re-run in the cell above.
dp = DataPreparation(df)

In [19]:
# Run the preparation pipeline; the result is the feature table inspected below.
# The variable name `featues` (sic) is what the later cells reference, so it is kept as-is.
featues = dp.run()

Investigate whether any customer has null static features.

In [21]:
# Count the null entries in each static feature column.
featues.select(
    [
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in ['female', 'customer_age']
    ]
).show()

In [22]:
# Show the feature rows whose customer_age is missing.
display(featues.filter("customer_age is Null"))

userId,female,state_AZ,state_SC,state_LA,state_MN,state_NJ,state_DC,state_OR,state_VA,state_RI,state_KY,state_WY,state_NH,state_MI,state_NV,state_WI,state_ID,state_CA,state_CT,state_NE,state_MT,state_NC,state_VT,state_MD,state_DE,state_MO,state_IL,state_ME,state_WA,state_ND,state_MS,state_AL,state_IN,state_OH,state_TN,state_IA,state_NM,state_PA,state_SD,state_NY,state_TX,state_WV,state_GA,state_MA,state_KS,state_FL,state_CO,state_AK,state_AR,state_OK,state_UT,state_HI,customer_age,agent_Firefox,agent_Mozilla,agent_Safari,agent_Gecko,agent_GSA,agent_Version,agent_Chromium,agent_Ubuntu,agent_AppleWebKit,agent_Mobile,agent_Chrome,sessions,items,active_days,active_time,sessions_per_day,time_per_day,avg_items_per_session,time_per_session,perc_1_6,perc_6_12,perc_12_18,perc_18_1,unique_songs,unique_artists,max_song_perc,numbers_thup,numbers_thdn,numbers_addlist,numbers_addfrd,numbers_error,perc_thup,perc_thdn,perc_addlist,perc_addfrd,perc_popular_songs,last_status_paid,paid_days,perc_paid_days
1261737,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [23]:
# Inspect raw events for the user ID surfaced by the null check.
# No ordering is applied, so these are an arbitrary 10 rows for that user.
display(df.filter("userId = 1261737").limit(10))

artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
,Logged Out,,,87,,,paid,,GET,Home,,8615,,200,1538352008000,,1261737
,Logged Out,,,0,,,free,,PUT,Login,,7433,,307,1538352041000,,1261737
,Logged Out,,,4,,,free,,GET,Home,,25003,,200,1538352182000,,1261737
,Logged Out,,,2,,,free,,GET,Home,,9930,,200,1538352254000,,1261737
,Logged Out,,,3,,,free,,PUT,Login,,9930,,307,1538352255000,,1261737
,Logged Out,,,0,,,free,,PUT,Login,,23471,,307,1538352259000,,1261737
,Logged Out,,,44,,,free,,GET,Home,,6317,,200,1538352278000,,1261737
,Logged Out,,,43,,,free,,GET,Home,,22951,,200,1538352361000,,1261737
,Logged Out,,,44,,,free,,GET,Home,,22951,,200,1538352365000,,1261737
,Logged Out,,,95,,,paid,,GET,Home,,6071,,200,1538352404000,,1261737


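Actually the customer with the ID 1261737 seems weird: every event shown above has auth = "Logged Out" and no name, gender, location or registration value, so this ID looks like a bucket for logged-out traffic rather than a real customer.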

In [25]:
# Preview up to 20 rows of the feature table.
display(featues.limit(20))

userId,female,state_AZ,state_SC,state_LA,state_MN,state_NJ,state_DC,state_OR,state_VA,state_RI,state_KY,state_WY,state_NH,state_MI,state_NV,state_WI,state_ID,state_CA,state_CT,state_NE,state_MT,state_NC,state_VT,state_MD,state_DE,state_MO,state_IL,state_ME,state_WA,state_ND,state_MS,state_AL,state_IN,state_OH,state_TN,state_IA,state_NM,state_PA,state_SD,state_NY,state_TX,state_WV,state_GA,state_MA,state_KS,state_FL,state_CO,state_AK,state_AR,state_OK,state_UT,state_HI,customer_age,agent_Firefox,agent_Mozilla,agent_Safari,agent_Gecko,agent_GSA,agent_Version,agent_Chromium,agent_Ubuntu,agent_AppleWebKit,agent_Mobile,agent_Chrome,sessions,items,active_days,active_time,sessions_per_day,time_per_day,avg_items_per_session,time_per_session,perc_1_6,perc_6_12,perc_12_18,perc_18_1,unique_songs,unique_artists,max_song_perc,numbers_thup,numbers_thdn,numbers_addlist,numbers_addfrd,numbers_error,perc_thup,perc_thdn,perc_addlist,perc_addfrd,perc_popular_songs,last_status_paid,paid_days,perc_paid_days
1000280,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95,1,1,0,1,0,0,0,0,0,0,0,22,1316,21,257231000,1.1428571428571428,12226666.666666666,59.81818181818182,11692318.181818182,0.273373758775536,0.0754063626589083,0.356836221617861,0.2943836569476946,948,767,0.0080642602217933,53,33,25,14,3,0.040273556231003,0.0250759878419452,0.0189969604863221,0.0106382978723404,0.1604696673189823,0,1,0.0454545454545454
1002185,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,68,0,1,1,0,0,0,0,0,1,0,1,17,2080,21,443434000,1.0952380952380951,21039095.23809524,122.3529411764706,26084352.94117647,0.2328908686377082,0.2284845526890442,0.2889402157101318,0.2496843629631157,1582,1205,0.0073990867521017,92,14,49,25,2,0.0442307692307692,0.0067307692307692,0.0235576923076923,0.0120192307692307,0.1574803149606299,1,21,1.0
1017805,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,90,0,1,1,0,0,1,0,0,1,0,0,3,320,4,63410000,1.0,15784750.0,106.66666666666669,21136666.666666668,0.253279946824505,0.0,0.5325304255622201,0.2141896276132748,245,223,0.0110185522577972,7,4,5,13,0,0.021875,0.0125,0.015625,0.040625,0.14,1,4,1.0
1030587,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,133,0,1,1,0,0,0,0,0,1,0,1,11,1752,13,362380000,1.1538461538461535,27668769.230769232,159.27272727272728,32943636.363636363,0.3534687622918578,0.2864494558804248,0.1513680837813032,0.208713698046414,1345,1071,0.0090008394147621,66,16,46,23,1,0.0376712328767123,0.0091324200913242,0.026255707762557,0.0131278538812785,0.1413043478260869,1,12,0.8571428571428571
1033297,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,120,1,1,0,1,0,0,0,0,0,0,0,5,299,6,59105000,1.0,9821000.0,59.8,11821000.0,0.1752320502320502,0.0772590772590772,0.3840601965601966,0.3634486759486759,234,215,0.0164702094860919,10,3,7,4,0,0.0334448160535117,0.0100334448160535,0.0234113712374581,0.0133779264214046,0.1652542372881356,1,2,0.2857142857142857
1057724,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,98,0,1,1,0,0,0,0,0,1,0,1,40,4669,41,953791000,1.2195121951219512,23195975.609756097,116.725,23844775.0,0.1736602842646172,0.2748881677045873,0.2990831371675157,0.2523684108632797,3190,2157,0.0095950244474216,200,29,135,76,1,0.0428357249946455,0.0062111801242236,0.0289141143713857,0.0162775754979653,0.16246425786327,1,24,0.5714285714285714
1059049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,143,0,1,1,0,0,0,0,0,1,0,1,5,662,5,140598000,1.2,28048200.0,132.4,28119600.0,0.503600432737259,0.1667164347149269,0.208056549556738,0.121626582991076,535,454,0.0185262023362713,29,6,16,10,0,0.0438066465256797,0.0090634441087613,0.0241691842900302,0.0151057401812688,0.1627906976744186,1,5,1.0
1069552,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,129,0,1,1,0,0,0,0,0,1,0,1,12,582,11,111581000,1.0909090909090908,10143727.272727272,48.5,9298416.666666666,0.1167007150153217,0.2631420545746388,0.5229370348752371,0.0972201955348022,442,389,0.0129638551187677,26,6,11,7,1,0.0446735395189003,0.0103092783505154,0.0189003436426116,0.0120274914089347,0.156043956043956,0,3,0.2307692307692307
1071308,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,1,1,0,1,0,0,0,0,0,0,0,18,1692,20,348022000,1.1,17357050.0,94.0,19334555.55555556,0.2535537180399703,0.2625782433077623,0.2769571444890642,0.206910894163203,1272,1007,0.008890880201339,74,12,26,31,1,0.0437352245862884,0.0070921985815602,0.0153664302600472,0.018321513002364,0.1618168914123491,1,20,0.9523809523809524
1076191,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,74,0,1,0,1,0,0,0,0,0,0,0,3,63,3,15384000,1.0,5128000.0,21.0,5128000.0,0.0,0.0,0.1884526558891455,0.8115473441108545,47,47,0.1176331016994726,4,1,1,0,0,0.0634920634920634,0.0158730158730158,0.0158730158730158,0.0,0.1276595744680851,0,0,0.0


In [26]:
# Drop the anomalous user, zero-fill remaining nulls, attach the churn label by
# joining on userId, and write the result to S3 as parquet (overwriting any
# existing output at that path).
(
    featues
    .filter("userId != 1261737")
    .fillna(0)
    .join(last_status, on="userId")
    .write
    .mode('overwrite')
    .parquet("s3://tianyi-wang-data-science-projects/churn-prediction-2020/features1")
)