# Customer Churn Prediction for Sparkify using PySpark

This notebook is used to train the full dataset with Spark on AWS. 

In [1]:
# https://aws.amazon.com/blogs/big-data/install-python-libraries-on-a-running-cluster-with-emr-notebooks/
sc.list_packages()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1606191126161_0009,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Package                    Version  
-------------------------- ---------
beautifulsoup4             4.9.1    
boto                       2.49.0   
click                      7.1.2    
jmespath                   0.10.0   
joblib                     0.16.0   
lxml                       4.5.2    
mysqlclient                1.4.2    
nltk                       3.5      
nose                       1.3.4    
numpy                      1.16.5   
pip                        9.0.1    
py-dateutil                2.2      
python37-sagemaker-pyspark 1.4.0    
pytz                       2020.1   
PyYAML                     5.3.1    
regex                      2020.7.14
setuptools                 28.8.0   
six                        1.13.0   
soupsieve                  1.9.5    
tqdm                       4.48.2   
wheel                      0.29.0   
windmill                   1.6

In [2]:
sc.install_pypi_package("pandas")
sc.install_pypi_package("matplotlib")
sc.install_pypi_package("seaborn")
sc.install_pypi_package("scipy")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas
  Using cached https://files.pythonhosted.org/packages/bf/4c/cb7da76f3a5e077e545f9cf8575b8f488a4e8ad60490838f89c5cdd5bb57/pandas-1.1.4-cp37-cp37m-manylinux1_x86_64.whl
Collecting python-dateutil>=2.7.3 (from pandas)
  Using cached https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl
Installing collected packages: python-dateutil, pandas
Successfully installed pandas-1.1.4 python-dateutil-2.8.1

Collecting matplotlib
  Using cached https://files.pythonhosted.org/packages/30/f2/10c822cb0ca5ebec58bd1892187bc3e3db64a867ac26531c6204663fc218/matplotlib-3.3.3-cp37-cp37m-manylinux1_x86_64.whl
Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/8a/bb/488841f56197b13700afd5658fc279a2025a39e22449b7cf29864669b15d/pyparsing-2.4.7-py2.py3-none-any.whl
Collecting pillow>=6.2.0 (from matplotlib)
  Using cached https:

In [3]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
from time import time

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

from pyspark.ml.stat import Correlation
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

import warnings
warnings.filterwarnings("ignore")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# load the dataset 
df = spark.read.json("s3n://udacity-dsnd/sparkify/sparkify_event_data.json")
# check the schema of the dataset
df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

In [5]:
print('The dataset has {} rows.'.format(df.count()))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The dataset has 26259199 rows.

In [6]:
df.select(min(to_timestamp(col('ts')/1000)).alias('Start time')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+
|         Start time|
+-------------------+
|2018-10-01 00:00:01|
+-------------------+

In [7]:
df.select(max(to_timestamp(col('ts')/1000)).alias('End time')).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+
|           End time|
+-------------------+
|2018-12-01 00:00:02|
+-------------------+

## Data Cleaning

In [8]:
def clean_data(df):
    """Clean a Sparkify dataset 
    
    Args:
    df: (spark dataframe) a Sparkify dataset
    
    Returns:
    df: (spark dataframe) a preprocessed Sparkify dataset
    """
    # remove records when 'location' is null
    df = df.filter(df['location'].isNotNull())
    
    # convert 'registration' and 'ts' to date format
    df = df \
        .withColumn('registrationTime', to_timestamp(col('registration')/1000)) \
        .withColumn('time', to_timestamp(col('ts')/1000)) 
    
    # replace location with first listed state 
    state_udf = udf(lambda x: x.split(', ')[1].split('-')[0])
    df = df.withColumn('location', state_udf('location'))
        
    return df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Data Labeling

In [9]:
def label_data(df, label='Churn'):
    """Add a label column to the Sparkify dataset 
    
    Args:
    df: (spark dataframe) a cleaned Sparkify dataset
    label: (string) label name
    
    Returns:
    df: (spark dataframe) a labeled Sparkify dataset
    """
    userWindow = Window.partitionBy('userId').orderBy('ts').rangeBetween(Window.unboundedPreceding,Window.unboundedFollowing)

    # label churned users to be 1 and unchurned users to be 0 
    df = df \
        .withColumn(label, when(col('page')=='Cancellation Confirmation', 1).otherwise(0)) \
        .withColumn(label, max(label).over(userWindow))
    
    return df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Feature Engineering

In [10]:
def build_features(df, label='Churn'):
    """Build features to be used for modeling
    
    Args:
    df: (spark dataframe) a cleaned and labeled Sparkify dataset
    label: (string) label name
    
    Returns:
    user_df: (spark dataframe) a labeled dataset with features of interest grouped by user ids
    """
    userWindow = Window.partitionBy('userId').orderBy('ts').rangeBetween(Window.unboundedPreceding,Window.unboundedFollowing)

    # find the user's geographical division based on the location
    path = "s3://aws-emr-resources-622727603602-us-west-2/notebooks/e-D5SFL69E3FTYPOEVUORGSLR3C/us_regions.csv"
    region_df = spark.read.csv(path, header=True)
    division_udf = udf(lambda x: 'location' + x.replace(' ', ''))
    
    location_df = df.select(['userId', 'location']).dropDuplicates(['userId'])
    location_df = location_df.join(region_df, location_df['location']==region_df["State Code"], how='left') \
        .select(['userId', col("Division").alias("location")]) \
        .withColumn('location', division_udf('location'))

    # one hot encode the 'location' column by pivoting it 
    location_df = location_df.groupBy('userId').pivot('location').agg(count('location').cast(IntegerType())).fillna(0)
    # remove the last column (or any one) to keep the binary columns independent
    location_df = location_df.drop(location_df.columns[-1]) 

    # find the latest level of each user
    df = df.withColumn('latestLevel', last(col('level')).over(userWindow))
    
    # calculate the duration between registration to last activity (in days)
    regist_duration_df = df.groupBy('userId') \
        .agg(((last(col('ts'))-last(col('registration')))/1000/3600/24).alias('registDuration'))

    # compute average session duration (in hours)
    avg_session_duration_df = df \
        .groupBy(['userId', 'sessionId']).agg(min(col('ts')).alias('session_start'), max(col('ts')).alias('session_end'))\
        .groupBy('userId').agg(avg((col('session_end') - col('session_start'))/1000/3600).alias('avgSessionDuration'))

    # find the latest level of each user
    df = df.withColumn('latestLevel', last(col('level')).over(userWindow))

    regist_duration_df = df.groupBy('userId') \
        .agg(((last(col('ts'))-last(col('registration')))/1000/3600/24).alias('registDuration'))

    # compute average session duration (in hours)
    avg_session_duration_df = df \
        .groupBy(['userId', 'sessionId']).agg(min(col('ts')).alias('session_start'), max(col('ts')).alias('session_end'))\
        .groupBy('userId').agg(avg((col('session_end') - col('session_start'))/1000/3600).alias('avgSessionDuration'))
    
    # define the default start and end of the observation period
    obs_start_default = df.select(min(col('ts'))).collect()[0][0]
    obs_end_default = df.select(max(col('ts'))).collect()[0][0]

    # compute the observation period
    df = df \
        .withColumn('obs_start', when(col('registration') > obs_start_default, first(col('ts')).over(userWindow)) \
                    .otherwise(obs_start_default)) \
        .withColumn('end_state', last(col('page')).over(userWindow)) \
        .withColumn('obs_end', when(col('end_state') == 'Cancellation Confirmation', last(col('ts')).over(userWindow)) \
                    .otherwise(obs_end_default)) \
        .withColumn('obsDays', (col('obs_end') - col('obs_start'))/1000/3600/24)

    # aggregate activity statistics
    user_df = df.groupBy('userId') \
        .agg(first(col(label)).alias(label), \
             first(when(col('gender') == 'M', 1).otherwise(0)).alias('gender'), \
             first(when(col('latestLevel') == 'paid', 1).otherwise(0)).alias('latestLevel'), \
             first(col('obsDays')).alias('obsDays'), \
             sum(when(col('page') == 'NextSong', 1).otherwise(0)).alias('nSongs'), \
             sum(when(col('page') == 'Thumbs Up', 1).otherwise(0)).alias('nThumbsUp'), \
             sum(when(col('page') == 'Thumbs Down', 1).otherwise(0)).alias('nThumbsDown'), \
             sum(when((col('page') == 'Upgrade') | (col('page') == 'Submit Upgrade'), 1).otherwise(0)).alias('nUpgrade'), \
             sum(when((col('page') == 'Downgrade') | (col('page') == 'Submit Downgrade'), 1).otherwise(0)).alias('nDowngrade'), \
             sum(when(col('page') == 'Add Friend', 1).otherwise(0)).alias('nAddFriend'), \
             sum(when(col('page') == 'Add to Playlist', 1).otherwise(0)).alias("nAddPlaylist"), \
             sum(when(col('page') == 'Roll Advert', 1).otherwise(0)).alias('nAdvert'), \
             sum(when((col('page') == 'Help'), 1).otherwise(0)).alias('nHelp'), \
             sum(when((col('page') == 'Error'), 1).otherwise(0)).alias('nError')) \
        .join(location_df, on='userId') \
        .join(regist_duration_df, on='userId') \
        .join(avg_session_duration_df, on='userId')

    user_df = user_df \
        .withColumn('avgDailySongs', col('nSongs') / col('obsDays')) \
        .withColumn('avgDailyThumbsUp', col('nThumbsUp') / col('obsDays')) \
        .withColumn('avgDailyThumbsDown', col('nThumbsDown') / col('obsDays')) \
        .withColumn('avgDailyUpgrade', col('nUpgrade') / col('obsDays')) \
        .withColumn('avgDailyDowngrade', col('nDowngrade') / col('obsDays')) \
        .withColumn('avgDailyAddFriend', col('nAddFriend') / col('obsDays')) \
        .withColumn('avgDailyAddPlaylist', col('nAddPlaylist') / col('obsDays')) \
        .withColumn('avgDailyAdvert', col('nAdvert') / col('obsDays')) \
        .withColumn('avgDailyHelp', col('nHelp') / col('obsDays')) \
        .withColumn('avgDailyError', col('nError') / col('obsDays')) \
        .drop('userId', 'obsDays', 'nSongs', 'nThumbsUp', 'nThumbsDown', 'nUpgrade', 'nDowngrade', \
              'nAddFriend', 'nAddPlaylist', 'nAdvert', 'nHelp', 'nError')
    
    return user_df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
def drop_multicollinear_features(user_df, label='Churn', threshold=0.85):
    """Drop highly correlated features to avoid multicollinearity
    
    Args:
    user_df: (spark dataframe) a labeled dataset with binary and numerical features
    label: (string) label name
    threshold: (float) the threshold of high correlation
    
    Returns:
    model_df: (spark dataframe) a labeled dataset after removal of multicollinear features
    """
    vec_col = 'corr_features'
   
    # assemble all vector columns into one vector column
    assembler = VectorAssembler(inputCols=user_df.columns, outputCol=vec_col)
    corr_df = assembler.transform(user_df).select(vec_col)

    # compute the correlation between 'churn' and every feature and the correlation between each pair of features
    corr_mat = Correlation.corr(corr_df, vec_col)
    # convert the corrlation matrix to a pandas dataframe with column names
    corr_values = corr_mat.collect()[0][0].values
    corr_mat_pd = pd.DataFrame(corr_values.reshape(-1, len(user_df.columns)), \
                           index=user_df.columns, columns=user_df.columns)
    
    # construct an adjacency matrix where high correlation is labeled as 1, otherwise 0
    is_high_corr = corr_mat_pd.values > threshold
    adj_mat = csr_matrix(is_high_corr.astype(int) - np.identity(len(user_df.columns)))

    # find groups of highly correlated features by finding the connected components in the adjacency matrix
    _, corr_labels = connected_components(csgraph=adj_mat, directed=False)
    unique, unique_counts = np.unique(corr_labels, return_counts=True)
    # get groups with size > 1
    high_corr_labels = unique[unique_counts > 1]

    # if there is at least one group of highly correlated features
    if len(high_corr_labels) > 0:
        # map the label indices of highly correlated features to their column names
        print('Highly correlated features include:')
        high_corr_col_dict = {}
        for high_corr_label in high_corr_labels:
            high_corr_col_dict[high_corr_label] = [col_name for corr_label, col_name in zip(corr_labels, user_df.columns) 
                                               if corr_label == high_corr_label]
            print(high_corr_col_dict[high_corr_label])
        
        print('\nFeatures to keep:')
        cols_to_drop = []
        for col_name_list in high_corr_col_dict.values(): 
            # keep the feature that has the highest correlation with the response variable
            col_to_keep = corr_mat_pd.loc[col_name_list, label].idxmax()
            print(col_to_keep)
            # remove the other features to avoid multicolinearity 
            col_name_list.remove(col_to_keep)
            corr_mat_pd.drop(index=col_name_list, columns=col_name_list, inplace=True)
            cols_to_drop.extend(col_name_list)
            
    model_df = user_df.drop(*cols_to_drop)
    
    return model_df

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Modeling and Evaluation

In [12]:
def split_train_test(model_df, fraction, label='Churn', seed=2020):
    """Split the dataset into a training and a test set using stratified sampling based on the label column
    
    Args:
    model_df: (spark dataframe) a labeled dataset with binary and numerical features
    fraction: (float) the fraction of the dataset used for training
    label: (string) label name
    seed: (int) a nonnegative integer 
    
    Returns:
    train: (spark dataframe) a training set
    test: (spark dataframe) a test set
    """
    train = model_df.sampleBy(label, fractions={0: fraction, 1: fraction}, seed=2020)
    test = model_df.subtract(train)
    
    return train, test

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
def print_metrics(pred, label='Churn'):
    """Print evaluation metrics on a test set
    
    Args:
    pred: (spark dataframe) a test set 
    
    Returns:
    summary: (pandas dataframe) a summary of evaluation metrics
    """
    eval_metrics = {}

    # compute area under PR curve
    evaluator = BinaryClassificationEvaluator(labelCol=label)
    auc_pr = evaluator.evaluate(pred, {evaluator.metricName:'areaUnderPR'})

    # compute precision, recall and f1 score
    predictionAndLabels = pred.select('prediction', label)
    # both 'prediction' and label in predictionAndLabels need to be cast to float type and 
    # map to tuple before calling 'MulticlassMetrics'
    metrics = MulticlassMetrics(predictionAndLabels.rdd.map(lambda x: tuple(map(float, x))))

    # get overall statistics
    eval_metrics['overall'] = [metrics.weightedPrecision, metrics.weightedRecall, \
                               metrics.weightedFMeasure(), auc_pr]

    # get statistics by class
    classes = [0.0, 1.0]
    for cls in classes:
        eval_metrics['class ' + str(int(cls))] = [metrics.precision(cls), metrics.recall(cls), \
                                                  metrics.fMeasure(cls), '']

    # convert to a pandas dataframe for display
    summary = pd.DataFrame.from_dict(eval_metrics, orient='index', \
                                     columns=['precision', 'recall', 'f1 score', 'AUC-PR'])   
    
    return summary

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
def build_pipeline(bin_cols, num_cols, label='Churn', seed=2020):
    """Build a pipeline using a random forest classifier for training
    
    Args:
    bin_cols: (list) a list of binary columns
    num_cols: (list) a list of numerical columns
    label: (string) label name
    seed: (int) a nonnegative integer 
    
    Returns:
    pipeline_rf: (Pipeline object) a sequence of stages with the last stage to be a random forest classifier 
    """
    # assemble numerical columns to a single vector column 
    num_assembler = VectorAssembler(inputCols=num_cols, outputCol='num_features')
    
    # scale each numberical feature within the range [0,1] 
    scaler = MinMaxScaler(inputCol='num_features', outputCol='scaled_features')
    
    # assemble all vector columns into one vector column
    assembler = VectorAssembler(inputCols=bin_cols + ['scaled_features'], outputCol='features')

    # random forest classifier
    rf = RandomForestClassifier(featuresCol='features', labelCol=label, seed=seed)
    pipeline_rf = Pipeline(stages=[num_assembler, scaler, assembler, rf])
    
    return pipeline_rf

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
def tune_rf(train, pipeline_rf, numTrees=[100, 200], maxDepth=[4, 5], label='Churn'):
    """Tune the hyperameters of the random forest classifier using grid search with cross validation
    
    Args:
    train: (spark dataframe) a training set
    pipeline_rf: (Pipeline object) a sequence of stages with the last stage to be a random forest classifier 
    numTrees: (list) number of trees
    maxDepth: (list) maximum tree depth
    
    Returns:
    cv_rf: (CrossValidator object) a cross validation model trained by the random forest classifier
    """
    # set hyperparameters for tuning
    paramGrid = ParamGridBuilder() \
                .addGrid(pipeline_rf.getStages()[-1].numTrees, numTrees) \
                .addGrid(pipeline_rf.getStages()[-1].maxDepth, maxDepth) \
                .build()  

    # grid search with cross validation    
    crossval_rf = CrossValidator(estimator = pipeline_rf,
                                 estimatorParamMaps = paramGrid,
                                 evaluator = BinaryClassificationEvaluator(labelCol=label, metricName='areaUnderPR'),
                                 numFolds = 4)

    start = time()
    cv_rf = crossval_rf.fit(train)
    end = time()
    print('Total training time for hyperparameter tuning on random forest classifier: {:.0f} seconds'.format(end - start))
    
    return cv_rf

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## End to End Workflow

In [16]:
df = clean_data(df)
df = label_data(df)
df.dropDuplicates(['userId']).groupby('Churn').count().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-----+
|Churn|count|
+-----+-----+
|    1| 5003|
|    0|17274|
+-----+-----+

In [17]:
user_df = build_features(df)
model_df = drop_multicollinear_features(user_df)
print('\nThe schema of the model for training:')
model_df.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Highly correlated features include:
['avgDailySongs', 'avgDailyThumbsUp', 'avgDailyAddPlaylist']

Features to keep:
avgDailySongs

The schema of the model for training:
root
 |-- Churn: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- latestLevel: integer (nullable = true)
 |-- locationEastNorthCentral: integer (nullable = true)
 |-- locationEastSouthCentral: integer (nullable = true)
 |-- locationMiddleAtlantic: integer (nullable = true)
 |-- locationMountain: integer (nullable = true)
 |-- locationNewEngland: integer (nullable = true)
 |-- locationPacific: integer (nullable = true)
 |-- locationSouthAtlantic: integer (nullable = true)
 |-- locationWestNorthCentral: integer (nullable = true)
 |-- registDuration: double (nullable = true)
 |-- avgSessionDuration: double (nullable = true)
 |-- avgDailySongs: double (nullable = true)
 |-- avgDailyThumbsDown: double (nullable = true)
 |-- avgDailyUpgrade: double (nullable = true)
 |-- avgDailyDowngrade: double (nullabl

In [18]:
train, test = split_train_test(model_df, fraction=0.8)
num_cols = [field.name for field in model_df.schema.fields if field.dataType != IntegerType()]
bin_cols = [col for col in model_df.columns if col not in num_cols + ['Churn']]
pipeline_rf = build_pipeline(bin_cols, num_cols)
cv_rf = tune_rf(train, pipeline_rf)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total training time for hyperparameter tuning on random forest classifier: 4743 seconds

In [19]:
# store grid search results in a dataframe
params = [{p.name: v for p, v in m.items()} for m in cv_rf.getEstimatorParamMaps()]
params_pd = pd.DataFrame(params)
params_pd['AUC-PR'] = cv_rf.avgMetrics
params_pd

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

   numTrees  maxDepth    AUC-PR
0       100         4  0.728003
1       100         5  0.751440
2       200         4  0.732818
3       200         5  0.754571

In [20]:
test_prediction = cv_rf.transform(test)
print_metrics(test_prediction)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

         precision    recall  f1 score    AUC-PR
overall   0.863558  0.866794  0.851038  0.730413
class 0   0.870685  0.976224  0.920439          
class 1   0.836863  0.456875  0.591065

In [24]:
features = bin_cols + num_cols
importances = list(cv_rf.bestModel.stages[-1].featureImportances)
feat_imp_pd = pd.DataFrame({'feature': features, 'importance': importances}).sort_values('importance', ascending = False)
feat_imp_pd

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                     feature  importance
17            avgDailyAdvert    0.231323
10            registDuration    0.206582
14           avgDailyUpgrade    0.154782
13        avgDailyThumbsDown    0.117627
15         avgDailyDowngrade    0.087892
19             avgDailyError    0.061178
12             avgDailySongs    0.053208
18              avgDailyHelp    0.039779
16         avgDailyAddFriend    0.036493
11        avgSessionDuration    0.007662
1                latestLevel    0.002504
9   locationWestNorthCentral    0.000180
5           locationMountain    0.000171
3   locationEastSouthCentral    0.000119
7            locationPacific    0.000105
4     locationMiddleAtlantic    0.000103
6         locationNewEngland    0.000091
0                     gender    0.000083
8      locationSouthAtlantic    0.000061
2   locationEastNorthCentral    0.000058