In [101]:
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
import os
from pyspark.sql.functions import *
import pyspark.storagelevel 
import numpy as np
import pandas as pd
import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pdb
import subprocess # Used for executing linux commands, for writing to teradata
import sys

from pyspark.sql.types import *
from pyspark.sql.functions import col, udf, min, max
from datetime import datetime, timedelta

sqlContext = SQLContext(sc)
sqlContext = HiveContext(sc)

In [27]:
schema = StructType([StructField("state", StringType(), True),
                     StructField("account_length", DoubleType(), True),
                     StructField("area_code", StringType(), True),
                     StructField("phone_number", StringType(), True),
                     StructField("international_plan", StringType(), True),
                     StructField("voice_mail_plan", StringType(), True),
                     StructField("number_vmail_messages", DoubleType(), True),
                     StructField("total_day_minutes", DoubleType(), True),
                     StructField("total_day_calls", DoubleType(), True),
                     StructField("total_day_charge", DoubleType(), True),
                     StructField("total_eve_minutes", DoubleType(), True),
                     StructField("total_eve_calls", DoubleType(), True),
                     StructField("total_eve_charge", DoubleType(), True),
                     StructField("total_night_minutes", DoubleType(), True),
                     StructField("total_night_calls", DoubleType(), True),
                     StructField("total_night_charge", DoubleType(), True),
                     StructField("total_intl_minutes", DoubleType(), True),
                     StructField("total_intl_calls", DoubleType(), True),
                     StructField("total_intl_charge", DoubleType(), True),
                     StructField("number_customer_service_calls", DoubleType(), True),
                     StructField("churned", StringType(), True)])

In [28]:
df = sqlContext.read.format('com.databricks.spark.csv').load('file:///home/vijay/DATA_SCIENCE/ds-for-telco-master/data/churn.all',schema=schema)
df.show(3)

+-----+--------------+---------+------------+------------------+---------------+---------------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-------------------+-----------------+------------------+------------------+----------------+-----------------+-----------------------------+-------+
|state|account_length|area_code|phone_number|international_plan|voice_mail_plan|number_vmail_messages|total_day_minutes|total_day_calls|total_day_charge|total_eve_minutes|total_eve_calls|total_eve_charge|total_night_minutes|total_night_calls|total_night_charge|total_intl_minutes|total_intl_calls|total_intl_charge|number_customer_service_calls|churned|
+-----+--------------+---------+------------+------------------+---------------+---------------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-------------------+-----------------+------------------+------------------+------------

In [14]:
#what is VectorAssembler?
#A feature transformer that merges multiple columns into a vector column.
df = spark.createDataFrame([(1, 0, 3)], ["a", "b", "c"])
print(df.show())
vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")
vecAssembler.transform(df).head().features

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  0|  3|
+---+---+---+

None


DenseVector([1.0, 0.0, 3.0])

In [149]:
# Assemble feature vectors
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=[
        'number_customer_service_calls', \
        'total_night_minutes', \
        'total_day_minutes', \
        'total_eve_minutes', \
        'account_length'],
        outputCol = 'features')

In [150]:
type(assembler)

pyspark.ml.feature.VectorAssembler

In [151]:
assembler.transform(df).take(1)

[Row(state=u'KS', account_length=128.0, area_code=u' 415', phone_number=u' 382-4657', international_plan=u' no', voice_mail_plan=u' yes', number_vmail_messages=25.0, total_day_minutes=265.1, total_day_calls=110.0, total_day_charge=45.07, total_eve_minutes=197.4, total_eve_calls=99.0, total_eve_charge=16.78, total_night_minutes=244.7, total_night_calls=91.0, total_night_charge=11.01, total_intl_minutes=10.0, total_intl_calls=3.0, total_intl_charge=2.7, number_customer_service_calls=1.0, churned=u' False.', features=DenseVector([1.0, 244.7, 265.1, 197.4, 128.0]))]

In [None]:
# feature vector is series of numbers containing, one row many columns  

In [152]:
# Transform labels
from pyspark.ml.feature import StringIndexer
label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')

In [153]:
# Fit the model
#from pyspark.ml import Pipeline?
#label_indexer.getParam('label')
indexed = label_indexer.fit(df).transform(df)
indexed.take(1)

[Row(state=u'KS', account_length=128.0, area_code=u' 415', phone_number=u' 382-4657', international_plan=u' no', voice_mail_plan=u' yes', number_vmail_messages=25.0, total_day_minutes=265.1, total_day_calls=110.0, total_day_charge=45.07, total_eve_minutes=197.4, total_eve_calls=99.0, total_eve_charge=16.78, total_night_minutes=244.7, total_night_calls=91.0, total_night_charge=11.01, total_intl_minutes=10.0, total_intl_calls=3.0, total_intl_charge=2.7, number_customer_service_calls=1.0, churned=u' False.', label=0.0)]

In [154]:
from pyspark.ml import Pipeline

Init signature: Pipeline(*args, **kwargs)
Docstring:     
A simple pipeline, which acts as an estimator. A Pipeline consists
of a sequence of stages, each of which is either an
:py:class:`Estimator` or a :py:class:`Transformer`. When
:py:meth:`Pipeline.fit` is called, the stages are executed in
order. If a stage is an :py:class:`Estimator`, its
:py:meth:`Estimator.fit` method will be called on the input
dataset to fit a model. Then the model, which is a transformer,
will be used to transform the dataset as the input to the next
stage. If a stage is a :py:class:`Transformer`, its
:py:meth:`Transformer.transform` method will be called to produce
the dataset for the next stage. The fitted model from a
:py:class:`Pipeline` is a :py:class:`PipelineModel`, which
consists of fitted models and transformers, corresponding to the
pipeline stages. If stages is an empty list, the pipeline acts as an
identity transformer.

In [155]:
from pyspark.ml.classification import RandomForestClassifier


In [156]:
classifier = RandomForestClassifier(labelCol = 'label', featuresCol = 'features')

In [157]:
type(classifier)

pyspark.ml.classification.RandomForestClassifier

In [158]:
pipeline = Pipeline(stages=[assembler, label_indexer, classifier])

In [159]:
(train, test) = df.randomSplit([0.6, 0.4])
model = pipeline.fit(train)

In [160]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [161]:
predictions = model.transform(train)
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.8644847307304276

In [162]:
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.7934181226447432

In [163]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s." % (auroc, aupr)

'The AUROC is 0.793418122645 and the AUPR is 0.609153765682.'

In [180]:
# REDO
# Assemble feature vectors


from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols = [
        'number_customer_service_calls', \
        'total_night_minutes', \
        'total_day_minutes', \
        'total_eve_minutes', \
        'account_length'],
    outputCol = 'churnFeatures')

# Transform labels
from pyspark.ml.feature import StringIndexer

label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')
# Fit the model
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(labelCol = 'label', featuresCol = 'churnFeatures')

pipeline = Pipeline(stages=[assembler, label_indexer, classifier])

(train, test) = df.randomSplit([0.7, 0.3])
model = pipeline.fit(train)


from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(train)
print(train.take(1))
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)


[Row(state=u'AK', account_length=1.0, area_code=u' 408', phone_number=u' 373-1028', international_plan=u' no', voice_mail_plan=u' no', number_vmail_messages=0.0, total_day_minutes=175.2, total_day_calls=74.0, total_day_charge=29.78, total_eve_minutes=151.7, total_eve_calls=79.0, total_eve_charge=12.89, total_night_minutes=230.5, total_night_calls=109.0, total_night_charge=10.37, total_intl_minutes=5.3, total_intl_calls=3.0, total_intl_charge=1.43, number_customer_service_calls=1.0, churned=u' False.')]


0.861258790887504

In [176]:
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s." % (auroc, aupr)

'The AUROC is 0.798443360551 and the AUPR is 0.61261371153.'

In [202]:
predictions.take(2)

[Row(state=u'IN', account_length=65, area_code=415, phone_number=-6274, international_plan=u'no', voice_mail_plan=u'no', number_vmail_messages=0, total_day_minutes=129.1, total_day_calls=137, total_day_charge=21.95, total_eve_minutes=228.5, total_eve_calls=83, total_eve_charge=19.42, total_night_minutes=208.8, total_night_calls=111, total_night_charge=9.4, total_intl_minutes=12.7, total_intl_calls=6, total_intl_charge=3.43, number_customer_service_calls=4, churnFeatures=DenseVector([4.0, 208.8, 129.1, 228.5, 65.0]), rawPrediction=DenseVector([2.1381, 17.8619]), probability=DenseVector([0.1069, 0.8931]), prediction=1.0)]

In [203]:
#df.columns
test.take(2)

[Row(state=u'AK', account_length=36.0, area_code=u' 408', phone_number=u' 341-9764', international_plan=u' no', voice_mail_plan=u' yes', number_vmail_messages=30.0, total_day_minutes=146.3, total_day_calls=128.0, total_day_charge=24.87, total_eve_minutes=162.5, total_eve_calls=80.0, total_eve_charge=13.81, total_night_minutes=129.3, total_night_calls=109.0, total_night_charge=5.82, total_intl_minutes=14.5, total_intl_calls=6.0, total_intl_charge=3.92, number_customer_service_calls=0.0, churned=u' False.'),
 Row(state=u'AK', account_length=36.0, area_code=u' 415', phone_number=u' 399-1526', international_plan=u' yes', voice_mail_plan=u' yes', number_vmail_messages=19.0, total_day_minutes=171.9, total_day_calls=96.0, total_day_charge=29.22, total_eve_minutes=198.4, total_eve_calls=111.0, total_eve_charge=16.86, total_night_minutes=321.7, total_night_calls=76.0, total_night_charge=14.48, total_intl_minutes=10.5, total_intl_calls=1.0, total_intl_charge=2.84, number_customer_service_calls=1

In [None]:
tdf = sqlContext.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "feature"])


In [200]:
tdf1 = sqlContext.createDataFrame([("IN", 65, 415, 329-6603, "no", "no", 0, 129.1, 137, 21.95, 228.5, 83, 19.42, 208.8, 111, 9.4, 12.7, 6, 3.43, 4)
                                  ],['state',
 'account_length',
 'area_code',
 'phone_number',
 'international_plan',
 'voice_mail_plan',
 'number_vmail_messages',
 'total_day_minutes',
 'total_day_calls',
 'total_day_charge',
 'total_eve_minutes',
 'total_eve_calls',
 'total_eve_charge',
 'total_night_minutes',
 'total_night_calls',
 'total_night_charge',
 'total_intl_minutes',
 'total_intl_calls',
 'total_intl_charge',
 'number_customer_service_calls'])
# notice there is no label

In [196]:
tdf2 = sqlContext.createDataFrame([("RI", 74, 415, 344-9403, "no", "no", 0, 187.7, 127, 31.91, 163.4, 148, 13.89, 196, 94, 8.82, 9.1, 5, 2.46, 0)
                                  ],['state',
 'account_length',
 'area_code',
 'phone_number',
 'international_plan',
 'voice_mail_plan',
 'number_vmail_messages',
 'total_day_minutes',
 'total_day_calls',
 'total_day_charge',
 'total_eve_minutes',
 'total_eve_calls',
 'total_eve_charge',
 'total_night_minutes',
 'total_night_calls',
 'total_night_charge',
 'total_intl_minutes',
 'total_intl_calls',
 'total_intl_charge',
 'number_customer_service_calls'])


In [197]:
predictions = model.transform(tdf2)
evaluator = BinaryClassificationEvaluator()
#auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
#aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
#"The AUROC is %s and the AUPR is %s." % (auroc, aupr)

In [198]:
type(predictions)

pyspark.sql.dataframe.DataFrame

In [199]:
predictions.take(2)

[Row(state=u'RI', account_length=74, area_code=415, phone_number=-9059, international_plan=u'no', voice_mail_plan=u'no', number_vmail_messages=0, total_day_minutes=187.7, total_day_calls=127, total_day_charge=31.91, total_eve_minutes=163.4, total_eve_calls=148, total_eve_charge=13.89, total_night_minutes=196, total_night_calls=94, total_night_charge=8.82, total_intl_minutes=9.1, total_intl_calls=5, total_intl_charge=2.46, number_customer_service_calls=0, churnFeatures=DenseVector([0.0, 196.0, 187.7, 163.4, 74.0]), rawPrediction=DenseVector([18.968, 1.032]), probability=DenseVector([0.9484, 0.0516]), prediction=0.0)]

In [201]:
predictions = model.transform(tdf1)
evaluator = BinaryClassificationEvaluator()
predictions.take(2)

[Row(state=u'IN', account_length=65, area_code=415, phone_number=-6274, international_plan=u'no', voice_mail_plan=u'no', number_vmail_messages=0, total_day_minutes=129.1, total_day_calls=137, total_day_charge=21.95, total_eve_minutes=228.5, total_eve_calls=83, total_eve_charge=19.42, total_night_minutes=208.8, total_night_calls=111, total_night_charge=9.4, total_intl_minutes=12.7, total_intl_calls=6, total_intl_charge=3.43, number_customer_service_calls=4, churnFeatures=DenseVector([4.0, 208.8, 129.1, 228.5, 65.0]), rawPrediction=DenseVector([2.1381, 17.8619]), probability=DenseVector([0.1069, 0.8931]), prediction=1.0)]

In [204]:
#IN, 65, 415, 329-6603, no, no, 0, 129.1, 137, 21.95, 228.5, 83, 19.42, 208.8, 111, 9.4, 12.7, 6, 3.43, 4, True.
#RI, 74, 415, 344-9403, no, no, 0, 187.7, 127, 31.91, 163.4, 148, 13.89, 196, 94, 8.82, 9.1, 5, 2.46, 0, False.

In [None]:
# AWESOME STATS WAY


In [205]:
churn_df = df

In [206]:
churn_df.show(2)

+-----+--------------+---------+------------+------------------+---------------+---------------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-------------------+-----------------+------------------+------------------+----------------+-----------------+-----------------------------+-------+
|state|account_length|area_code|phone_number|international_plan|voice_mail_plan|number_vmail_messages|total_day_minutes|total_day_calls|total_day_charge|total_eve_minutes|total_eve_calls|total_eve_charge|total_night_minutes|total_night_calls|total_night_charge|total_intl_minutes|total_intl_calls|total_intl_charge|number_customer_service_calls|churned|
+-----+--------------+---------+------------+------------------+---------------+---------------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-------------------+-----------------+------------------+------------------+------------

In [208]:
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

In [209]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [210]:
# this function handles categorical data

In [213]:
def create_category_vars(dataset,field_name): # passing full df and one field
    idx_col = field_name + "Index"
    col_vec = field_name + "Vec"
    
    month_stringIndexer = StringIndexer(inputCol=field_name,
                                       outputCol = idx_col)
    month_model = month_stringIndexer.fit(dataset)
    month_indexed = month_model.transform(dataset)
    
    month_encoder = OneHotEncoder(dropLast=True,
                                 inputCol=idx_col,
                                 outputCol = col_vec)
    return month_encoder.transform(month_indexed)

In [214]:
churn_df.columns

['state',
 'account_length',
 'area_code',
 'phone_number',
 'international_plan',
 'voice_mail_plan',
 'number_vmail_messages',
 'total_day_minutes',
 'total_day_calls',
 'total_day_charge',
 'total_eve_minutes',
 'total_eve_calls',
 'total_eve_charge',
 'total_night_minutes',
 'total_night_calls',
 'total_night_charge',
 'total_intl_minutes',
 'total_intl_calls',
 'total_intl_charge',
 'number_customer_service_calls',
 'churned']

In [225]:
churn_df_subset = churn_df.select('account_length',
                                'international_plan',
                                'voice_mail_plan',
                                'number_vmail_messages',
                                'total_day_minutes',
                                'total_eve_minutes',
                                'total_night_minutes',
                                'total_intl_minutes',
                                'number_customer_service_calls',
                                'churned')

In [226]:
churn_df_subset.columns

['account_length',
 'international_plan',
 'voice_mail_plan',
 'number_vmail_messages',
 'total_day_minutes',
 'total_eve_minutes',
 'total_night_minutes',
 'total_intl_minutes',
 'number_customer_service_calls',
 'churned']

In [227]:
# TRANSFORM THE LABELS OR TARGET VARIABLE
# EASY WAY TO CONVERT TRUE FALSE TO 0 & 1
churn_df_subset = churn_df_subset                       \
  .withColumn( "churn",
              (churn_df_subset['churned']== 'True.').cast('integer'))

churn_df_subset = churn_df_subset.drop('churned')
churn_df_subset.take(1)

[Row(account_length=128.0, international_plan=u' no', voice_mail_plan=u' yes', number_vmail_messages=25.0, total_day_minutes=265.1, total_eve_minutes=197.4, total_night_minutes=244.7, total_intl_minutes=10.0, number_customer_service_calls=1.0, churn=0)]

In [228]:
#Create categorical Variables
churn_vec = create_category_vars( churn_df_subset,
                               "international_plan" )

In [238]:
churn_vec.select(['churn','international_planIndex','international_planVec']).show(2)

+-----+-----------------------+---------------------+
|churn|international_planIndex|international_planVec|
+-----+-----------------------+---------------------+
|    0|                    0.0|        (1,[0],[1.0])|
|    0|                    0.0|        (1,[0],[1.0])|
+-----+-----------------------+---------------------+
only showing top 2 rows



In [239]:
churn_vec = create_category_vars( churn_vec,
                               "voice_mail_plan" )

In [241]:
churn_vec.select(['churn','international_planIndex','international_planVec','voice_mail_planIndex','voice_mail_planVec']).show(2)

+-----+-----------------------+---------------------+--------------------+------------------+
|churn|international_planIndex|international_planVec|voice_mail_planIndex|voice_mail_planVec|
+-----+-----------------------+---------------------+--------------------+------------------+
|    0|                    0.0|        (1,[0],[1.0])|                 1.0|         (1,[],[])|
|    0|                    0.0|        (1,[0],[1.0])|                 1.0|         (1,[],[])|
+-----+-----------------------+---------------------+--------------------+------------------+
only showing top 2 rows



In [243]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
def parseNewPoint( rec ):
    return LabeledPoint( float( rec[9] ),
          Vectors.dense(tuple( [rec[0],
                                rec[3],
                                rec[4],
                                rec[5],
                                rec[6],
                                rec[7],
                                rec[8] ] +
                                rec[11].toArray().tolist() +
                                rec[13].toArray().tolist() ) ) )

In [245]:
churn_vec_new = churn_vec.rdd.map( lambda rec:  # map is only available in rdd
                            parseNewPoint( rec ) )

In [247]:
churn_vec_new.take(2)

[LabeledPoint(0.0, [128.0,25.0,265.1,197.4,244.7,10.0,1.0,1.0,0.0]),
 LabeledPoint(0.0, [107.0,26.0,161.6,195.5,254.4,13.7,1.0,1.0,0.0])]

## Logistic Regression Model

In [248]:
trainingData, testData = churn_vec_new.randomSplit([0.7, 0.3])

In [249]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS,    \
                                       LogisticRegressionModel
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [250]:
lr_model = LogisticRegressionWithLBFGS.train( trainingData )

In [271]:
type(labelsAndPreds_lr)

pyspark.rdd.PipelinedRDD

In [268]:
labelsAndPreds_lr = testData.map(lambda lp:
                               ( float(lr_model.predict(lp.features) ), lp.label ) )

success_count_lr = labelsAndPreds_lr.filter(lambda rec:
                                            rec[0] == rec[1]).count()
print("Successful prediction percentage: " +
    str( round( success_count_lr / labelsAndPreds_lr.count(), 2 ) ) )

Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.col. Trace:
py4j.Py4JException: Method col([class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)



In [264]:
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics( labelsAndPreds_lr )

In [258]:
type(metrics)

pyspark.mllib.evaluation.MulticlassMetrics

In [266]:


def printMetric( metrics ):
    print( 'Precision of True ', metrics.precision(1) )

    print( 'Precision of False', metrics.precision(0) )
    print( 'Recall of True    ', metrics.recall(1) )
    print( 'Recall of False   ', metrics.recall(0) )
    print( 'F-1 Score         ', metrics.fMeasure() )
    print( 'Confusion Matrix\n', metrics.confusionMatrix().toArray() )



In [267]:
printMetric( metrics )

Py4JJavaError: An error occurred while calling o3402.precision.
: java.util.NoSuchElementException: key not found: 1.0
	at scala.collection.MapLike$class.default(MapLike.scala:228)
	at scala.collection.AbstractMap.default(Map.scala:59)
	at scala.collection.mutable.HashMap.apply(HashMap.scala:65)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.precision(MulticlassMetrics.scala:105)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
