## Preparing Code

In [1]:
from pyspark import SparkContext
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from udf_open_payments import *
from csv import reader
from tqdm import tqdm_notebook

# Obtain the Spark entry points used by every later cell: `sc` for the
# RDD API and `ss` for the DataFrame/SQL API. Both calls go through
# getOrCreate(), so re-running this cell does not construct a second context.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

NameError: name 'F' is not defined

# Functions

In [9]:
def split_comma(x):
    """
    Parse a single CSV-formatted line into the list of its field strings.

    Fields are separated by commas; a double-quoted field may itself
    contain commas.
    """
    parsed_rows = list(reader([x], delimiter=',', quotechar='"'))
    first_row = parsed_rows[0]
    return first_row

#################### UDF ########################

def is_na(x):
    """
    To be wrapped with udf.
    Returns 1 if the element in the column is null (None) and 0 otherwise.

    Only None counts as null: empty strings, 0 and other falsy values
    all map to 0.
    """
    # The previous implementation had the condition inverted and flagged
    # every non-null value as missing.
    return 1 if x is None else 0

# Column-level wrapper around is_na.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
isNa = F.udf(is_na)


def safe_double(x):
    """
    To be wrapped with udf.
    Converts `x` to a float when possible; otherwise returns `x` unchanged.

    None, empty strings and non-numeric strings are passed through as-is,
    so the result is either a float or the original value.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are tolerated; a bare `except` would also
        # hide interrupts and unrelated programming errors.
        return x

# Column-level wrapper around safe_double.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column is actually numeric rather than a string rendering of the float.
safeDouble = F.udf(safe_double)
    
def safe_int(x):
    """
    To be wrapped with udf.
    Converts `x` to an int when possible; otherwise returns `x` unchanged.

    None and strings that are not plain integers (e.g. 'abc', '3.5', '')
    are passed through as-is, so the result is either an int or the
    original value.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are tolerated; a bare `except` would also
        # hide interrupts and unrelated programming errors.
        return x
    
# Column-level wrapper around safe_int.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column is actually an integer column rather than a string one.
safeInt = F.udf(safe_int)
    
def str_to_bool(x):
    """
    To be wrapped with udf.
    Applied to a column, maps 'yes' (case-insensitive) to 1 and everything
    else to 0. None and other values without a `lower` method map to 0.
    """
    try:
        normalized = x.lower()
    except AttributeError:
        # None (or any non-string value) is treated as "no".
        return 0
    return 1 if normalized == 'yes' else 0

# Column-level wrapper around str_to_bool.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
strToBool = F.udf(str_to_bool)

def max_to_one(x):
    """
    To be wrapped with udf.
    Caps a value at 1: anything greater than 1 becomes 1, any other value
    is returned unchanged.

    Values that cannot be compared with a number (e.g. None) are returned
    unchanged as well.
    """
    try:
        exceeds_one = x > 1
    except TypeError:
        # Not comparable with an int: leave the value untouched.
        return x
    return 1 if exceeds_one else x

# Column-level wrapper around max_to_one.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
maxToOne = F.udf(max_to_one)

def replace_na(x):
    """
    To be wrapped with udf.
    Substitutes the placeholder string 'blank' for null (None) entries and
    leaves every other value, including empty strings, untouched.
    """
    return 'blank' if x is None else x

# Column-level wrapper around replace_na.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
replaceNA = F.udf(replace_na)

def check_blanks(x):
    """
    To be wrapped with udf.
    Maps null (None) and empty values to the placeholder string 'blank';
    every other value is returned unchanged.

    Values that have no length (e.g. numbers) are returned unchanged.
    """
    # None must be tested before len(): previously len(None) raised, the
    # error was swallowed, and None was returned instead of 'blank'.
    if x is None:
        return 'blank'
    try:
        is_empty = len(x) == 0
    except TypeError:
        return x
    return 'blank' if is_empty else x
    
# Column-level wrapper around check_blanks.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
checkBlankUdf = F.udf(check_blanks)

def date_to_day(x):
    """
    To be wrapped with udf.
    Extracts the day from a date in the format MM/DD/YYYY as an int.

    If `x` is None, has no second '/'-separated field, or that field is not
    an integer, `x` is returned unchanged.
    """
    try:
        parts = x.split('/')
        return int(parts[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Malformed or missing date: pass the raw value through.
        return x

# Column-level wrapper around date_to_day.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column is actually an integer column rather than a string one.
dateToDay = F.udf(date_to_day)

def date_to_month(x):
    """
    To be wrapped with udf.
    Extracts the month from a date in the format MM/DD/YYYY as an int.

    If `x` is None or its first '/'-separated field is not an integer,
    `x` is returned unchanged.
    """
    try:
        parts = x.split('/')
        return int(parts[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Malformed or missing date: pass the raw value through.
        return x

# Column-level wrapper around date_to_month.
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column is actually an integer column rather than a string one.
dateToMonth = F.udf(date_to_month)

def to_buckets_p(x, limits=[35.71, 112.22, 325.0]):
    """
    To be wrapped with udf.
    For physicians: maps an amount to the index of its bucket.

    `limits` holds the upper bounds (exclusive) of the buckets in ascending
    order. The result is the index of the first limit that `x` is below, or
    len(limits) when `x` is not below any of them. None is returned for a
    None input.

    The default list is only read, never mutated.
    """
    if x is None:
        return None
    for i, l in enumerate(limits):
        if x < l:
            return i
    # Fall through only after every limit has been checked; the previous
    # version returned from inside the loop and never looked past limits[0].
    return len(limits)

# Column-level wrapper around to_buckets_p (default limits are used, since
# a udf call only supplies the column value).
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
toBucketsP = F.udf(to_buckets_p)

def to_buckets_h(x, limits=[1866.64, 7170.82, 29388.13]):
    """
    To be wrapped with udf.
    For hospitals: maps an amount to the index of its bucket.

    `limits` holds the upper bounds (exclusive) of the buckets in ascending
    order. The result is the index of the first limit that `x` is below, or
    len(limits) when `x` is not below any of them. None is returned for a
    None input.

    The default list is only read, never mutated.
    """
    if x is None:
        return None
    for i, l in enumerate(limits):
        if x < l:
            return i
    # Fall through only after every limit has been checked; the previous
    # version returned from inside the loop and never looked past limits[0].
    return len(limits)

# Column-level wrapper around to_buckets_h (default limits are used, since
# a udf call only supplies the column value).
# NOTE(review): no returnType is passed to F.udf; confirm the resulting
# column type is what downstream code expects.
toBucketsH = F.udf(to_buckets_h)

############ GENERAL USE ##############

def showMode(df,cols):
    """
    Prints, for each column in `cols`, its value counts sorted from most to
    least frequent (so the mode is the first row shown).

    df   : DataFrame to inspect.
    cols : iterable of column names of `df`.
    """
    # `cols` used to be overwritten with a hard-coded list here, which made
    # the argument meaningless; the caller's columns are now honoured.
    for col in cols:
        df.groupBy(col).count().orderBy('count',ascending=False).show()



############ DF COLUMNS FUNCTIONS #################

def TransformColumn(df, cols, userDefinedFunction, newCol=None):
    """
    Applies `userDefinedFunction` to each column of `df` listed in `cols`
    and returns the resulting DataFrame.

    With a `newCol` dictionary:
            - The source columns are kept and each result is stored under
              the name the dictionary gives for its source column.
                Format ex:

                    {'oldCol1':'newCol1','oldCol2':'newCol2',..,'oldColN':'newColN'} (N cols given)

    Without `newCol`:
            - Each source column is dropped and the transformed column
              takes over its name.

    """
    result = df
    for name in cols:
        if newCol is not None:
            target = newCol[name]
            result = result.withColumn(target, userDefinedFunction(result[name]))
            continue

        # Build the result under a temporary name, remove the source
        # column, then hand the original name to the transformed column.
        staging = '%s_idx' % name
        result = result.withColumn(staging, userDefinedFunction(result[name]))
        result = result.drop(name)
        result = result.withColumnRenamed(staging, name)

    return result
        
def indexStringColumns(df, cols):
    """
    Modified from ex2 of Lesson5

    Replaces each column in `cols` by its StringIndexer output and returns
    a tuple (indexed DataFrame, {column name: labels of the fitted model}).
    """
    labels_mapping = {}
    indexed = df
    for name in cols:
        numeric_name = name + "-num"
        # One indexer is fitted per column, on the frame produced so far.
        indexer = StringIndexer(inputCol=name, outputCol=numeric_name)
        model = indexer.setHandleInvalid("keep").fit(indexed)
        labels_mapping[name] = model.labels
        # Swap the original column for its indexed version under the same name.
        indexed = model.transform(indexed).drop(name)
        indexed = indexed.withColumnRenamed(numeric_name, name)
    return indexed, labels_mapping

def oneHotEncodeColumns(df, cols, dropLast=False):
    """
    Taken from ex2 of Lesson5

    Replaces each column in `cols` by its one-hot encoded vector, keeping
    the column name, and prints each column name as it is processed.

    dropLast : forwarded to OneHotEncoder (whether to drop the last
               category in the encoded vector).

    NOTE(review): the encoder is used directly as a transformer (no fit);
    confirm this matches the installed Spark version's OneHotEncoder API.
    """
    encoded = df
    for name in cols:
        print(name)
        vector_name = name + "-onehot"
        encoder = OneHotEncoder(inputCol=name, outputCol=vector_name, dropLast=dropLast)
        # Swap the index column for its encoded vector under the same name.
        encoded = encoder.transform(encoded).drop(name)
        encoded = encoded.withColumnRenamed(vector_name, name)
    return encoded

def permuteEntries(df):
    """
    Returns a DataFrame in which every column of `df` has been passed
    through the per-column shuffle step below, shown with a notebook
    progress bar.

    Each column is rebuilt from a randomly ordered projection of itself and
    put back under its original name (columns end up re-appended, so the
    column order of the result follows the processing order).

    NOTE(review): the shuffled values come from a separately sorted
    projection that is attached with withColumn; whether Spark really
    reorders the values relative to the other columns is not verified
    here — confirm that the output rows differ from the input rows.
    """
    cols = df.columns
    dfnew = df

    print('Permuting column elemements...')
    for col in tqdm_notebook(cols):
        # Build on `dfnew`, not `df`: previously every iteration restarted
        # from the original frame, so only the last column's transformation
        # survived in the returned DataFrame.
        shuffled = dfnew.select(col).orderBy(F.rand())[col]
        dfnew = dfnew.withColumn('aux', shuffled)\
                     .drop(col)\
                     .withColumnRenamed('aux', col)
    return dfnew

# Loading Data

In [4]:
# Read the raw CSV as an RDD of text lines.
input_file = "s3://msds697-openpayments/research_combined_filtered_reduced/part-00000-bc066200-ab7f-452f-ad34-865ce538cca5-c000.csv"
rdd = sc.textFile(input_file)

# The first line is the header: keep it both raw (to filter it out of the
# data) and parsed (to name the DataFrame columns).
header_lines = rdd.take(1)
columns_str = header_lines[0]
columns = rdd.map(split_comma).take(1)

# Every line except the header becomes one row; all fields stay as parsed by split_comma.
data_lines = rdd.filter(lambda line: line != columns_str)
df = data_lines.map(split_comma).toDF(columns[0])

# df.cache()
# df.show(1)

# Feature Selection (depending on task)

In [6]:
# Columns kept for the anomaly-detection task. Entries that are commented
# out are deliberately excluded from the selection.
col_anomaly_detection = [
    'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country',
     'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State',
#      'Change_Type',
     'Charity_Indicator',
     'City_of_Travel',
#      'ClinicalTrials_Gov_Identifier',
     'Country_of_Travel',
     'Covered_Recipient_Type',
     'Covered_or_Noncovered_Indicator_1',
     'Covered_or_Noncovered_Indicator_2',
     'Covered_or_Noncovered_Indicator_3',
     'Covered_or_Noncovered_Indicator_4',
     'Covered_or_Noncovered_Indicator_5',
     'Date_of_Payment',  # To be converted into 2 columns of month, day
     'Expenditure_Category1',
     'Expenditure_Category2',
     'Expenditure_Category3',
     'Expenditure_Category4',
     'Expenditure_Category5',
     'Expenditure_Category6',
     'Form_of_Payment_or_Transfer_of_Value',
     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1',
     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2',
     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3',
     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4',
     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5',
     'Nature_of_Payment_or_Transfer_of_Value',
     'Number_of_Payments_Included_in_Total_Amount',
     'Physician_License_State_code1',
     'Physician_Ownership_Indicator',
     'Physician_Primary_Type',
#      'Physician_Profile_ID',
     'Physician_Specialty',
     'Preclinical_Research_Indicator',
     'Principal_Investigator_1_City',
     'Principal_Investigator_1_Country',
     'Principal_Investigator_1_License_State_code1',
     'Principal_Investigator_1_Postal_Code',
     'Principal_Investigator_1_Primary_Type',
#      'Principal_Investigator_1_Profile_ID',
     'Principal_Investigator_1_Province',
     'Principal_Investigator_1_Specialty',
     'Principal_Investigator_1_State',
     'Principal_Investigator_1_Zip_Code',
     'Principal_Investigator_2_City',
     'Principal_Investigator_2_Country',
     'Principal_Investigator_2_License_State_code1',
     'Principal_Investigator_2_Postal_Code',
     'Principal_Investigator_2_Primary_Type',
#      'Principal_Investigator_2_Profile_ID',
     'Principal_Investigator_2_Province',
     'Principal_Investigator_2_Specialty',
     'Principal_Investigator_2_State',
     'Principal_Investigator_2_Zip_Code',
     'Principal_Investigator_3_City',
     'Principal_Investigator_3_Country',
     'Principal_Investigator_3_License_State_code1',
     'Principal_Investigator_3_Postal_Code',
     'Principal_Investigator_3_Primary_Type',
#      'Principal_Investigator_3_Profile_ID',
     'Principal_Investigator_3_Province',
     'Principal_Investigator_3_Specialty',
     'Principal_Investigator_3_State',
     'Principal_Investigator_3_Zip_Code',
     'Principal_Investigator_4_City',
     'Principal_Investigator_4_Country',
     'Principal_Investigator_4_License_State_code1',
     'Principal_Investigator_4_Postal_Code',
     'Principal_Investigator_4_Primary_Type',
#      'Principal_Investigator_4_Profile_ID',
     'Principal_Investigator_4_Province',
     'Principal_Investigator_4_Specialty',
     'Principal_Investigator_4_State',
     'Principal_Investigator_4_Zip_Code',
     'Principal_Investigator_5_City',
     'Principal_Investigator_5_Country',
     'Principal_Investigator_5_License_State_code1',
     'Principal_Investigator_5_Postal_Code',
     'Principal_Investigator_5_Primary_Type',
#      'Principal_Investigator_5_Profile_ID',
     'Principal_Investigator_5_Province',
     'Principal_Investigator_5_Specialty',
     'Principal_Investigator_5_State',
     'Principal_Investigator_5_Zip_Code',
     'Product_Category_or_Therapeutic_Area_1',
     'Product_Category_or_Therapeutic_Area_2',
     'Product_Category_or_Therapeutic_Area_3',
     'Product_Category_or_Therapeutic_Area_4',
     'Product_Category_or_Therapeutic_Area_5',
     'Product_Indicator',
     'Program_Year',
     'Recipient_City',
     'Recipient_Country',
     'Recipient_State',
     'Recipient_Zip_Code',
     'Related_Product_Indicator',
     'State_of_Travel',
#      'Teaching_Hospital_ID',
     'Third_Party_Equals_Covered_Recipient_Indicator',
     'Third_Party_Payment_Recipient_Indicator',
     'Total_Amount_of_Payment_USDollars',
#      '_id',
     'type'
]


# Column selection used for the anomaly task.
keep_col = col_anomaly_detection

# Pending: Contextual_Information
# Keep only the selected columns and discard rows whose payment amount is zero.
selected = df.select(*keep_col)
df1 = selected.filter(df['Total_Amount_of_Payment_USDollars'] != 0.0)

# df1.printSchema()

# Processing for anomaly detection

Given that our data has categorical values, we can't directly apply K-means.

In [10]:
######## Date_of_Payment -> separate 'Day' and 'Month' columns
date_source = ['Date_of_Payment']

df2 = TransformColumn(df=df1,
                      cols=date_source,
                      userDefinedFunction=dateToDay,
                      newCol={'Date_of_Payment': 'Day'})

# The original date column is only needed until the month has been extracted.
df3_with_date = TransformColumn(df=df2,
                                cols=date_source,
                                userDefinedFunction=dateToMonth,
                                newCol={'Date_of_Payment': 'Month'})
df3 = df3_with_date.drop('Date_of_Payment')

######## Date parts and counters -> integers

str2int_cols = ['Month', 'Day', 'Program_Year', 'Number_of_Payments_Included_in_Total_Amount']

df4 = TransformColumn(df=df3, cols=str2int_cols, userDefinedFunction=safeInt)

######## Nulls and blanks -> placeholder string, on every column

check_blanks_cols = df4.columns

df5 = TransformColumn(df=df4, cols=check_blanks_cols, userDefinedFunction=checkBlankUdf)

######## Label the untouched rows 0 and the permuted rows 1

df6_not_permuted = df5.withColumn('label', F.lit(0))
df6_permuted = permuteEntries(df5).withColumn('label', F.lit(1))

######## Stack permuted rows on top of the untouched ones

df7 = df6_permuted.union(df6_not_permuted)

Permuting column elemements...


HBox(children=(IntProgress(value=0, max=94), HTML(value=u'')))




In [14]:
# Columns left out of the feature set.
drop_col = [
    'Charity_Indicator',
    'City_of_Travel',
    'Country_of_Travel',
    'Covered_or_Noncovered_Indicator_1',
    'Covered_or_Noncovered_Indicator_2',
    'Covered_or_Noncovered_Indicator_3',
    'Covered_or_Noncovered_Indicator_4',
    'Covered_or_Noncovered_Indicator_5',
    'Expenditure_Category6',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5',
    'Nature_of_Payment_or_Transfer_of_Value',
    'Physician_Ownership_Indicator',
    'Principal_Investigator_1_City',
    'Principal_Investigator_1_Postal_Code',
    'Principal_Investigator_1_Province',
    'Principal_Investigator_1_Zip_Code',
    'Principal_Investigator_2_City',
    'Principal_Investigator_2_Zip_Code',
    'Principal_Investigator_4_Postal_Code',
    'Principal_Investigator_5_Postal_Code',
    'Principal_Investigator_5_Province',
    'Product_Category_or_Therapeutic_Area_1',
    'Product_Category_or_Therapeutic_Area_2',
    'Product_Category_or_Therapeutic_Area_3',
    'Product_Category_or_Therapeutic_Area_4',
    'Product_Category_or_Therapeutic_Area_5',
    'Related_Product_Indicator',
    'State_of_Travel',
    'Third_Party_Equals_Covered_Recipient_Indicator',
    'type',
    'Number_of_Payments_Included_in_Total_Amount',
    'Recipient_Zip_Code'
]
# Features are all columns of df7 other than the label and the dropped ones,
# in df7's column order.
excluded_cols = set(drop_col) | set(['label'])
feature_col = [name for name in df7.columns if name not in excluded_cols]

# Concatenates the per-feature columns into a single 'features' vector column.
va = VectorAssembler(outputCol='features', inputCols=feature_col)

In [15]:
%%time
# Index every feature column (all are treated as categorical strings) and
# keep the label lists so indices can be mapped back to the original values.
cols_to_index = [name for name in feature_col if name not in drop_col]
df8, labels_mapping = indexStringColumns(df=df7.drop(*drop_col), cols=cols_to_index)

CPU times: user 1.72 s, sys: 167 ms, total: 1.89 s
Wall time: 19min 16s


In [16]:
# One-hot encode each indexed feature column (the helper prints the column names).
cols_to_encode = [name for name in feature_col if name not in drop_col]
df9 = oneHotEncodeColumns(df=df8, cols=cols_to_encode)

Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country
Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State
Covered_Recipient_Type
Expenditure_Category1
Expenditure_Category2
Expenditure_Category3
Expenditure_Category4
Expenditure_Category5
Form_of_Payment_or_Transfer_of_Value
Physician_License_State_code1
Physician_Primary_Type
Physician_Specialty
Preclinical_Research_Indicator
Principal_Investigator_1_Country
Principal_Investigator_1_License_State_code1
Principal_Investigator_1_Primary_Type
Principal_Investigator_1_Specialty
Principal_Investigator_1_State
Principal_Investigator_2_Country
Principal_Investigator_2_License_State_code1
Principal_Investigator_2_Postal_Code
Principal_Investigator_2_Primary_Type
Principal_Investigator_2_Province
Principal_Investigator_2_Specialty
Principal_Investigator_2_State
Principal_Investigator_3_City
Principal_Investigator_3_Country
Principal_Investigator_3_License_State_code1
Principal_Investigator_3_Postal_Code
Principal_Investig

In [17]:
# Assemble the one-hot vectors into the single 'features' column configured on `va`.
df10 = va.transform(df9)

In [18]:
# Mark df10 for persistence on disk only, since it is reused by every model fit below.
df10.persist(StorageLevel.DISK_ONLY)

DataFrame[label: int, Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country: vector, Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State: vector, Covered_Recipient_Type: vector, Expenditure_Category1: vector, Expenditure_Category2: vector, Expenditure_Category3: vector, Expenditure_Category4: vector, Expenditure_Category5: vector, Form_of_Payment_or_Transfer_of_Value: vector, Physician_License_State_code1: vector, Physician_Primary_Type: vector, Physician_Specialty: vector, Preclinical_Research_Indicator: vector, Principal_Investigator_1_Country: vector, Principal_Investigator_1_License_State_code1: vector, Principal_Investigator_1_Primary_Type: vector, Principal_Investigator_1_Specialty: vector, Principal_Investigator_1_State: vector, Principal_Investigator_2_Country: vector, Principal_Investigator_2_License_State_code1: vector, Principal_Investigator_2_Postal_Code: vector, Principal_Investigator_2_Primary_Type: vector, Principal_Investigator_2_Province: vector, 

In [19]:
# Rows not labelled 1 (i.e. not built from the permuted copy), without the label column.
unpermuted_rows = df10.filter('label != 1')
df_analyse = unpermuted_rows.drop('label')

In [23]:
# Random forest that separates original rows (label 0) from permuted rows (label 1).
rf = RandomForestClassifier(
    labelCol='label',
    featuresCol="features",
    numTrees=10,
    maxDepth=10
)

In [24]:
# Train the random forest on the assembled data.
# In the recorded run this call failed: YARN killed an executor for
# exceeding its memory limit (see the Py4JJavaError output of this cell).
rfm = rf.fit(df10)

Py4JJavaError: An error occurred while calling o5138.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 26 in stage 123.0 failed 4 times, most recent failure: Lost task 26.3 in stage 123.0 (TID 5290, ip-172-31-31-183.us-west-2.compute.internal, executor 105): ExecutorLostFailure (executor 105 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 4.5 GB of 4.4 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1791)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1790)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2024)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1962)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3278)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2703)
	at org.apache.spark.ml.classification.Classifier.getNumClasses(Classifier.scala:111)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:121)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [20]:
from pyspark.ml.classification import LogisticRegression

In [21]:
# Logistic regression with the regularisation settings below; the label and
# feature column names are not passed, so the estimator's defaults apply.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [22]:
%%time
# Train the logistic regression on the assembled data.
# In the recorded run this call failed: YARN killed an executor for
# exceeding its memory limit (see the Py4JJavaError output of this cell).
lrModel = lr.fit(df10)

Py4JJavaError: An error occurred while calling o4988.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 17 in stage 122.0 failed 4 times, most recent failure: Lost task 17.3 in stage 122.0 (TID 5000, ip-172-31-25-58.us-west-2.compute.internal, executor 43): ExecutorLostFailure (executor 43 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 4.4 GB of 4.4 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1791)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1790)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2024)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1962)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2131)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:518)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:488)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:278)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [23]:
from pyspark.ml.classification import DecisionTreeClassifier

In [24]:
# Single decision tree with every estimator parameter left at its default.
dt = DecisionTreeClassifier()

In [25]:
%%time
# Train the decision tree on the assembled data.
# In the recorded run this call failed: YARN killed an executor for
# exceeding its memory limit (see the Py4JJavaError output of this cell).
dtModel = dt.fit(df10)

Py4JJavaError: An error occurred while calling o5076.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 37 in stage 124.0 failed 4 times, most recent failure: Lost task 37.3 in stage 124.0 (TID 5178, ip-172-31-25-58.us-west-2.compute.internal, executor 77): ExecutorLostFailure (executor 77 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 4.8 GB of 4.4 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1791)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1790)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2024)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1962)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3278)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2489)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2703)
	at org.apache.spark.ml.classification.Classifier.getNumClasses(Classifier.scala:111)
	at org.apache.spark.ml.classification.DecisionTreeClassifier.train(DecisionTreeClassifier.scala:102)
	at org.apache.spark.ml.classification.DecisionTreeClassifier.train(DecisionTreeClassifier.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [26]:
from pyspark.ml.classification import GBTClassifier

In [27]:
# Gradient-boosted trees limited to 10 boosting iterations; other parameters are defaults.
gbt = GBTClassifier(maxIter=10)

In [28]:
%%time
# Train the gradient-boosted trees on the assembled data.
# In the recorded run this call failed: YARN killed an executor for
# exceeding its memory limit (see the Py4JJavaError output of this cell).
gbtModel = gbt.fit(df10)

Py4JJavaError: An error occurred while calling o5165.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 128.0 failed 4 times, most recent failure: Lost task 1.3 in stage 128.0 (TID 5310, ip-172-31-16-224.us-west-2.compute.internal, executor 118): ExecutorLostFailure (executor 118 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 4.5 GB of 4.4 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1791)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1790)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2024)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1962)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1168)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:118)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:105)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:127)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:297)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:55)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:179)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [29]:
from pyspark.ml.classification import LinearSVC

In [30]:
# Linear SVM estimator: at most 10 optimizer iterations, regularization
# parameter 0.1; every other setting is left at its library default.
lsvc = LinearSVC(
    regParam=0.1,
    maxIter=10,
)

In [32]:
%%time
# Fit the LinearSVC estimator on df10. In the recorded run below this call
# raised Py4JJavaError: YARN killed an executor container for exceeding its
# memory limit (4.7 GB used of 4.4 GB), and the error message itself suggests
# boosting spark.yarn.executor.memoryOverhead.
lsvcModel = lsvc.fit(df10)

Py4JJavaError: An error occurred while calling o5253.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 39 in stage 129.0 failed 4 times, most recent failure: Lost task 39.3 in stage 129.0 (TID 5469, ip-172-31-23-74.us-west-2.compute.internal, executor 157): ExecutorLostFailure (executor 157 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits. 4.7 GB of 4.4 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1803)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1791)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1790)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1790)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2024)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1962)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2131)
	at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1098)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1161)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:188)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:72)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Shut down the SparkContext obtained in the first cell
# (sc = SparkContext.getOrCreate()) now that the notebook is finished.
sc.stop()