In [5]:
# Point findspark at the local Spark installation before any pyspark import.
# NOTE(review): the path is machine-specific; adjust it to this machine's Spark home.
import findspark
findspark.init("/home/raj/spark/")

In [6]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

def Decision_tree(csv_path="/home/raj/Downloads/notenook/adult2.csv"):
    """Train and evaluate a depth-3 decision tree on the adult income data.

    The CSV is loaded with header and schema inference, ``workclass`` is
    string-indexed and one-hot encoded, ``income`` is string-indexed into
    ``label``, and the encoded ``workclass`` vector plus ``age`` and
    ``hours_per_week`` are assembled into ``features``. The data is split
    70/30 (seed 100), a ``DecisionTreeClassifier`` is fitted on the training
    part, and predictions on the test part are printed and scored.

    Side effects: gets or creates a SparkSession named "Decision tree",
    registers the temp view ``adult``, and prints the tree size, the
    prediction schemas, a sample of predictions and the evaluation score.

    Args:
        csv_path: Location of the CSV file to read. It must contain the
            columns ``workclass``, ``age``, ``hours_per_week`` and ``income``.
            Defaults to the path originally hard-coded in this notebook.

    Returns:
        The name of the metric used by the evaluator ("areaUnderROC").
        Note that the metric *value* is only printed, not returned.
    """
    spark = SparkSession.builder.appName("Decision tree").getOrCreate()

    # Load the raw data; the first row is the header and column types are inferred.
    ad_data = (
        spark.read
        .option("inferSchema", "true")
        .option("header", "true")
        .csv(csv_path)
    )

    # Expose the data as the temp view "adult" and read it back through the catalog.
    ad_data.createOrReplaceTempView("adult")
    dataset = spark.table("adult")
    cols = dataset.columns  # original column names, kept alongside label/features later

    # ---- Feature engineering stages -------------------------------------
    categoricalColumns = ["workclass"]
    stages = []
    for categoricalCol in categoricalColumns:
        # Map each distinct string to a numeric index, e.g. workclass -> workclassIndex.
        stringIndexer = StringIndexer(
            inputCol=categoricalCol, outputCol=categoricalCol + "Index"
        )
        # One-hot encode the index into a vector column, e.g. workclassclassVec.
        encoder = OneHotEncoder(
            inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec"
        )
        stages += [stringIndexer, encoder]

    # Convert the income string into numeric label indices.
    label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
    stages += [label_stringIdx]

    # Assemble the encoded categorical vectors and the numeric columns into one vector.
    numericCols = ["age", "hours_per_week"]
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]

    # fit() computes the statistics each stage needs; transform() applies them.
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(dataset)
    dataset = pipelineModel.transform(dataset)

    # Keep label, features and the original columns; drop the intermediate ones.
    selectedcols = ["label", "features"] + cols
    dataset = dataset.select(selectedcols)

    # Random, approximately 70/30 split; the seed makes it reproducible.
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

    # ---- Model ------------------------------------------------------------
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)
    dtModel = dt.fit(trainingData)

    print("numNodes = ", dtModel.numNodes)  # total number of nodes in the fitted tree
    print("depth = ", dtModel.depth)  # depth of the fitted tree

    # Score the held-out data; adds rawPrediction, probability and prediction columns.
    predictions = dtModel.transform(testData)
    predictions.printSchema()

    # Show the label next to the predicted class and its class probabilities.
    selected = predictions.select("label", "prediction", "probability", "age", "income")
    selected.printSchema()
    selected.show(truncate=False)

    # ---- Evaluation ---------------------------------------------------------
    # Columns and metric are spelled out; they match the evaluator's defaults.
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC"
    )

    print("evaluation")
    print(evaluator.evaluate(predictions))

    return evaluator.getMetricName()


In [7]:
# Run the whole pipeline; the printed diagnostics and the returned metric name
# (not the metric value) appear in the cell output.
Decision_tree()

numNodes =  15
depth =  3
root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: double (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: double (nullable = true)
 |-- capital_loss: double (nullable = true)
 |-- hours_per_week: double (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

root
 |-- label: double (nullable = true)
 |-- prediction: double (nullable = true)
 |-- probability: vector (nullable = true)
 |-- age: in

'areaUnderROC'