In [None]:
# Point findspark at the local Spark installation before any pyspark module
# is imported. The path is machine-specific and must be adjusted per host.
import findspark
findspark.init("/home/raj/spark/")

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorIndexer


def GBT_classifier(data_path="/home/raj/Downloads/notenook/adult2.csv"):
    """Train and evaluate a gradient-boosted tree classifier on a CSV file.

    The CSV is expected to have a header row and to contain the columns
    ``workclass``, ``occupation``, ``age``, ``hours_per_week`` and ``income``.
    The two categorical columns are string-indexed and one-hot encoded, the
    ``income`` column is string-indexed into the label, and everything is
    assembled into a single feature vector. The data is then split 70/30,
    a ``GBTClassifier`` is fitted on the training part, and the test error
    (``1 - accuracy``) is printed.

    Intermediate DataFrames and schemas are printed along the way for
    inspection.

    Args:
        data_path: Location of the CSV file to load. Defaults to the path
            originally hard-coded in this notebook.

    Returns:
        The fitted GBT classification model.
    """
    spark = SparkSession.builder.appName("Gbt_classifier").getOrCreate()

    # Load the CSV, taking column names from the header row and letting
    # Spark infer the column types.
    data = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(data_path)
    )
    data.show()

    categoricalColumns = ["workclass", "occupation"]
    numericCols = ["age", "hours_per_week"]

    # For every categorical column: string -> "<col>Index" -> "<col>classVec".
    stages = []
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(
            inputCol=categoricalCol, outputCol=categoricalCol + "Index")
        encoder = OneHotEncoder(
            inputCol=categoricalCol + "Index",
            outputCol=categoricalCol + "classVec")
        stages += [stringIndexer, encoder]

    # Index the label column. It is fitted on the whole dataset (before the
    # train/test split) so that every label value is present in the index.
    label_stringIdx = StringIndexer(inputCol="income", outputCol="labelindex")
    stages.append(label_stringIdx)

    # Combine the one-hot vectors and the numeric columns into "features".
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages.append(assembler)

    # Run the feature transformations.
    #  - fit() computes feature statistics as needed.
    #  - transform() actually transforms the features.
    pipelineModel = Pipeline(stages=stages).fit(data)
    dataset = pipelineModel.transform(data)

    # Keep the label, the feature vector and all original columns; the
    # intermediate "*Index" / "*classVec" columns are dropped.
    selectedcols = ["labelindex", "features"] + data.columns
    dataset = dataset.select(selectedcols)
    dataset.printSchema()

    # Automatically identify categorical features, and index them.
    # maxCategories=4: features with > 4 distinct values are treated as
    # continuous.
    featureIndexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures",
        maxCategories=4).fit(dataset)
    print(featureIndexer)

    featuredf = featureIndexer.transform(dataset)
    featuredf.show()

    # Split the data into training and test sets (30% held out for testing).
    trainingData, testData = featuredf.randomSplit([0.7, 0.3], seed=100)
    trainingData.show()
    testData.show()

    gbt = GBTClassifier(
        labelCol="labelindex", featuresCol="indexedFeatures", maxIter=10)

    # Fit the classifier directly. Both splits already carry the
    # "indexedFeatures" column produced above, so running featureIndexer
    # again inside a second Pipeline would try to add an output column that
    # already exists, which Spark rejects.
    gbtModel = gbt.fit(trainingData)

    # Make predictions on the held-out data.
    predictions = gbtModel.transform(testData)
    predictions.printSchema()

    # Select example rows to display.
    predictions.select(
        "prediction", "labelindex", "features",
        "probability", "rawPrediction").show(5)

    # Compare (prediction, true label) and report the test error.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="labelindex", predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    return gbtModel

In [None]:
# Run the whole load / train / evaluate workflow. The call returns the fitted
# GBT model, which the notebook displays as the cell result.
GBT_classifier()