In [None]:
import findspark
# Point findspark at the local Spark installation before any pyspark import.
findspark.init(spark_home="/home/raj/spark/")

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def Multinomial_LogitRegression(data_path="/home/raj/Downloads/notenook/adult2.csv", seed=100):
    """Train and evaluate a multinomial logistic regression on an income CSV.

    The function reads the CSV at ``data_path`` (header row, inferred schema),
    registers it as the temp view ``adult``, builds a feature pipeline
    (index + one-hot encode ``workclass``, index ``income`` into ``label``,
    assemble ``workclass`` vector with ``age`` and ``hours_per_week`` into
    ``features``), splits the result 70/30, fits a ``LogisticRegression`` with
    ``family="multinomial"`` on the training part and scores the test part.

    Along the way it prints the pipeline stages, the transformed schema, the
    train/test row counts, the test rows, a sample of the predictions and the
    evaluation score.

    Args:
        data_path: Location of the input CSV. It must contain the columns
            ``workclass``, ``income``, ``age`` and ``hours_per_week``.
        seed: Seed passed to ``randomSplit`` so the split is reproducible.

    Returns:
        The name of the metric that was computed by the evaluator
        (the score itself is only printed).
    """
    # Imported locally so this function stays self-contained and does not
    # rely on a change to the notebook's import cell.
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    spark = (
        SparkSession.builder
        .appName("Logistic regression with multinomial")
        .getOrCreate()
    )

    # Load the CSV into a DataFrame and expose it as the temp view "adult".
    ad_data = (
        spark.read
        .option("inferSchema", "true")
        .option("header", "true")
        .csv(data_path)
    )
    ad_data.createOrReplaceTempView("adult")
    dataset = spark.table("adult")
    # Remember the original column names; they are re-selected after the
    # pipeline has added its intermediate columns.
    cols = dataset.columns

    # ---- Categorical features -------------------------------------------
    categoricalColumns = ["workclass"]
    stages = []
    for categoricalCol in categoricalColumns:
        # e.g. "workclass" -> numeric index column "workclassIndex"
        stringIndexer = StringIndexer(
            inputCol=categoricalCol, outputCol=categoricalCol + "Index"
        )
        # One-hot encode the index into a vector column "workclassclassVec"
        # (by default the encoder drops the last category, giving n-1 slots).
        encoder = OneHotEncoder(
            inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec"
        )
        stages += [stringIndexer, encoder]
    print(stages)

    # ---- Label ------------------------------------------------------------
    # StringIndexer maps each distinct income string to a numeric label
    # (0.0, 1.0, 2.0, ...), one per class.
    label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
    stages += [label_stringIdx]

    # ---- Feature vector ---------------------------------------------------
    numericCols = ["age", "hours_per_week"]
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]

    # fit() computes whatever statistics the stages need (e.g. the string
    # index mappings); transform() then adds the derived columns.
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(dataset)
    dataset = pipelineModel.transform(dataset)

    dataset.printSchema()

    # Keep label/features plus the original input columns only.
    selectedcols = ["label", "features"] + cols
    dataset = dataset.select(selectedcols)

    # Random ~70/30 split; the seed makes it reproducible.
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=seed)

    print(trainingData.count())
    print(testData.count())
    testData.show(truncate=False)

    # maxIter caps the number of optimisation iterations.
    lr = LogisticRegression(
        labelCol="label", featuresCol="features", maxIter=10, family="multinomial"
    )
    lrModel = lr.fit(trainingData)

    # The model reads only the "features" column when predicting.
    predictions = lrModel.transform(testData)

    # Show predicted class and per-class probabilities next to the true label.
    selected = predictions.select("label", "prediction", "probability", "income")
    selected.show(truncate=False)

    # BinaryClassificationEvaluator is only meaningful for two classes; with
    # three or more labels its score is not a valid metric, so switch to the
    # multiclass evaluator in that case. Two-class data keeps the original
    # binary evaluation.
    if lrModel.numClasses > 2:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction"
        )
    else:
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    print("evaluation")
    print(evaluator.evaluate(predictions))

    return evaluator.getMetricName()

# Metrics supported by BinaryClassificationEvaluator:
# areaUnderROC: area under the receiver operating characteristic (ROC) curve.
# areaUnderPR: area under the precision-recall curve (precision as a function of recall).

#lr.explainParams()


In [None]:
# Run the full load/train/evaluate workflow; the call returns the name of the
# evaluation metric, which the notebook shows as the cell output.
Multinomial_LogitRegression()