In [1]:
from load_susy_into_df import susy_csv_to_df

In [2]:
import os

# Location of the SUSY CSV file. Set the SUSY_CSV_PATH environment variable to
# point at a different copy of the file; the original local path remains the
# fallback so existing setups keep working without any change.
path_to_csv = os.environ.get('SUSY_CSV_PATH', '/home/varsrao/Downloads/SUSY.csv')
# Chunking parameters handed to susy_csv_to_df by the loading cell.
# Presumably `chunksize` is rows per chunk and the two counts are how many
# chunks go into the train/test frames -- confirm against load_susy_into_df.
chunksize = 100000
num_train_chunks = 5
num_test_chunks = 3

In [3]:
print('----Loading Dataset----')
# Read the CSV through the project helper, then unpack its five results into
# the notebook-level names that the following cells rely on.
susy_load_result = susy_csv_to_df(
    path_to_csv,
    chunksize,
    num_train_chunks,
    num_test_chunks,
)
(
    ds_train_pd_df,
    ds_test_pd_df,  # NOTE(review): not referenced by any other visible cell
    target_col_name,
    target_col_idx,
    feature_col_names,
) = susy_load_result
# Column-name list used as the Spark schema: target name first, then features.
col_names = [target_col_name] + feature_col_names

----Loading Dataset----


In [4]:
print('----Creating Spark Context----')
# Import locally so this cell does not depend on names injected by the
# pyspark shell start-up script.
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the already-running SparkContext when there is one (e.g. a kernel
# launched through the pyspark shell); otherwise create one, so the notebook
# also runs from a plain kernel after Restart & Run All.
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)
# Convert the pandas training frame into a Spark DataFrame, naming the columns
# from `col_names`.
ds_spark_df = sqlCtx.createDataFrame(ds_train_pd_df, schema=col_names)

----Creating Spark Context----


In [5]:
print('----Assembling Data----')
from pyspark.ml.feature import VectorAssembler

# Fixed seed so the 80/20 split -- and every metric computed from it -- is the
# same on each run of the notebook.
split_seed = 42

# Every column except the one at `target_col_idx` is packed into a single
# vector column named "features".
vecassembler = VectorAssembler(
        inputCols=ds_spark_df.columns[:target_col_idx]+ds_spark_df.columns[target_col_idx+1:],
        outputCol="features")
# Assemble the vector column and rename the target column to "label" in one
# chain, so `features_vec` is assigned once rather than overwritten.
features_vec = (
    vecassembler.transform(ds_spark_df)
    .withColumnRenamed(target_col_name, "label")
)
# Keep only the two columns the classifier consumes.
features_data = features_vec.select("label", "features")
# Weighted random split of the assembled data (weights 0.8 / 0.2).
feat_train, feat_test = features_data.randomSplit([0.8, 0.2], seed=split_seed)

----Assembling Data----


In [6]:
print('----Training Model----')
from pyspark.ml.classification import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np

# Fit a logistic regression on the training split (at most 100 iterations).
lrm = LogisticRegression(labelCol="label", featuresCol="features", maxIter=100).fit(feat_train)

# The fitted model's summary carries the training-set ROC points and metrics.
trainingSummary = lrm.summary
roc = trainingSummary.roc.toPandas()

# FPR goes on the x axis and TPR on the y axis; the axis labels now match the
# plotted data (they were previously swapped).
fig, ax = plt.subplots()
ax.plot(roc['FPR'], roc['TPR'])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))
# Separator added so the value is not glued onto the label text.
print('Training Accuracy: ' + str(trainingSummary.accuracy))

----Training Model----


<Figure size 640x480 with 1 Axes>

Training set areaUnderROC: 0.8587467199154661
Training Accuracy0.7896062110932364


In [7]:
print('----Testing Model----')
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted model.
predictions = lrm.transform(feat_test)
# The evaluator's default columns and metric are written out explicitly so the
# reader can see what is being measured.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
test_area_under_roc = evaluator.evaluate(predictions)
print('Test Area Under ROC', test_area_under_roc)

print('----End----')

----Testing Model----
Test Area Under ROC 0.8585428263533798
----End----
