In [1]:
from load_susy_into_df import susy_csv_to_df

In [2]:
# Dataset location and chunking configuration used by the loading cell.
import os

# Path to the SUSY CSV. Set the SUSY_CSV_PATH environment variable to point the
# notebook at a copy of the file elsewhere; when it is unset (or empty) the
# original author's location is used.
path_to_csv = os.environ.get('SUSY_CSV_PATH') or '/home/varsrao/Downloads/SUSY.csv'
# chunksize and the two chunk counts are passed straight to susy_csv_to_df.
# Presumably chunksize is the number of CSV rows per chunk -- confirm in the loader.
chunksize = 100000
num_train_chunks = 4
num_test_chunks = 3

In [3]:
print('----Loading Dataset----')
import pandas as pd

# susy_csv_to_df returns the train and test pandas frames together with the
# target column's name and index and the list of feature column names.
(
    ds_train_pd_df,
    ds_test_pd_df,
    target_col_name,
    target_col_idx,
    feature_col_names,
) = susy_csv_to_df(path_to_csv, chunksize, num_train_chunks, num_test_chunks)

# Schema names for the Spark frame: target name first, then the features.
# NOTE(review): this assumes the target is the first column of the pandas
# frames -- confirm against susy_csv_to_df.
col_names = [target_col_name] + feature_col_names

# Pool both splits into one frame; the notebook re-splits the data later
# using train_frac.
ds_merged_pd_df = pd.concat([ds_train_pd_df, ds_test_pd_df])

# Share of chunks that were requested for training, as a float.
train_frac = float(num_train_chunks) / (num_test_chunks + num_train_chunks)

----Loading Dataset----


In [4]:
print('----Creating Spark Context----')
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Reuse the active SparkContext if one exists (e.g. the one a PySpark shell or
# kernel provides), otherwise start one. This keeps the cell from failing with
# a NameError when no `sc` variable has been injected into the namespace.
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

----Creating Spark Context----


In [5]:
print('----Creating Spark DataFrame----')
# Turn the pooled pandas frame into a Spark DataFrame, naming its columns
# from col_names.
ds_spark_df = sqlCtx.createDataFrame(
    data=ds_merged_pd_df,
    schema=col_names,
)

----Creating Spark DataFrame----


In [6]:
print('----Assembling Data----')
from pyspark.ml.feature import VectorAssembler

# Fixed seed so the random train/test split is repeatable for the same input.
SPLIT_SEED = 42

# Every column except the target is a feature. The target is identified by
# name here, exactly as in the rename below, so the label column can never end
# up inside the feature vector even if target_col_idx does not match the
# target's position in the Spark frame.
feature_cols = [name for name in ds_spark_df.columns if name != target_col_name]
vecassembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Add the assembled "features" vector column, rename the target to "label",
# and keep only those two columns.
features_vec = vecassembler.transform(ds_spark_df)
features_vec = features_vec.withColumnRenamed(target_col_name, "label")
features_data = features_vec.select("label", "features")

# Split in the same proportion as the requested train/test chunk counts.
feat_train, feat_test = features_data.randomSplit(
        [train_frac, 1 - train_frac], seed=SPLIT_SEED)

----Assembling Data----


In [7]:
print('----Training Model----')
from pyspark.ml.clustering import KMeans

# Two clusters over the assembled "features" column. The explicit seed makes
# centroid initialisation -- and therefore the fitted model and the metric
# reported afterwards -- repeatable across runs. maxIter is only an upper
# bound on the number of iterations.
kmeans = KMeans(featuresCol="features", k=2, maxIter=1000000, seed=42)
kmeans_model = kmeans.fit(feat_train)

----Training Model----


In [8]:
print('----Testing Model----')
from pyspark.ml.evaluation import ClusteringEvaluator

# Assign each held-out row to a cluster, then score the assignment with the
# evaluator's default settings.
predictions = kmeans_model.transform(feat_test)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Kept for interactive inspection: the fitted cluster centres and the feature
# vector of one training row. first() fetches a single row instead of
# collecting the entire training set to the driver just to read element 0.
centers = kmeans_model.clusterCenters()
feat = feat_train.first().features

print('----End----')

----Testing Model----
Silhouette with squared euclidean distance = 0.29049559584989
----End----
