In [1]:
from load_susy_into_df import susy_csv_to_df

In [2]:
path_to_csv = '/home/varsrao/Downloads/SUSY.csv'
chunksize = 100000
num_train_chunks = 5
num_test_chunks = 3

In [3]:
print('----Loading Dataset----')
ds_train_pd_df, ds_test_pd_df, target_col_name, target_col_idx, feature_col_names = susy_csv_to_df(
        path_to_csv, chunksize, num_train_chunks, num_test_chunks)
col_names = [target_col_name]+feature_col_names

----Loading Dataset----


In [4]:
print('----Creating Spark Context----')
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
ds_spark_df = sqlCtx.createDataFrame(ds_train_pd_df, schema=col_names)

----Creating Spark Context----


In [5]:
print('----Assembling Data----')
from pyspark.ml.feature import VectorAssembler
vecassembler = VectorAssembler(
        inputCols=ds_spark_df.columns[:target_col_idx]+ds_spark_df.columns[target_col_idx+1:],
        outputCol="features")
features_vec = vecassembler.transform(ds_spark_df)

features_vec = features_vec.withColumnRenamed(target_col_name, "label")
features_data = features_vec.select("label", "features")
feat_train, feat_test = features_data.randomSplit([0.8, 0.2])

----Assembling Data----


In [6]:
print('----Training Model----')
from pyspark.ml.feature import PCA
pca = PCA(k=10, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(feat_train)

----Training Model----


In [8]:
print('----Testing Model----')
pca_model.transform(feat_test).collect()[0].pca_features
print(pca_model.explainedVariance)


print('----End----')

----Testing Model----
[0.2143888254758566,0.14685310500965998,0.12659308246789863,0.11748130061622955,0.09897783738723055,0.09276907508343527,0.059203042294301435,0.0534092724470162,0.031717478230009435,0.019616299925438483]
----End----
