In [4]:
import pyspark
from pyspark.sql import SparkSession

import os

In [16]:
# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("Prepare Datasets")
    .getOrCreate()
)

# Datasets live in the `data` folder under the current working directory
BASE_PATH = os.getcwd()
DATA_PATH = os.path.join(BASE_PATH, 'data')

# Read the original dataset, stored in LIBSVM format
data = spark.read.load(os.path.join(DATA_PATH, 'data.txt'), format="libsvm")

In [35]:
def split_dataset_to_train_test(dataset: "pyspark.sql.DataFrame", path: str, training: float = 0.7,
                                is_json: bool = True, seed: "int | None" = None):
    """Randomly split ``dataset`` into training and test parts and save both under ``path``.

    Args:
        dataset: DataFrame to split.
        path: Directory under which the two output folders are written.
        training: Weight of the training part, between 0 and 1; the test part
            gets the weight ``1 - training``.
        is_json: Write the parts as JSON when True, as Parquet otherwise.
        seed: Optional seed forwarded to ``randomSplit`` so the split can be
            reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(trainingData, testData)`` holding the two split DataFrames.

    Raises:
        ValueError: If ``training`` is outside the range ``[0, 1]``.
    """
    if not 0 <= training <= 1:
        raise ValueError(f'training must be between 0 and 1, got {training!r}')

    # Split the DataFrame that was passed in, not the notebook-level `data` global
    trainingData, testData = dataset.randomSplit([training, 1 - training], seed=seed)

    # NOTE(review): the Parquet branch also writes into the `*_json` folders.
    # The names are kept as they were so existing readers keep working -
    # confirm whether dedicated `*_parquet` folders were intended.
    for part, folder in ((trainingData, 'training_json'), (testData, 'test_json')):
        writer = part.write.mode('overwrite')  # replace output left by earlier runs
        target = f'{path}/{folder}'
        if is_json:
            writer.json(target)
        else:
            writer.parquet(target)
    return trainingData, testData

# Split the loaded dataset 70/30 and save both parts as JSON
# (is_json=True is the function's default; it is spelled out here for clarity)
train, test = split_dataset_to_train_test(dataset=data,
                                          path=DATA_PATH,
                                          training=0.7,
                                          is_json=True)
# Report the folders the function actually writes to:
# <path>/training_json and <path>/test_json
print(f'Saved training dataset of {train.count()} samples to {DATA_PATH}/training_json\n'
      f'Saved test dataset of {test.count()} samples to {DATA_PATH}/test_json')

Saved training dataset of 70 to /Users/orz/projects/iguazio/model_example/data/training
Saved test dataset of 30 samples to /Users/orz/projects/iguazio/model_example/data/test
