In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SalesPrediction1026").getOrCreate()

In [2]:
from pyspark.sql.functions import lit
from pyspark.sql import functions

In [3]:
#from pyspark.sql.functions import broadcast

In [4]:
holidays_events = spark.read.csv("holidays_events.csv", header=True)
items = spark.read.csv("items.csv", header=True)
oil = spark.read.csv("oil.csv", header=True)
stores = spark.read.csv("stores.csv", header=True)
test = spark.read.csv("test.csv", header=True)
train = spark.read.csv("train.csv", header=True)
train_sample = train.sample(False, 0.05, 1)
transactions = spark.read.csv("transactions.csv", header=True)

In [5]:
#Check the number of partitions:
# train_rdd = train.rdd
# train_rdd.getNumPartitions()
train = train_sample.repartition(800)
# test = test.repartition(3000)
# train_rdd = train.rdd
# train_rdd.getNumPartitions()

In [6]:
# stores_rdd = stores.rdd
# stores_rdd.getNumPartitions()

In [7]:
#train = train.coalesce(20)
# test = test.coalesce(10)

In [8]:
#deal with oil dataframe in pandas with backfill, then transform it to spark dataframe
oil_pandas = pd.read_csv("oil.csv")

In [9]:
#spark.sql("""SET spark.sql.autoBroadcastJoinThreshold = -1""")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [10]:
#combine test set and train set together
#add 'source' column to both train and test set so we can separate them after data cleaning
train = train_sample.withColumn('source', lit('train'))
test = test.withColumn('source', lit('test'))
train_without_target = train.select("id", 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'source')
#use .union() to add them together
train_test_set = train_without_target.union(test)
train_test_set.show()

+---+----------+---------+--------+-----------+------+
| id|      date|store_nbr|item_nbr|onpromotion|source|
+---+----------+---------+--------+-----------+------+
|  2|2013-01-01|       25|  105575|       null| train|
| 15|2013-01-01|       25|  115850|       null| train|
| 27|2013-01-01|       25|  153267|       null| train|
| 34|2013-01-01|       25|  158956|       null| train|
| 44|2013-01-01|       25|  165705|       null| train|
| 57|2013-01-01|       25|  208498|       null| train|
| 68|2013-01-01|       25|  219150|       null| train|
|107|2013-01-01|       25|  305344|       null| train|
|111|2013-01-01|       25|  310644|       null| train|
|195|2013-01-01|       25|  457688|       null| train|
|228|2013-01-01|       25|  518091|       null| train|
|232|2013-01-01|       25|  522721|       null| train|
|250|2013-01-01|       25|  567623|       null| train|
|261|2013-01-01|       25|  582863|       null| train|
|302|2013-01-01|       25|  660503|       null| train|
|360|2013-

In [11]:
train_test_set.rdd.getNumPartitions()

42

## Join train and holiday_events dataframe

In [12]:
train_holiday = train_test_set.join(holidays_events, 'date', 'left_outer')
train_holiday.show(2)

+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
|      date|     id|store_nbr|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|
+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
|2013-03-14|2924518|        1|  168927|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924548|        1|  213653|       null| train|null|  null|       null|       null|       null|
+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
only showing top 2 rows



## Clean Oil Dataframe

In [13]:
oil_pandas = oil_pandas.fillna(method='bfill')
oil_pandas = oil_pandas.fillna(method='ffill')
oil_pandas.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [14]:
oil_pandas.isnull().sum()

date          0
dcoilwtico    0
dtype: int64

## Transform pandas dataframe back to spark dataframe

In [15]:
oil_spark = spark.createDataFrame(oil_pandas)
oil_spark.show()

+----------+----------+
|      date|dcoilwtico|
+----------+----------+
|2013-01-01|     93.14|
|2013-01-02|     93.14|
|2013-01-03|     92.97|
|2013-01-04|     93.12|
|2013-01-07|      93.2|
|2013-01-08|     93.21|
|2013-01-09|     93.08|
|2013-01-10|     93.81|
|2013-01-11|      93.6|
|2013-01-14|     94.27|
|2013-01-15|     93.26|
|2013-01-16|     94.28|
|2013-01-17|     95.49|
|2013-01-18|     95.61|
|2013-01-21|     96.09|
|2013-01-22|     96.09|
|2013-01-23|     95.06|
|2013-01-24|     95.35|
|2013-01-25|     95.15|
|2013-01-28|     95.95|
+----------+----------+
only showing top 20 rows



In [16]:
train_holiday_oil = train_holiday.join(oil_spark, 'date', 'left_outer')
train_holiday_oil.show()

+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+
|      date|     id|store_nbr|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|dcoilwtico|
+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+
|2013-03-14|2924518|        1|  168927|       null| train|null|  null|       null|       null|       null|     93.03|
|2013-03-14|2924548|        1|  213653|       null| train|null|  null|       null|       null|       null|     93.03|
|2013-03-14|2924588|        1|  260669|       null| train|null|  null|       null|       null|       null|     93.03|
|2013-03-14|2924654|        1|  315178|       null| train|null|  null|       null|       null|       null|     93.03|
|2013-03-14|2924662|        1|  315463|       null| train|null|  null|       null|       null|       null|     93.03|
|2013-03-14|2924699|        1|  360705|       null| trai

## Join train_holiday_oil with store

### Rename store.type so it would not duplicate with holiday.type

In [17]:
stores = stores.withColumnRenamed("type", "store_type")
train_holiday_oil_store = train_holiday_oil.join(stores, 'store_nbr', 'left_outer')
train_holiday_oil_store.show()

+---------+----------+-------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+---------+------+----------+-------+
|store_nbr|      date|     id|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|dcoilwtico|     city| state|store_type|cluster|
+---------+----------+-------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+---------+------+----------+-------+
|       51|2013-03-14|2962533|  213066|       null| train|null|  null|       null|       null|       null|     93.03|Guayaquil|Guayas|         A|     17|
|       51|2013-03-14|2962562|  229368|       null| train|null|  null|       null|       null|       null|     93.03|Guayaquil|Guayas|         A|     17|
|       51|2013-03-14|2962566|  252970|       null| train|null|  null|       null|       null|       null|     93.03|Guayaquil|Guayas|         A|     17|
|       51|2013-03-14|2962601|  268676|       null| train|null|  null|      

## Join train_holiday_oil_store and transaction

In [18]:
#try to avoid the duplicated column name:
train_holiday_oil_store_transaction = train_holiday_oil_store.join(transactions, ['date', 'store_nbr'], 'left_outer')
train_holiday_oil_store_transaction.show()

+----------+---------+------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+---------+------+----------+-------+------------+
|      date|store_nbr|    id|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|dcoilwtico|     city| state|store_type|cluster|transactions|
+----------+---------+------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+---------+------+----------+-------+------------+
|2013-01-14|       51|523965|  105737|       null| train|null|  null|       null|       null|       null|     94.27|Guayaquil|Guayas|         A|     17|        1491|
|2013-01-14|       51|523979|  115693|       null| train|null|  null|       null|       null|       null|     94.27|Guayaquil|Guayas|         A|     17|        1491|
|2013-01-14|       51|523998|  123927|       null| train|null|  null|       null|       null|       null|     94.27|Guayaquil|Guayas|         A|     17|        1491|
|201

## Join items table

In [19]:
train_holiday_oil_store_transaction_item = train_holiday_oil_store_transaction.join(items, 'item_nbr', 'left_outer')
#train_holiday_oil_store_transaction_item.show()

## Write to CSV file

In [20]:
#train_holiday_oil_store_transaction_item.write.csv('train_without_clean.csv')

In [21]:
#train_holiday_oil_store_transaction_item.show(5)

In [22]:
#Count distinct values for each column
from pyspark.sql.functions import col, countDistinct
#train_holiday_oil_store_transaction_item.agg(countDistinct(col("locale")).alias("count_locale")).show()

In [23]:
#check number of null values in columns
from pyspark.sql.functions import isnan, when, count, col
#train_holiday_oil_store_transaction_item.where(col('locale').isNull()).count()

In [24]:
# null_count = [[col_name, train_holiday_oil_store_transaction_item.where(col(col_name).isNull()).count()] for col_name in train_holiday_oil_store_transaction.columns]
# print(null_count)

## Begin data cleaning

In [25]:
#Created a new dataframe called test, so I would not damage the previous one. 
train_holiday_oil_store_transaction_item_test = train_holiday_oil_store_transaction_item.fillna('False', subset=['onpromotion'])
#train_holiday_oil_store_transaction_item_test.show()

In [26]:
#train_holiday_oil_store_transaction_item_test.dtypes

### create a table for on_promotion, encoded

In [27]:
from pyspark.sql import functions as F
onpromotions = train_holiday_oil_store_transaction_item_test.select("onpromotion").distinct().rdd.flatMap(lambda x: x).collect()

exprs = [F.when(F.col("onpromotion") == onpromotion, 1).otherwise(0).alias('onpromotion_' + onpromotion)
         for onpromotion in onpromotions]

#train_holiday_oil_store_transaction_item_test.select("item_nbr", *exprs).show()

In [28]:
#item table with onpromotion encoded to 1 and 0
item_onpromotion = train_holiday_oil_store_transaction_item_test.select("item_nbr", *exprs)

### Encode item_family and create a new table

In [29]:
families = train_holiday_oil_store_transaction_item_test.select("family").distinct().rdd.flatMap(lambda x: x).collect()

exprs = [F.when(F.col("family") == family, 1).otherwise(0).alias('family' + family)
         for family in families]

train_holiday_oil_store_transaction_item_test.select("item_nbr", *exprs).show()
item_families = train_holiday_oil_store_transaction_item_test.select("item_nbr", *exprs)

+--------+--------------------+-------------------------+----------------+---------------------+---------------+---------------+-------------+----------------+---------------+---------------+------------------+-----------+--------------+-----------------+----------------+-----------+---------------+-------------+----------------------+------------------------+----------+-----------------------------+-------------+------------------+--------------------------------+-------------------+--------------+--------------+------------------+------------+----------+---------------------+-----------+
|item_nbr|familyPREPARED FOODS|familyHOME AND KITCHEN II|familyLADIESWEAR|familyLAWN AND GARDEN|familyGROCERY I|familyBABY CARE|familyPRODUCE|familyAUTOMOTIVE|familyBEVERAGES|familyHOME CARE|familyBREAD/BAKERY|familyBOOKS|familyLINGERIE|familyCELEBRATION|familyGROCERY II|familyDAIRY|familyMAGAZINES|familySEAFOOD|familyLIQUOR,WINE,BEER|familyHOME AND KITCHEN I|familyDELI|familyPLAYERS AND ELECTRONICS|

In [30]:
cities = train_holiday_oil_store_transaction_item_test.select("city").distinct().rdd.flatMap(lambda x: x).collect()
store_types = train_holiday_oil_store_transaction_item_test.select("store_type").distinct().rdd.flatMap(lambda x: x).collect()
clusters = train_holiday_oil_store_transaction_item_test.select("cluster").distinct().rdd.flatMap(lambda x: x).collect()
cities_expr = [F.when(F.col("city") == city, 1).otherwise(0).alias("city_" + city) for city in cities]
#store_types_expr = [F.when(F.col("store_type") == tp, 1).otherwise(0).alias("store_type_" + tp) for tp in store_types]
clusters_expr = [F.when(F.col("cluster") == cluster, 1).otherwise(0).alias("cluster_" + cluster) for cluster in clusters]
stores_df = train_holiday_oil_store_transaction_item_test.select("store_nbr", *cities_expr+clusters_expr)
stores_df.show()

+---------+------------+-----------+-------------+------------------+-----------+---------+----------+----------+--------------+--------------+---------+-----------+--------------+-----------+------------+----------+------------+------------+-------------+-------------+-------------+---------------+---------+----------+----------+---------+---------+----------+---------+----------+---------+---------+---------+----------+---------+----------+----------+----------+---------+
|store_nbr|city_Quevedo|city_Cuenca|city_Guaranda|city_Santo Domingo|city_Playas|city_Puyo|city_Quito|city_Manta|city_Latacunga|city_Guayaquil|city_Loja|city_Ibarra|city_El Carmen|city_Ambato|city_Machala|city_Daule|city_Cayambe|city_Salinas|city_Libertad|city_Babahoyo|city_Riobamba|city_Esmeraldas|cluster_7|cluster_15|cluster_11|cluster_3|cluster_8|cluster_16|cluster_5|cluster_17|cluster_6|cluster_9|cluster_1|cluster_10|cluster_4|cluster_12|cluster_13|cluster_14|cluster_2|
+---------+------------+-----------+------

### Deal with holiday_event dataframe:

In [31]:
#fill NaN value in transferred column:
train_holiday_oil_store_transaction_item_test_002 = train_holiday_oil_store_transaction_item_test.fillna('False', subset=['transferred', 'type', 'locale', 'locale_name', 'description'])
#train_holiday_oil_store_transaction_item_test_002.show()

In [32]:
#encode holiday_event and create a new dataframe for it
train_holiday_oil_store_transaction_item_test_003 = train_holiday_oil_store_transaction_item_test_002.filter(train_holiday_oil_store_transaction_item_test_002['transferred'] == 'False')
#train_holiday_oil_store_transaction_item_test_003.show()

In [33]:
#train_holiday_oil_store_transaction_item_test_003[train_holiday_oil_store_transaction_item_test_003['transferred'] == 'True'].show()

In [34]:
types = train_holiday_oil_store_transaction_item_test_003.select("type").distinct().rdd.flatMap(lambda x: x).collect()
locale_names = train_holiday_oil_store_transaction_item_test_003.select("locale_name").distinct().rdd.flatMap(lambda x: x).collect()
types_expr = [F.when(F.col("type") == ty, 1).otherwise(0).alias("type_" + ty) for ty in types]
locale_names_expr = [F.when(F.col("locale_name") == locale_name, 1).otherwise(0).alias("locale_name_" + locale_name) for locale_name in locale_names]
holiday_events_df = train_holiday_oil_store_transaction_item_test_003.select("date", *types_expr+locale_names_expr)
holiday_events_df.show()

+----------+----------+----------+------------+-------------+-----------+---------------+-------------+-------------------+------------------+--------------------+--------------------+-------------------------+-----------------+----------------+-----------------+-------------------+-----------------+---------------------+---------------------+----------------+-----------------------+--------------------+------------------+---------------------+------------------+-------------------+-------------------+-------------------+--------------------+------------------------------------------+--------------------+----------------------+
|      date|type_False|type_Event|type_Holiday|type_Transfer|type_Bridge|type_Additional|type_Work Day|locale_name_Quevedo|locale_name_Cuenca|locale_name_Cotopaxi|locale_name_Guaranda|locale_name_Santo Domingo|locale_name_False|locale_name_Puyo|locale_name_Quito|locale_name_Ecuador|locale_name_Manta|locale_name_Latacunga|locale_name_Guayaquil|locale_name_Loja|lo

## Combine all dataframes together, then drop original columns

In [35]:
#combine train_holiday_oil_store_transaction_item_test_003 with item
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_003.join(item_families, 'item_nbr', 'left_outer')
#train_holiday_oil_store_transaction_item_test_004.show()

In [36]:
#train_holiday_oil_store_transaction_item_test_004.columns

In [37]:
#drop original columns
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.drop('family', 'class')
train_holiday_oil_store_transaction_item_test_004.show()

+--------+----------+---------+-------+-----------+------+-----+------+-----------+-----------+-----------+----------+---------+------+----------+-------+------------+----------+--------------------+-------------------------+----------------+---------------------+---------------+---------------+-------------+----------------+---------------+---------------+------------------+-----------+--------------+-----------------+----------------+-----------+---------------+-------------+----------------------+------------------------+----------+-----------------------------+-------------+------------------+--------------------------------+-------------------+--------------+--------------+------------------+------------+----------+---------------------+-----------+
|item_nbr|      date|store_nbr|     id|onpromotion|source| type|locale|locale_name|description|transferred|dcoilwtico|     city| state|store_type|cluster|transactions|perishable|familyPREPARED FOODS|familyHOME AND KITCHEN II|familyLADI

In [38]:
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.join(item_onpromotion, 'item_nbr', 'left_outer')
#train_holiday_oil_store_transaction_item_test_004.show()

In [39]:
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.drop('onpromotion')
#train_holiday_oil_store_transaction_item_test_004.show()

In [40]:
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.join(stores_df, 'store_nbr', 'left_outer')
#train_holiday_oil_store_transaction_item_test_004.show()
#trying to join on columns:
# train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.repartition("store_nbr")
# stores_df = stores_df.repartition("store_nbr")
# train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.join(stores_df, 'store_nbr', 'left_outer')

In [41]:
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.drop('city', 'state', 'store_type', 'cluster')
#train_holiday_oil_store_transaction_item_test_004.show()

In [42]:
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.join(holiday_events_df, 'date', 'left_outer')
#train_holiday_oil_store_transaction_item_test_004.show()

In [43]:
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.drop('type', 'locale', 'locale_name', 'description', 'transferred')
train_holiday_oil_store_transaction_item_test_004.show()

Py4JJavaError: An error occurred while calling o788.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 187.0 failed 1 times, most recent failure: Lost task 2.0 in stage 187.0 (TID 13757, localhost, executor driver): java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:205)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:158)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:554)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.writeValue(UnsafeRowSerializer.scala:69)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:239)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:336)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2853)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2366)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:245)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: No space left on device
	at java.io.FileOutputStream.writeBytes(Native Method)
	at java.io.FileOutputStream.write(FileOutputStream.java:326)
	at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:205)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:158)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at java.io.DataOutputStream.write(DataOutputStream.java:107)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:554)
	at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$1.writeValue(UnsafeRowSerializer.scala:69)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:239)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:151)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
#train_holiday_oil_store_transaction_item_test_004.rdd.partitions.size

In [None]:
stores_df.rdd.getNumPartitions()