In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .master("local") \
        .appName("SalesPrediction") \
        .getOrCreate()

In [8]:
import pyspark
sc = pyspark.SparkContext(appName="sales_prediction")

In [158]:
pyspark

<module 'pyspark' from '/usr/local/opt/apache-spark/libexec/python/pyspark/__init__.py'>

In [4]:
holiday_events = spark.read.csv("holidays_events.csv", header=True)
items = spark.read.csv("items.csv", header=True)
oil = spark.read.csv("oil.csv", header=True)
stores = spark.read.csv("stores.csv", header=True)
test = spark.read.csv("test.csv", header=True)
train = spark.read.csv("train.csv", header=True)
transactions = spark.read.csv("transactions.csv", header=True)

In [92]:
# holiday_events.createOrReplaceTempView("holiday_events_sql")
# items.createOrReplaceTempView("items_sql")
# oil.createOrReplaceTempView("oil_sql")
# stores.createOrReplaceTempView("stores_sql")
# #test.createOrReplaceTempView("test_sql")
# train.createOrReplaceTempView("train_sql")
# transactions.createOrReplaceTempView("transactions_sql")

In [5]:
train_without_target = train.select("id", 'date', 'store_nbr', 'item_nbr', 'onpromotion')
train_without_target.show()

+---+----------+---------+--------+-----------+
| id|      date|store_nbr|item_nbr|onpromotion|
+---+----------+---------+--------+-----------+
|  0|2013-01-01|       25|  103665|       null|
|  1|2013-01-01|       25|  105574|       null|
|  2|2013-01-01|       25|  105575|       null|
|  3|2013-01-01|       25|  108079|       null|
|  4|2013-01-01|       25|  108701|       null|
|  5|2013-01-01|       25|  108786|       null|
|  6|2013-01-01|       25|  108797|       null|
|  7|2013-01-01|       25|  108952|       null|
|  8|2013-01-01|       25|  111397|       null|
|  9|2013-01-01|       25|  114790|       null|
| 10|2013-01-01|       25|  114800|       null|
| 11|2013-01-01|       25|  115267|       null|
| 12|2013-01-01|       25|  115611|       null|
| 13|2013-01-01|       25|  115693|       null|
| 14|2013-01-01|       25|  115720|       null|
| 15|2013-01-01|       25|  115850|       null|
| 16|2013-01-01|       25|  115891|       null|
| 17|2013-01-01|       25|  115892|     

In [6]:
from pyspark.sql.functions import lit
train_without_target = train_without_target.withColumn('source', lit('train'))

In [7]:
train_without_target.show()

+---+----------+---------+--------+-----------+------+
| id|      date|store_nbr|item_nbr|onpromotion|source|
+---+----------+---------+--------+-----------+------+
|  0|2013-01-01|       25|  103665|       null| train|
|  1|2013-01-01|       25|  105574|       null| train|
|  2|2013-01-01|       25|  105575|       null| train|
|  3|2013-01-01|       25|  108079|       null| train|
|  4|2013-01-01|       25|  108701|       null| train|
|  5|2013-01-01|       25|  108786|       null| train|
|  6|2013-01-01|       25|  108797|       null| train|
|  7|2013-01-01|       25|  108952|       null| train|
|  8|2013-01-01|       25|  111397|       null| train|
|  9|2013-01-01|       25|  114790|       null| train|
| 10|2013-01-01|       25|  114800|       null| train|
| 11|2013-01-01|       25|  115267|       null| train|
| 12|2013-01-01|       25|  115611|       null| train|
| 13|2013-01-01|       25|  115693|       null| train|
| 14|2013-01-01|       25|  115720|       null| train|
| 15|2013-

In [8]:
test = test.withColumn('source', lit('test'))
test.createOrReplaceTempView("test_sql")

In [9]:
test.show()

+---------+----------+---------+--------+-----------+------+
|       id|      date|store_nbr|item_nbr|onpromotion|source|
+---------+----------+---------+--------+-----------+------+
|125497040|2017-08-16|        1|   96995|      False|  test|
|125497041|2017-08-16|        1|   99197|      False|  test|
|125497042|2017-08-16|        1|  103501|      False|  test|
|125497043|2017-08-16|        1|  103520|      False|  test|
|125497044|2017-08-16|        1|  103665|      False|  test|
|125497045|2017-08-16|        1|  105574|      False|  test|
|125497046|2017-08-16|        1|  105575|      False|  test|
|125497047|2017-08-16|        1|  105576|      False|  test|
|125497048|2017-08-16|        1|  105577|      False|  test|
|125497049|2017-08-16|        1|  105693|      False|  test|
|125497050|2017-08-16|        1|  105737|      False|  test|
|125497051|2017-08-16|        1|  105857|      False|  test|
|125497052|2017-08-16|        1|  106716|      False|  test|
|125497053|2017-08-16|  

In [10]:
from pyspark.sql import functions
train_test_set = train_without_target.union(test)

In [11]:
train_test_set.show()

+---+----------+---------+--------+-----------+------+
| id|      date|store_nbr|item_nbr|onpromotion|source|
+---+----------+---------+--------+-----------+------+
|  0|2013-01-01|       25|  103665|       null| train|
|  1|2013-01-01|       25|  105574|       null| train|
|  2|2013-01-01|       25|  105575|       null| train|
|  3|2013-01-01|       25|  108079|       null| train|
|  4|2013-01-01|       25|  108701|       null| train|
|  5|2013-01-01|       25|  108786|       null| train|
|  6|2013-01-01|       25|  108797|       null| train|
|  7|2013-01-01|       25|  108952|       null| train|
|  8|2013-01-01|       25|  111397|       null| train|
|  9|2013-01-01|       25|  114790|       null| train|
| 10|2013-01-01|       25|  114800|       null| train|
| 11|2013-01-01|       25|  115267|       null| train|
| 12|2013-01-01|       25|  115611|       null| train|
| 13|2013-01-01|       25|  115693|       null| train|
| 14|2013-01-01|       25|  115720|       null| train|
| 15|2013-

In [12]:
train_test_set_002 = train_test_set.join(holiday_events, ['date'], 'outer')

In [13]:
train_test_set_002.show()

+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
|      date|     id|store_nbr|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|
+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
|2013-03-14|2924449|        1|  103520|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924450|        1|  103665|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924451|        1|  105574|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924452|        1|  105575|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924453|        1|  105577|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924454|        1|  105737|       null| train|null|  null|       null|       null|       null|
|2013-03-14|2924455|        1|  10585

In [85]:
# train_test_set_002.where(train_test_set_002.locale.isNull()).count()

109301859

In [87]:
#train_test_set_002.createOrReplaceTempView("train_test_set_002_sql")

In [14]:
#spark.sql("select * from train_test_set_002_sql where locale is not null").show()

In [15]:
#train_test_set_002.join(items, ['item_nbr'], 'outer')

In [16]:
# train_test_set_003 = spark.sql("""
#         SELECT * FROM 
#         train_test_set_002_sql t
#         LEFT JOIN items_sql i ON t.item_nbr = i.item_nbr
# """)
train_test_set_003 = train_test_set_002.join(items, ['item_nbr'], 'outer')
#train_test_set_003.createOrReplaceTempView("train_test_set_003_sql")
train_test_set_003.show()

+--------+----------+--------+---------+-----------+------+----+------+-----------+-----------+-----------+------------+-----+----------+
|item_nbr|      date|      id|store_nbr|onpromotion|source|type|locale|locale_name|description|transferred|      family|class|perishable|
+--------+----------+--------+---------+-----------+------+----+------+-----------+-----------+-----------+------------+-----+----------+
| 1040170|2013-03-14| 2925286|        1|       null| train|null|  null|       null|       null|       null|FROZEN FOODS| 2222|         0|
| 1040170|2013-03-14| 2927457|        3|       null| train|null|  null|       null|       null|       null|FROZEN FOODS| 2222|         0|
| 1040170|2013-03-14| 2931493|        7|       null| train|null|  null|       null|       null|       null|FROZEN FOODS| 2222|         0|
| 1040170|2013-03-14| 2932591|        8|       null| train|null|  null|       null|       null|       null|FROZEN FOODS| 2222|         0|
| 1040170|2013-03-14| 2941363|    

In [17]:
# train_test_set_004 = spark.sql("""
#         SELECT * FROM
#         train_test_set_003_sql t
#         LEFT JOIN stores_sql s ON t.store_nbr = s.store_nbr
# """)
train_test_set_004 = train_test_set_003.join(stores, ['store_nbr'], 'outer')
train_test_set_004.show()

Py4JJavaError: An error occurred while calling o86.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 112 in stage 23.0 failed 1 times, most recent failure: Lost task 112.0 in stage 23.0 (TID 648, localhost, executor driver): java.io.FileNotFoundException: /private/var/folders/hh/jzkw5p4s4hn28ktwmn6c4vgc0000gn/T/blockmgr-266508db-3aa4-42aa-95e1-15ae8a15902e/33/shuffle_10_112_0.index.4255a36c-0190-4707-ae1f-a80e90812b44 (No space left on device)
	at java.io.FileOutputStream.open0(Native Method)
	at java.io.FileOutputStream.open(FileOutputStream.java:270)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:162)
	at org.apache.spark.shuffle.IndexShuffleBlockResolver.writeIndexFileAndCommit(IndexShuffleBlockResolver.scala:144)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:164)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:336)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2853)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2366)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:245)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: /private/var/folders/hh/jzkw5p4s4hn28ktwmn6c4vgc0000gn/T/blockmgr-266508db-3aa4-42aa-95e1-15ae8a15902e/33/shuffle_10_112_0.index.4255a36c-0190-4707-ae1f-a80e90812b44 (No space left on device)
	at java.io.FileOutputStream.open0(Native Method)
	at java.io.FileOutputStream.open(FileOutputStream.java:270)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:162)
	at org.apache.spark.shuffle.IndexShuffleBlockResolver.writeIndexFileAndCommit(IndexShuffleBlockResolver.scala:144)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:164)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [140]:
#train_test_set_004.createOrReplaceTempView("train_test_set_004_sql")

## Deal with NaN values in oil table (pandas), then create a spark dataframe from pandas with it.

In [125]:
oil_pandas = pd.read_csv("oil.csv")
oil_pandas['dcoilwtico'] = oil_pandas['dcoilwtico'].fillna(method="ffill")
oil_pandas['dcoilwtico'] = oil_pandas['dcoilwtico'].fillna(method="bfill")
oil_pandas.isnull().sum()

date          0
dcoilwtico    0
dtype: int64

In [127]:
#oil_pandas = pd.DataFrame(oil_pandas)

In [154]:
oil_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
date          1218 non-null object
dcoilwtico    1218 non-null float64
dtypes: float64(1), object(1)
memory usage: 19.1+ KB


In [135]:
sqlCtx = SQLContext(sc)
oil = sqlCtx.createDataFrame(oil_pandas)

In [137]:
oil.createOrReplaceTempView("oil_sql")

In [142]:
# train_test_set_005 = spark.sql("""
#         SELECT * FROM
#         train_test_set_004_sql t
#         LEFT JOIN oil_sql o ON t.date = o.date
# """)
train_test_set_005 = train_test_set_004.join(oil, ['date'], 'outer')
train_test_set_005.show()
#train_test_set_005.createOrReplaceTempView("train_test_set_005_sql")

## Join transaction table

In [146]:
train_test_set_005.show()

+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+--------+------------+-----+----------+---------+-----+---------+----+-------+----------+----------+
|      date|     id|store_nbr|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|item_nbr|      family|class|perishable|store_nbr| city|    state|type|cluster|      date|dcoilwtico|
+----------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+--------+------------+-----+----------+---------+-----+---------+----+-------+----------+----------+
|2013-03-14|2924449|        1|  103520|       null| train|null|  null|       null|       null|       null|  103520|   GROCERY I| 1028|         0|        1|Quito|Pichincha|   D|     13|2013-03-14|     93.03|
|2013-03-14|2924450|        1|  103665|       null| train|null|  null|       null|       null|       null|  103665|BREAD/BAKERY| 2712|         1|        1|Quito|Pichincha| 

In [144]:
spark.sql("""
        SELECT * FROM
        train_test_set_005_sql t
        LEFT JOIN transactions_sql r ON t.date = r.date AND t.store_nbr = r.store_nbr
""").show()

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



AnalysisException: "Reference 't.date' is ambiguous, could be: date#651, date#1059.; line 4 pos 40"