In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SalesPrediction1026").getOrCreate()

In [2]:
from pyspark.sql.functions import lit
from pyspark.sql import functions

In [3]:
#from pyspark.sql.functions import broadcast

In [3]:
holidays_events = spark.read.csv("holidays_events.csv", header=True, inferSchema=True)
items = spark.read.csv("items.csv", header=True, inferSchema=True)
oil = spark.read.csv("oil.csv", header=True, inferSchema=True)
stores = spark.read.csv("stores.csv", header=True, inferSchema=True)
test = spark.read.csv("test.csv", header=True, inferSchema=True)
train = spark.read.csv("train.csv", header=True, inferSchema=True)
#train_sample = train.sample(False, 0.05, 1)
transactions = spark.read.csv("transactions.csv", header=True, inferSchema=True)

In [4]:
#deal with oil dataframe in pandas with backfill, then transform it to spark dataframe
oil_pandas = pd.read_csv("oil.csv")

In [5]:
#spark.sql("""SET spark.sql.autoBroadcastJoinThreshold = -1""")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [6]:
#combine test set and train set together
#add 'source' column to both train and test set so we can separate them after data cleaning
train = train.withColumn('source', lit('train'))
test = test.withColumn('source', lit('test'))
train_without_target = train.select("id", 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'source')
#use .union() to add them together
train_test_set = train_without_target.union(test)
train_test_set.show()

+---+-------------------+---------+--------+-----------+------+
| id|               date|store_nbr|item_nbr|onpromotion|source|
+---+-------------------+---------+--------+-----------+------+
|  0|2013-01-01 00:00:00|       25|  103665|       null| train|
|  1|2013-01-01 00:00:00|       25|  105574|       null| train|
|  2|2013-01-01 00:00:00|       25|  105575|       null| train|
|  3|2013-01-01 00:00:00|       25|  108079|       null| train|
|  4|2013-01-01 00:00:00|       25|  108701|       null| train|
|  5|2013-01-01 00:00:00|       25|  108786|       null| train|
|  6|2013-01-01 00:00:00|       25|  108797|       null| train|
|  7|2013-01-01 00:00:00|       25|  108952|       null| train|
|  8|2013-01-01 00:00:00|       25|  111397|       null| train|
|  9|2013-01-01 00:00:00|       25|  114790|       null| train|
| 10|2013-01-01 00:00:00|       25|  114800|       null| train|
| 11|2013-01-01 00:00:00|       25|  115267|       null| train|
| 12|2013-01-01 00:00:00|       25|  115

## Join train and holiday_events dataframe

In [7]:
train_holiday = train_test_set.join(holidays_events, 'date', 'left_outer')
train_holiday.show(2)

+-------------------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
|               date|     id|store_nbr|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|
+-------------------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
|2013-05-06 00:00:00|5202345|        1|  103520|       null| train|null|  null|       null|       null|       null|
|2013-05-06 00:00:00|5202346|        1|  105574|       null| train|null|  null|       null|       null|       null|
+-------------------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+
only showing top 2 rows



In [8]:
train_test_set.unpersist()

DataFrame[id: int, date: timestamp, store_nbr: int, item_nbr: int, onpromotion: boolean, source: string]

## Clean Oil Dataframe

In [9]:
oil_pandas = oil_pandas.fillna(method='bfill')
oil_pandas = oil_pandas.fillna(method='ffill')
oil_pandas.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [10]:
oil_pandas.isnull().sum()

date          0
dcoilwtico    0
dtype: int64

## Transform pandas dataframe back to spark dataframe

In [11]:
oil_spark = spark.createDataFrame(oil_pandas)
oil_spark.show()

+----------+----------+
|      date|dcoilwtico|
+----------+----------+
|2013-01-01|     93.14|
|2013-01-02|     93.14|
|2013-01-03|     92.97|
|2013-01-04|     93.12|
|2013-01-07|      93.2|
|2013-01-08|     93.21|
|2013-01-09|     93.08|
|2013-01-10|     93.81|
|2013-01-11|      93.6|
|2013-01-14|     94.27|
|2013-01-15|     93.26|
|2013-01-16|     94.28|
|2013-01-17|     95.49|
|2013-01-18|     95.61|
|2013-01-21|     96.09|
|2013-01-22|     96.09|
|2013-01-23|     95.06|
|2013-01-24|     95.35|
|2013-01-25|     95.15|
|2013-01-28|     95.95|
+----------+----------+
only showing top 20 rows



In [12]:
train_holiday_oil = train_holiday.join(oil_spark, 'date', 'left_outer')
train_holiday_oil.show()

+-------------------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+
|               date|     id|store_nbr|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|dcoilwtico|
+-------------------+-------+---------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+
|2013-05-06 00:00:00|5202345|        1|  103520|       null| train|null|  null|       null|       null|       null|      95.8|
|2013-05-06 00:00:00|5202346|        1|  105574|       null| train|null|  null|       null|       null|       null|      95.8|
|2013-05-06 00:00:00|5202347|        1|  105575|       null| train|null|  null|       null|       null|       null|      95.8|
|2013-05-06 00:00:00|5202348|        1|  105577|       null| train|null|  null|       null|       null|       null|      95.8|
|2013-05-06 00:00:00|5202349|        1|  105693|       null| train|null|  null|       null|       null|       n

In [13]:
train_holiday.unpersist()

DataFrame[date: timestamp, id: int, store_nbr: int, item_nbr: int, onpromotion: boolean, source: string, type: string, locale: string, locale_name: string, description: string, transferred: boolean]

## Join train_holiday_oil with store

### Rename store.type so it would not duplicate with holiday.type

In [14]:
stores = stores.withColumnRenamed("type", "store_type")
train_holiday_oil_store = train_holiday_oil.join(stores, 'store_nbr', 'left_outer')
train_holiday_oil_store.show()

+---------+-------------------+--------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+--------+--------+----------+-------+
|store_nbr|               date|      id|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|dcoilwtico|    city|   state|store_type|cluster|
+---------+-------------------+--------+--------+-----------+------+----+------+-----------+-----------+-----------+----------+--------+--------+----------+-------+
|       31|2013-05-06 00:00:00| 5226034|  103501|       null| train|null|  null|       null|       null|       null|      95.8|Babahoyo|Los Rios|         B|     10|
|       31|2014-01-30 00:00:00|18164509| 1464035|       null| train|null|  null|       null|       null|       null|     98.25|Babahoyo|Los Rios|         B|     10|
|       31|2013-05-06 00:00:00| 5226035|  103520|       null| train|null|  null|       null|       null|       null|      95.8|Babahoyo|Los Rios|         B|     10|
|       31

In [15]:
train_holiday_oil.unpersist()

DataFrame[date: timestamp, id: int, store_nbr: int, item_nbr: int, onpromotion: boolean, source: string, type: string, locale: string, locale_name: string, description: string, transferred: boolean, dcoilwtico: double]

## Join train_holiday_oil_store and transaction

In [16]:
#try to avoid the duplicated column name:
train_holiday_oil_store_transaction = train_holiday_oil_store.join(transactions, ['date', 'store_nbr'], 'left_outer')
train_holiday_oil_store_transaction.show()

+-------------------+---------+-----+--------+-----------+------+----+------+-----------+-----------+-----------+----------+-----+---------+----------+-------+------------+
|               date|store_nbr|   id|item_nbr|onpromotion|source|type|locale|locale_name|description|transferred|dcoilwtico| city|    state|store_type|cluster|transactions|
+-------------------+---------+-----+--------+-----------+------+----+------+-----------+-----------+-----------+----------+-----+---------+----------+-------+------------+
|2013-01-04 00:00:00|       18|97380|  103501|       null| train|null|  null|       null|       null|       null|     93.12|Quito|Pichincha|         B|     16|        1238|
|2013-01-04 00:00:00|       18|97381|  105576|       null| train|null|  null|       null|       null|       null|     93.12|Quito|Pichincha|         B|     16|        1238|
|2013-01-04 00:00:00|       18|97382|  105857|       null| train|null|  null|       null|       null|       null|     93.12|Quito|Pichi

## Join items table

In [17]:
train_holiday_oil_store_transaction_item = train_holiday_oil_store_transaction.join(items, 'item_nbr', 'left_outer')
#train_holiday_oil_store_transaction_item.show()

In [18]:
train_holiday_oil_store_transaction.unpersist()

DataFrame[date: timestamp, store_nbr: int, id: int, item_nbr: int, onpromotion: boolean, source: string, type: string, locale: string, locale_name: string, description: string, transferred: boolean, dcoilwtico: double, city: string, state: string, store_type: string, cluster: int, transactions: int]

## Begin data cleaning

In [68]:
#Created a new dataframe called test, so I would not damage the previous one. 
train_holiday_oil_store_transaction_item_test = train_holiday_oil_store_transaction_item.fillna(False, subset=['onpromotion'])
#train_holiday_oil_store_transaction_item_test.show()

In [20]:
train_holiday_oil_store_transaction_item.unpersist()

DataFrame[item_nbr: int, date: timestamp, store_nbr: int, id: int, onpromotion: boolean, source: string, type: string, locale: string, locale_name: string, description: string, transferred: boolean, dcoilwtico: double, city: string, state: string, store_type: string, cluster: int, transactions: int, family: string, class: int, perishable: int]

In [93]:
#train_holiday_oil_store_transaction_item_test_003[train_holiday_oil_store_transaction_item_test_003['source'] == 'test'].count()

210654

### create a table for on_promotion, encoded

In [69]:
# from pyspark.sql import functions as F
# onpromotions = train_holiday_oil_store_transaction_item_test.select("onpromotion").distinct().rdd.flatMap(lambda x: x).collect()

# exprs = [F.when(F.col("onpromotion") == onpromotion, 1).otherwise(0).alias('onpromotion_' + onpromotion)
#          for onpromotion in onpromotions]

#train_holiday_oil_store_transaction_item_test.select("item_nbr", *exprs).show()

In [None]:
#item table with onpromotion encoded to 1 and 0
# item_onpromotion = train_holiday_oil_store_transaction_item_test.select("item_nbr", *exprs)

### Encode item_family and create a new table

In [70]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

stringIndexer = StringIndexer(inputCol="family", outputCol="familyIndex")
model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test)
indexed = model.transform(train_holiday_oil_store_transaction_item_test)

encoder = OneHotEncoder(inputCol="familyIndex", outputCol="familyIndexVec")
family_encoded = encoder.transform(indexed)

train_holiday_oil_store_transaction_item_test = train_holiday_oil_store_transaction_item_test.drop('family')
#family_encoded.show()

In [71]:
train_holiday_oil_store_transaction_item_test = encoder.transform(indexed)
#train_holiday_oil_store_transaction_item_test = train_holiday_oil_store_transaction_item_test.drop('family')

In [72]:
stringIndexer = StringIndexer(inputCol="city", outputCol="cityIndex")
model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test)
indexed = model.transform(train_holiday_oil_store_transaction_item_test)

encoder = OneHotEncoder(inputCol="cityIndex", outputCol="cityIndexVec")
train_holiday_oil_store_transaction_item_test = encoder.transform(indexed)

train_holiday_oil_store_transaction_item_test.show()

+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+---------+-----------+----------+-------+------------+---------+-----+----------+-----------+--------------+---------+---------------+
|item_nbr|               date|store_nbr|      id|onpromotion|source|      type|  locale|locale_name|         description|transferred|dcoilwtico|     city|      state|store_type|cluster|transactions|   family|class|perishable|familyIndex|familyIndexVec|cityIndex|   cityIndexVec|
+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+---------+-----------+----------+-------+------------+---------+-----+----------+-----------+--------------+---------+---------------+
|  454593|2013-02-08 00:00:00|        6| 1512928|       null| train|      null|    null|       null|                null|       null|     95.71|    Quito|  Pichinc

In [73]:
train_holiday_oil_store_transaction_item_test = train_holiday_oil_store_transaction_item_test.drop('city')

In [74]:
stringIndexer = StringIndexer(inputCol="cluster", outputCol="clusterIndex")
model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test)
indexed = model.transform(train_holiday_oil_store_transaction_item_test)

encoder = OneHotEncoder(inputCol="clusterIndex", outputCol="clusterIndexVec")
train_holiday_oil_store_transaction_item_test = encoder.transform(indexed)

train_holiday_oil_store_transaction_item_test = train_holiday_oil_store_transaction_item_test.drop('cluster')
train_holiday_oil_store_transaction_item_test.show()

+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+-----------+----------+------------+---------+-----+----------+-----------+--------------+---------+---------------+------------+---------------+
|item_nbr|               date|store_nbr|      id|onpromotion|source|      type|  locale|locale_name|         description|transferred|dcoilwtico|      state|store_type|transactions|   family|class|perishable|familyIndex|familyIndexVec|cityIndex|   cityIndexVec|clusterIndex|clusterIndexVec|
+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+-----------+----------+------------+---------+-----+----------+-----------+--------------+---------+---------------+------------+---------------+
|  454593|2013-02-08 00:00:00|        6| 1512928|       null| train|      null|    null|       null|                null|       nu

### Deal with holiday_event dataframe:

## Pay attention to how I fillna with a bolean value here

In [104]:
#fill NaN value in transferred column:
train_holiday_oil_store_transaction_item_test_002 = train_holiday_oil_store_transaction_item_test.fillna('False', subset=['type', 'locale', 'locale_name', 'description'])
train_holiday_oil_store_transaction_item_test_003 = train_holiday_oil_store_transaction_item_test_002.fillna({"transferred": False})

In [100]:
train_holiday_oil_store_transaction_item_test_003[train_holiday_oil_store_transaction_item_test_003['source'] == 'test'].count()

3370464

In [106]:
train_holiday_oil_store_transaction_item_test_003[train_holiday_oil_store_transaction_item_test_003.locale.isNull()].count()

0

In [107]:
stringIndexer = StringIndexer(inputCol="type", outputCol="typeIndex")
model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test_003)
indexed = model.transform(train_holiday_oil_store_transaction_item_test_003)

encoder = OneHotEncoder(inputCol="typeIndex", outputCol="typeIndexVec")
train_holiday_oil_store_transaction_item_test_003 = encoder.transform(indexed)

train_holiday_oil_store_transaction_item_test_003.show()

+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+-----------+----------+------------+---------+-----+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+
|item_nbr|               date|store_nbr|      id|onpromotion|source|      type|  locale|locale_name|         description|transferred|dcoilwtico|      state|store_type|transactions|   family|class|perishable|familyIndex|familyIndexVec|cityIndex|   cityIndexVec|clusterIndex|clusterIndexVec|typeIndex| typeIndexVec|
+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+-----------+----------+------------+---------+-----+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+
|  454593|2013-02-08 00:00:00|        6| 1512928|       nu

In [108]:
stringIndexer = StringIndexer(inputCol="locale_name", outputCol="locale_nameIndex")
model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test_003)
indexed = model.transform(train_holiday_oil_store_transaction_item_test_003)

encoder = OneHotEncoder(inputCol="locale_nameIndex", outputCol="locale_nameIndexVec")
train_holiday_oil_store_transaction_item_test_003 = encoder.transform(indexed)

train_holiday_oil_store_transaction_item_test_003.show()

+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+-----------+----------+------------+---------+-----+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+----------------+-------------------+
|item_nbr|               date|store_nbr|      id|onpromotion|source|      type|  locale|locale_name|         description|transferred|dcoilwtico|      state|store_type|transactions|   family|class|perishable|familyIndex|familyIndexVec|cityIndex|   cityIndexVec|clusterIndex|clusterIndexVec|typeIndex| typeIndexVec|locale_nameIndex|locale_nameIndexVec|
+--------+-------------------+---------+--------+-----------+------+----------+--------+-----------+--------------------+-----------+----------+-----------+----------+------------+---------+-----+----------+-----------+--------------+---------+---------------+------------+---------------+---------

## Combine all dataframes together, then drop original columns

In [109]:
train_holiday_oil_store_transaction_item_test_003 = train_holiday_oil_store_transaction_item_test_003.drop('type', 'description', 'transferred', 'state', 'locale', 'locale_name', 'family', 'class')

In [110]:
#Cast bolean values to int
train_holiday_oil_store_transaction_item_test_003 = train_holiday_oil_store_transaction_item_test_003.withColumn("onpromotion", train_holiday_oil_store_transaction_item_test_003["onpromotion"].cast("integer"))

In [111]:
# stringIndexer = StringIndexer(inputCol="onpromotion", outputCol="onpromotionIndex")
# model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test_003)
# indexed = model.transform(train_holiday_oil_store_transaction_item_test_003)

# encoder = OneHotEncoder(inputCol="onpromotionIndex", outputCol="onpromotionIndexVec")
# train_holiday_oil_store_transaction_item_test_004 = encoder.transform(indexed)

# train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_004.drop('onpromotion')
# train_holiday_oil_store_transaction_item_test_004.show()
train_holiday_oil_store_transaction_item_test_003.show(5)

+--------+-------------------+---------+-------+-----------+------+----------+----------+------------+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+----------------+-------------------+
|item_nbr|               date|store_nbr|     id|onpromotion|source|dcoilwtico|store_type|transactions|perishable|familyIndex|familyIndexVec|cityIndex|   cityIndexVec|clusterIndex|clusterIndexVec|typeIndex| typeIndexVec|locale_nameIndex|locale_nameIndexVec|
+--------+-------------------+---------+-------+-----------+------+----------+----------+------------+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+----------------+-------------------+
|  454593|2013-02-08 00:00:00|        6|1512928|       null| train|     95.71|         D|        1609|         0|        0.0|(32,[0],[1.0])|      0.0| (21,[0],[1.0])|         4.0| (16,[4],[1.0])|      0.0|(6,[0],[1.0])|          

In [112]:
#fillna for onpromation:
train_holiday_oil_store_transaction_item_test_003 = train_holiday_oil_store_transaction_item_test_003.fillna(0, subset=['onpromotion'])

In [114]:
train_holiday_oil_store_transaction_item_test_003[train_holiday_oil_store_transaction_item_test_003.onpromotion.isNull()].count()

0

## Deal with store_type

In [115]:
stringIndexer = StringIndexer(inputCol="store_type", outputCol="store_type_Index")
model = stringIndexer.fit(train_holiday_oil_store_transaction_item_test_003)
indexed = model.transform(train_holiday_oil_store_transaction_item_test_003)

encoder = OneHotEncoder(inputCol="store_type_Index", outputCol="store_type_Index_Vec")
train_holiday_oil_store_transaction_item_test_003 = encoder.transform(indexed)

In [116]:
train_holiday_oil_store_transaction_item_test_003 = train_holiday_oil_store_transaction_item_test_003.drop('store_type')
train_holiday_oil_store_transaction_item_test_003.show()

+--------+-------------------+---------+--------+-----------+------+----------+------------+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+----------------+-------------------+----------------+--------------------+
|item_nbr|               date|store_nbr|      id|onpromotion|source|dcoilwtico|transactions|perishable|familyIndex|familyIndexVec|cityIndex|   cityIndexVec|clusterIndex|clusterIndexVec|typeIndex| typeIndexVec|locale_nameIndex|locale_nameIndexVec|store_type_Index|store_type_Index_Vec|
+--------+-------------------+---------+--------+-----------+------+----------+------------+----------+-----------+--------------+---------+---------------+------------+---------------+---------+-------------+----------------+-------------------+----------------+--------------------+
|  454593|2013-02-08 00:00:00|        6| 1512928|          0| train|     95.71|        1609|         0|        0.0|(32,[0],[1.0])|      0.0| (21,

In [87]:
train_holiday_oil_store_transaction_item_test_003.columns

['item_nbr',
 'date',
 'store_nbr',
 'id',
 'onpromotion',
 'source',
 'dcoilwtico',
 'transactions',
 'perishable',
 'familyIndex',
 'familyIndexVec',
 'cityIndex',
 'cityIndexVec',
 'clusterIndex',
 'clusterIndexVec',
 'typeIndex',
 'typeIndexVec',
 'locale_nameIndex',
 'locale_nameIndexVec',
 'store_type_Index',
 'store_type_Index_Vec']

In [88]:
#train_holiday_oil_store_transaction_item_test_004.where(train_holiday_oil_store_transaction_item_test_004['transactions'].isNull()).count()

In [117]:
train_holiday_oil_store_transaction_item_test_003.where(train_holiday_oil_store_transaction_item_test_003['source'] == 'test').count()

3370464

In [119]:
train_holiday_oil_store_transaction_item_test_003[train_holiday_oil_store_transaction_item_test_003.dcoilwtico.isNull()].count()

39768779

##  (drop NaN oil temporary)

In [120]:
#without oil column
train_holiday_oil_store_transaction_item_test_004 = train_holiday_oil_store_transaction_item_test_003.drop('dcoilwtico')

## Separate Train set and Test set

In [121]:
train_001 = train_holiday_oil_store_transaction_item_test_004.filter(train_holiday_oil_store_transaction_item_test_003.source == 'train')
test_001 = train_holiday_oil_store_transaction_item_test_004.filter(train_holiday_oil_store_transaction_item_test_003.source == 'test')

In [122]:
train_target = train.select('id', 'unit_sales')

In [123]:
train_002 = train_001.join(train_target, 'id', 'left_outer')

In [124]:
train_003 = train_002.drop('transactions', 'dcoilwtico')

In [65]:
test.count()

3370464

In [125]:
test_001.count()

3370464

In [None]:
test_001.show()

In [None]:
train_holiday_oil_store_transaction_item_test.unpersist()
train_holiday_oil_store_transaction_item_test_002.unpersist()
#train_holiday_oil_store_transaction_item_test_003.unpersist()

In [126]:
test_002 = test_001.drop('transactions', 'dcoilwtico')

In [127]:
print(train_003.columns)

['id', 'item_nbr', 'date', 'store_nbr', 'onpromotion', 'source', 'perishable', 'familyIndex', 'familyIndexVec', 'cityIndex', 'cityIndexVec', 'clusterIndex', 'clusterIndexVec', 'typeIndex', 'typeIndexVec', 'locale_nameIndex', 'locale_nameIndexVec', 'store_type_Index', 'store_type_Index_Vec', 'unit_sales']


In [128]:
print(test_002.columns)

['item_nbr', 'date', 'store_nbr', 'id', 'onpromotion', 'source', 'perishable', 'familyIndex', 'familyIndexVec', 'cityIndex', 'cityIndexVec', 'clusterIndex', 'clusterIndexVec', 'typeIndex', 'typeIndexVec', 'locale_nameIndex', 'locale_nameIndexVec', 'store_type_Index', 'store_type_Index_Vec']


# Begin training and predicting

In [129]:
from pyspark.mllib.regression import LabeledPoint

In [131]:
train_003.dtypes

[('id', 'int'),
 ('item_nbr', 'int'),
 ('date', 'timestamp'),
 ('store_nbr', 'int'),
 ('onpromotion', 'int'),
 ('source', 'string'),
 ('perishable', 'int'),
 ('familyIndex', 'double'),
 ('familyIndexVec', 'vector'),
 ('cityIndex', 'double'),
 ('cityIndexVec', 'vector'),
 ('clusterIndex', 'double'),
 ('clusterIndexVec', 'vector'),
 ('typeIndex', 'double'),
 ('typeIndexVec', 'vector'),
 ('locale_nameIndex', 'double'),
 ('locale_nameIndexVec', 'vector'),
 ('store_type_Index', 'double'),
 ('store_type_Index_Vec', 'vector'),
 ('unit_sales', 'double')]

In [87]:
#check dtypes before preceding!!
# train_004 = train_004.withColumn("unit_sales", train_004["unit_sales"].cast("float"))
# train_004 = train_004.withColumn("store_nbr", train_004["store_nbr"].cast("float"))
# train_004 = train_004.withColumn("perishable", train_004["perishable"].cast("float"))

In [134]:
train_004 = train_003.select('unit_sales', 'store_nbr', 'perishable', 'familyIndex', \
                             'cityIndex',  'clusterIndex',  'typeIndex', \
                             'locale_nameIndex',  \
                              'store_type_Index')
temp = train_004.rdd.map(lambda line:LabeledPoint(line[0],[line[1:]]))
temp.take(5)

[LabeledPoint(2.0, [25.0,0.0,0.0,13.0,8.0,1.0,1.0,0.0]),
 LabeledPoint(4.0, [25.0,0.0,2.0,13.0,8.0,1.0,1.0,0.0]),
 LabeledPoint(2.0, [25.0,0.0,0.0,13.0,8.0,1.0,1.0,0.0]),
 LabeledPoint(1.0, [25.0,0.0,0.0,13.0,8.0,1.0,1.0,0.0]),
 LabeledPoint(2.0, [1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0])]

In [135]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

In [136]:
features = train_004.rdd.map(lambda row: row[1:])
features.take(5)

[(25, 0, 0.0, 13.0, 8.0, 1.0, 1.0, 0.0),
 (25, 0, 2.0, 13.0, 8.0, 1.0, 1.0, 0.0),
 (25, 0, 0.0, 13.0, 8.0, 1.0, 1.0, 0.0),
 (25, 0, 0.0, 13.0, 8.0, 1.0, 1.0, 0.0),
 (1, 0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0)]

In [137]:
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)

In [138]:
features_transform.take(5)

[DenseVector([1.531, 0.0, 0.0, 2.2903, 1.8854, 1.2746, 0.2836, 0.0]),
 DenseVector([1.531, 0.0, 0.4524, 2.2903, 1.8854, 1.2746, 0.2836, 0.0]),
 DenseVector([1.531, 0.0, 0.0, 2.2903, 1.8854, 1.2746, 0.2836, 0.0]),
 DenseVector([1.531, 0.0, 0.0, 2.2903, 1.8854, 1.2746, 0.2836, 0.0]),
 DenseVector([0.0612, 0.0, 0.0, 0.0, 0.9427, 0.0, 0.0, 0.0])]

In [141]:
lab = train_004.rdd.map(lambda row: row[0])
lab.take(5)

[2.0, 4.0, 2.0, 1.0, 2.0]

In [142]:
transformedData = lab.zip(features_transform)
transformedData.take(5)

[(2.0, DenseVector([1.531, 0.0, 0.0, 2.2903, 1.8854, 1.2746, 0.2836, 0.0])),
 (4.0, DenseVector([1.531, 0.0, 0.4524, 2.2903, 1.8854, 1.2746, 0.2836, 0.0])),
 (2.0, DenseVector([1.531, 0.0, 0.0, 2.2903, 1.8854, 1.2746, 0.2836, 0.0])),
 (1.0, DenseVector([1.531, 0.0, 0.0, 2.2903, 1.8854, 1.2746, 0.2836, 0.0])),
 (2.0, DenseVector([0.0612, 0.0, 0.0, 0.0, 0.9427, 0.0, 0.0, 0.0]))]

In [143]:
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))

In [144]:
trainingData, testingData = transformedData.randomSplit([.8,.2],seed=1234)

In [None]:
from pyspark.mllib.regression import LinearRegressionWithSGD
linearModel = LinearRegressionWithSGD.train(trainingData,1000,.2)



In [None]:
testingData.take(10)

# Checking the Model with Metrics

In [None]:
from pyspark.mllib.evaluation import RegressionMetrics
prediObserRDDin = trainingData.map(lambda row: (float(linearModel.predict(row.features[0])),row.label))
metrics = RegressionMetrics(prediObserRDDin)
metrics.r2