In [1]:
%sh curl --fail -O 'https://raw.githubusercontent.com/bsullins/bensullins.com-freebies/master/CogsleyServices-SalesData-US.csv'

# Download the Cogsley Services sales CSV into the current working directory.
# --fail makes curl exit non-zero on an HTTP error instead of silently saving
# the server's error page under the CSV's file name.

In [2]:
# List the driver-local directory to confirm the CSV was downloaded.
display(dbutils.fs.ls('file:/databricks/driver'))



path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/ganglia/,ganglia/,4096
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/derby.log,derby.log,746
file:/databricks/driver/CogsleyServices-SalesData-US.csv,CogsleyServices-SalesData-US.csv,2162356


In [3]:
# Read the downloaded sales CSV and drop incomplete rows.
path = 'file:/databricks/driver/CogsleyServices-SalesData-US.csv'

# Use the Spark CSV datasource with options specifying:
# - First line of file is a header
# - Automatically infer the schema of the data
sales_raw = sqlContext.read.format('CSV').options(header = 'true', inferschema = 'true').load(path)

# Drop every row that contains a null, then cache the result: caching after
# dropna() means the cached frame is the cleaned one that later cells reuse.
data = sales_raw.dropna()
data.cache()
display(data)


RowID,OrderID,OrderDate,OrderMonthYear,Quantity,Quote,DiscountPct,Rate,SaleAmount,CustomerName,CompanyName,Sector,Industry,City,ZipCode,State,Region,ProjectCompleteDate,DaystoComplete,ProductKey,ProductCategory,ProductSubCategory,Consultant,Manager,HourlyWage,RowCount,WageMargin
1914,13729,2009-01-01,2009-01-01,9,1800,0.08,200,1640.96,Matt Bertelsons,The Priceline Group Inc.,Miscellaneous,Business Services,Bowie,20715,Maryland,East,2009-01-03,2,Development - Big Data,Development,Python,Noah Smith,Allen Young,59,1,0.71
4031,28774,2009-01-01,2009-01-01,32,6400,0.1,200,5707.67,Jessica Thornton,Garmin Ltd.,Capital Goods,Industrial Machinery/Components,McKeesport,15131,Pennsylvania,East,2009-01-02,1,Development - Big Data,Development,Market Research,Daniel Tusk,Allen Young,45,1,0.78
1279,9285,2009-01-02,2009-01-01,3,480,0.06,160,447.11,David O'Rourke,Wynn Resorts Limited,Consumer Services,Hotels/Resorts,Prior Lake,55372,Minnesota,Central,2009-01-04,2,Development - Java,Development,Python,Mason Gibson,Josh Martinez,71,1,0.56
5272,37537,2009-01-02,2009-01-01,4,500,0.0,125,495.47,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-02,0,Training - Development,Training,Java,William Bufont,Bob Turner,62,1,0.5
5273,37537,2009-01-02,2009-01-01,43,5375,0.07,125,4953.46,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-04,2,Training - Development,Training,Strategy,Liam Franklin,Bob Turner,52,1,0.58
5274,37537,2009-01-02,2009-01-01,32,6400,0.05,200,6024.92,Alan Brumley,Bed Bath & Beyond Inc.,Consumer Services,Home Furnishings,Napa,94559,California,West,2009-01-09,7,Development - Big Data,Development,.Net,Emma Watson,Bob Turner,67,1,0.67
6224,44069,2009-01-02,2009-01-01,16,1760,0.09,110,1587.09,Elizabeth Hansen,Fastenal Company,Consumer Services,RETAIL: Building Materials,Montebello,90640,California,West,2009-01-04,2,Development - Python,Development,Business Model,Sophia Dixon,Bob Turner,71,1,0.35
6225,44069,2009-01-02,2009-01-01,43,4730,0.08,110,4312.18,Elizabeth Hansen,Fastenal Company,Consumer Services,RETAIL: Building Materials,Montebello,90640,California,West,2009-01-02,0,Development - Python,Development,SQL,Mia Moore,Bob Turner,51,1,0.54
1074,7909,2009-01-03,2009-01-01,29,3480,0.03,120,3345.1,Alex Grayson,C.H. Robinson Worldwide Inc.,Transportation,Oil Refining/Marketing,Lake Oswego,97035,Oregon,West,2009-01-04,1,Development - Business Logic,Development,Market Research,Abigail Young,Bob Turner,50,1,0.58
1315,9637,2009-01-03,2009-01-01,12,1800,0.08,150,1641.04,Andy Willingham,DIRECTV,Consumer Services,Telecommunications Equipment,Baton Rouge,70802,Louisiana,South,2009-01-05,2,Consulting - Business Model,Consulting,Java,Madison Hill,Frank Mitchell,58,1,0.61


In [4]:
# Aggregate: total SaleAmount per OrderMonthYear, ordered by month.
# The result columns are named exactly 'OrderMonthYear' / 'SaleAmount' because
# the following cell reads them as Row attributes (r.OrderMonthYear,
# r.SaleAmount), and Row attribute lookup is case-sensitive.
summary = (data
           .select('OrderMonthYear', 'SaleAmount')
           .groupBy('OrderMonthYear')
           .sum()
           .orderBy('OrderMonthYear')
           .toDF('OrderMonthYear', 'SaleAmount'))



In [5]:


# Convert OrderMonthYear (e.g. '2009-01-01') to an integer such as 20090101.
# Fields are read by position (0 = month, 1 = summed sale amount) so the
# conversion does not depend on the letter case of summary's column names.
# .rdd.map is used instead of DataFrame.map, which only exists on Spark 1.x.
results = (summary.rdd
           .map(lambda r: (int(r[0].replace('-', '')), r[1]))
           .toDF(["OrderMonthYear", "SaleAmount"]))

In [6]:
# convert dataframes to features and labels

from pyspark.mllib.regression import LabeledPoint

# Each row becomes a LabeledPoint whose label is SaleAmount (r[1]) and whose
# single feature is OrderMonthYear (r[0]).
# NOTE: this rebinds `data`, which until now held the cleaned sales DataFrame.
# .rdd.map is used instead of DataFrame.map, which only exists on Spark 1.x.
data = (results.select("OrderMonthYear", "SaleAmount")
        .rdd
        .map(lambda r: LabeledPoint(r[1], [r[0]]))
        .toDF())

In [7]:
# Build the linear regression estimator
from pyspark.ml.regression import LinearRegression
lr = LinearRegression()

# Train two models that differ only in regParam; each value is supplied as a
# per-fit param-map override rather than set on the estimator itself.
params_a = {lr.regParam: 0.0}
params_b = {lr.regParam: 100.0}
modelA = lr.fit(data, params_a)
modelB = lr.fit(data, params_b)

# Score the same DataFrame the models were trained on
predictionA = modelA.transform(data)
predictionB = modelB.transform(data)

In [8]:
# check models for accuracy

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName = 'rmse')

# Report the root mean squared error of each model's predictions.
# The value is stored in `rmse` (lowercase); printing `RMSE` raised NameError.
rmse = evaluator.evaluate(predictionA)
print("ModelA: Root Mean Squared Error = " + str(rmse))
rmse = evaluator.evaluate(predictionB)
print("ModelB: Root Mean Squared Error = " + str(rmse))

In [9]:

# define column names
cols = ["OrderMonthYear", "SaleAmount", "Prediction"]


def _prediction_table(predictions):
    """Flatten a prediction DataFrame into (OrderMonthYear, SaleAmount, Prediction) rows.

    `predictions` must expose `features`, `label` and `prediction` columns.
    The first feature is cast to a plain float, the rows are collected to the
    driver and re-parallelized, and the result is returned as a DataFrame
    whose columns are named by `cols`.
    """
    rows = predictions.rdd.map(
        lambda r: (float(r.features[0]), r.label, r.prediction)
    ).collect()
    return sc.parallelize(rows).toDF(cols)


# Build one table per model from the DataFrames produced by transform()
# (predictionA / predictionB).
tableA = _prediction_table(predictionA)

# repeat for modelB
tableB = _prediction_table(predictionB)

# check results
# display(tableA)

# save results as tables
tableA.write.saveAsTable('predictionsA', mode='overwrite')
print("Created predictionsA table")

tableB.write.saveAsTable('predictionsB', mode='overwrite')
print("Created predictionsB table")


In [10]:
%sql
SELECT
    a.OrderMonthYear,
    a.SaleAmount,
    a.prediction AS ModelA,
    b.prediction AS ModelB
FROM predictionsA AS a
INNER JOIN predictionsB AS b
    ON a.OrderMonthYear = b.OrderMonthYear
