In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Small in-memory list used as sample input.
data = [1, 2, 3, 4]
# Hand the list to the Spark context to get a distributed dataset.
# NOTE(review): `sc` is never created in the visible cells — presumably it is
# injected by the PySpark kernel/shell; confirm before running on a fresh kernel.
distdata = sc.parallelize(data)

In [3]:
# Display the version string reported by the Spark context.
sc.version

'2.0.1'

In [4]:
# Keep only the elements strictly below 3.
smaller = distdata.filter(lambda x: x < 3)
# Show the first element of the filtered dataset.
smaller.first()

1

In [5]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
import pyspark.ml as ml

In [6]:
# Load and parse the data
def parsePoint(line):
    """Convert one text record into a ``LabeledPoint``.

    Commas are treated as whitespace, so the numbers in a record may be
    separated by commas, spaces, tabs or any mix of them. The first number
    becomes the label; the remaining numbers become the feature list.

    Args:
        line: A string of numbers, e.g. ``"-0.43,-1.63 -2.00"``.

    Returns:
        ``LabeledPoint(values[0], values[1:])``.

    Raises:
        ValueError: If the line contains no numbers, or a token is not a
            valid float.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace. split(' ') would produce '' tokens for
    # input such as "1, 2" or a trailing newline, and float('') raises.
    values = [float(token) for token in line.replace(',', ' ').split()]
    if not values:
        # Keep the exception type the original raised for an empty record.
        raise ValueError("empty record: %r" % (line,))
    return LabeledPoint(values[0], values[1:])


In [7]:
# Read the lpsa sample file from HDFS as a dataset of text lines.
# NOTE: this rebinds `data`, which an earlier cell used for a plain Python list.
data = sc.textFile('hdfs://lattice-20:46780/test/lpsa.data')
# Parse every line into a LabeledPoint.
parsedData = data.map(parsePoint)

In [8]:
# Count the parsed records as a sanity check on the load.
parsedData.count()

67

In [9]:
# Fit a linear regression with SGD on the parsed points: 100 iterations, step size 1e-8.
# NOTE(review): LinearRegressionWithSGD appears to be deprecated as of Spark 2.0
# (this notebook reports 2.0.1) — confirm and consider pyspark.ml's LinearRegression.
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)



In [10]:
# Pair each true label with the model's prediction for that point.
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# Mean squared error: total of the squared residuals divided by the pair count.
squared_errors = valuesAndPreds.map(lambda pair: (pair[0] - pair[1]) ** 2)
total_squared_error = squared_errors.reduce(lambda acc, err: acc + err)
MSE = total_squared_error / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 7.4510328101


In [11]:
# NOAA data test
import json

In [12]:
# Load every exported NOAA file matching the glob as a dataset of text lines.
noaa_raw = sc.textFile('hdfs://lattice-20:46780/test/exported_*.json')

In [13]:
# Decode each text line as its own JSON document.
noaa_json = noaa_raw.map(json.loads)

In [14]:
# Peek at the first decoded record to see which fields it carries.
noaa_json.first()

{'categorical_snow_yes1_no0_surface': 0.0,
 'ice_cover_ice1_no_ice0_surface': 0.0,
 'lightning_surface': 0.0,
 'relative_humidity_zerodegc_isotherm': 14.0,
 'snow_depth_surface': 0.0,
 'upward_short_wave_rad_flux_surface': 0.0}

In [15]:
#ind_var = ('categorical_snow_yes1_no0_surface','ice_cover_ice1_no_ice0_surface', 'lightning_surface',
#           'snow_depth_surface', 'upward_short_wave_rad_flux_surface')
# Record fields used, in this order, as the feature values.
ind_var2 = ('snow_depth_surface', 'categorical_snow_yes1_no0_surface')


def parseJsonRecord(rec):
    """Build a ``LabeledPoint`` from one decoded NOAA record.

    The features are the values of the fields listed in ``ind_var2``, each
    cast to float, in that order. The label is the record's
    ``relative_humidity_zerodegc_isotherm`` value, passed through as-is.

    Args:
        rec: Mapping from field name to value (indexed as ``rec[name]``).

    Returns:
        ``LabeledPoint(label, features)``.

    Raises:
        KeyError: If ``rec`` lacks one of the required fields.
    """
    features = []
    for field in ind_var2:
        features.append(float(rec[field]))
    label = rec['relative_humidity_zerodegc_isotherm']
    return LabeledPoint(label, features)

In [16]:
# Convert every decoded record into a LabeledPoint.
noaa_input = noaa_json.map(parseJsonRecord)

In [17]:
# Total number of labeled records; `rec_count` is not referenced by the other visible cells.
rec_count = noaa_input.count()

In [18]:
# Randomly split the records with weights 0.7 / 0.3 into training and test sets;
# the fixed seed is there to keep the split repeatable.
training_rdd, test_rdd = noaa_input.randomSplit(weights=[0.7, 0.3], seed=123)

In [19]:
# Report the size of each split: training first, then test.
for split_rdd in (training_rdd, test_rdd):
    print(split_rdd.count())

139672
60328


In [20]:
# Fit a linear regression with SGD on the training split: 100 iterations, step size 1e-8.
# NOTE(review): the sample prediction recorded further down is 0.0 for a label of
# 14.0 and the reported MSE is large; with such a tiny step and no intercept
# argument the model may be barely moving from its starting weights — confirm,
# and consider tuning `step` and/or enabling an intercept.
noaa_model = LinearRegressionWithSGD.train(training_rdd, iterations=100, step=0.00000001)



In [21]:
# Pair each held-out label with the model's prediction for its features.
noaa_predic = test_rdd.map(lambda p: (p.label, noaa_model.predict(p.features)))

In [22]:
# Inspect one (label, prediction) pair from the test split.
noaa_predic.first()

(14.0, 0.0)

In [23]:
# Mean squared error over the test pairs: total of the squared residuals
# divided by the pair count. Rebinds `MSE`, which an earlier cell also assigns.
noaa_squared_errors = noaa_predic.map(lambda pair: (pair[0] - pair[1]) ** 2)
noaa_total_squared_error = noaa_squared_errors.reduce(lambda acc, err: acc + err)
MSE = noaa_total_squared_error / noaa_predic.count()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 3665.71467336
