In [1]:
import trustedanalytics as ta
# Open the client session with the ATK server; the message printed below
# reports the server URL, server version, user and connection time.
ta.connect()

Connected.  This client instance connected to server http://localhost:9099/v1 (version=TheReneNumber) as user test_api_key_1 at 2016-02-24 10:39:47.791126.


### Create a frame with data that we'll use to train the ARX model

The frame has columns for the observed value "y" and several other columns that contain exogenous variables (visitors, weekends, seasonality, etc.).


In [2]:
# Column layout shared by the training and test CSV files: the observed
# series "y" followed by the exogenous variables, all loaded as float64.
schema = [
    ("y", ta.float64),
    ("visitors", ta.float64),
    ("wkends", ta.float64),
    ("seasonality", ta.float64),
    ("incidentRate", ta.float64),
    ("holidayFlag", ta.float64),
    ("postHolidayFlag", ta.float64),
    ("mintemp", ta.float64),
]

# skip_header_lines=1 skips the first line of the file (presumably the
# column-header row -- confirm against train_atk.csv).
csv = ta.CsvFile("train_atk.csv", schema=schema, skip_header_lines=1)

# Build the training frame from the CSV description.
frame = ta.Frame(csv)



In [3]:
# Preview the training frame; the printed output wraps the eight columns
# into two side-by-side tables.
frame.inspect()

[#]  y      visitors  wkends  seasonality  incidentRate  holidayFlag
[0]   93.0     416.0     0.0  0.006103106          28.0          0.0
[1]   82.0     393.0     0.0  0.005381233          28.0          0.0
[2]  109.0     444.0     0.0  0.007153103          28.0          0.0
[3]  110.0     445.0     0.0  0.007218727          28.0          0.0
[4]  109.0     426.0     1.0  0.007153103          28.0          0.0
[5]   84.0     435.0     1.0  0.005512483          28.0          0.0
[6]  100.0     471.0     0.0  0.006562479          29.0          0.0
[7]   91.0     397.0     0.0  0.005971856          29.0          0.0
[8]  119.0     454.0     0.0  0.007809351          29.0          0.0
[9]   78.0     416.0     0.0  0.005118734          29.0          0.0

[#]  postHolidayFlag  mintemp
[0]              0.0     55.0
[1]              0.0     57.0
[2]              0.0     53.0
[3]              0.0     55.0
[4]              0.0     57.0
[5]              0.0     50.0
[6]              0.0     50.0


### Create and train the model

Create an ARX model, and then train the model by providing the frame of data, the "y" column, a list of "x" columns, y max lag, x max lag, and a boolean flag indicating if the intercept should be dropped.

The ARX model's train() returns 'c' (an intercept term, or 0 for no intercept) and a list of coefficients (one for each "x" column).


In [4]:
# Observed series to model.
y_column = "y"

# Exogenous regressors; train() reports one coefficient per entry of this
# list.
x_columns = [
    "visitors",
    "wkends",
    "seasonality",
    "incidentRate",
    "holidayFlag",
    "postHolidayFlag",
    "mintemp",
]

# Maximum lags for the "y" column and for the "x" columns.
y_max_lag = 0
x_max_lag = 0

# True drops the intercept, so the returned 'c' is 0.0.
no_intercept = True

# Create the model and fit it on the training frame; the call is left as the
# cell's last expression so its result (c and coefficients) is displayed.
arx = ta.ArxModel()
arx.train(frame, y_column, x_columns, y_max_lag, x_max_lag, no_intercept)



{u'c': 0.0,
 u'coefficients': [-1.136026484226831e-08,
  8.637677568908233e-07,
  15238.143039368977,
  -7.993535860373772e-09,
  -5.198597570089805e-07,
  1.5691547009557947e-08,
  7.409621376205488e-08]}

So, in this example the coefficients are:

| x              | coefficient            |
|----------------|------------------------|
| visitors       | -1.136026484226831e-08 |
|wkends          |  8.637677568908233e-07 |
|seasonality     |  15238.143039368977    |
|incidentRate    | -7.993535860373772e-09 |
|holidayFlag     | -5.198597570089805e-07 |
|postHolidayFlag |  1.5691547009557947e-08|
|mintemp         |  7.409621376205488e-08 |


### Create a frame that contains test data

The test data is in test_atk.csv and has the same schema that we used for training.

In [5]:
# Describe the test CSV with the same schema object used for training;
# skip_header_lines=1 skips the file's first line.
test_csv = ta.CsvFile("test_atk.csv", schema=schema, skip_header_lines=1)
# Build the test frame and preview it.
test_frame = ta.Frame(test_csv)
test_frame.inspect()



[#]  y      visitors  wkends  seasonality  incidentRate  holidayFlag
[0]  100.0     465.0     1.0  0.006562479          24.0          1.0
[1]   98.0     453.0     1.0   0.00643123          24.0          0.0
[2]  102.0     472.0     0.0  0.006693729          25.0          0.0
[3]   98.0     454.0     0.0   0.00643123          25.0          0.0
[4]  112.0     432.0     0.0  0.007349977          25.0          0.0
[5]   99.0     431.0     0.0  0.006496855          25.0          0.0
[6]   99.0     475.0     0.0  0.006496855          25.0          0.0
[7]   87.0     393.0     1.0  0.005709357          25.0          0.0
[8]  103.0     437.0     1.0  0.006759354          25.0          0.0
[9]  115.0     537.0     0.0  0.007546851          23.0          0.0

[#]  postHolidayFlag  mintemp
[0]              0.0     51.0
[1]              1.0     54.0
[2]              0.0     49.0
[3]              0.0     46.0
[4]              0.0     42.0
[5]              0.0     41.0
[6]              0.0     45.0


### Predict

Using the frame of test data, run ARX predict().

In [6]:
# Run the trained model on the test frame, using the same "y" column and
# "x" columns that were passed to train().
p = arx.predict(test_frame, y_column, x_columns)
# Show the observed "y" next to "predicted_y"; n=p.row_count asks inspect()
# for as many rows as the result frame holds.
p.inspect(n=p.row_count,columns=["y", "predicted_y"])



[##]  y      predicted_y  
[0]   100.0  99.9999923433
[1]    98.0  98.0000022017
[2]   102.0  101.999998038
[3]    98.0  98.0000007101
[4]   112.0  111.999998867
[5]    99.0  99.0000037379
[6]    99.0  99.0000035344
[7]    87.0  99.9999923433
[8]   103.0  98.0000022017
[9]   115.0  101.999998038
[10]  101.0  98.0000007101
[11]  125.0  111.999998867
[12]  117.0  99.0000037379
[13]  109.0  99.0000035344
[14]  111.0  86.9999982366
[15]  105.0  103.000002362

In [7]:
# Publish the trained model; the returned dict (displayed below) describes
# the artifact: category 'model', format 'tar', and its HDFS source/target URI.
arx.publish()



{u'category': u'model',
 u'dataSample': u'',
 u'format': u'tar',
 u'isPublic': False,
 u'recordCount': 0,
 u'size': 214773760,
 u'sourceUri': u'hdfs://ATK-WSA:8020/user/atkuser/models_ffdf82616a0144b7938044ad37c3c276.tar',
 u'targetUri': u'hdfs://ATK-WSA:8020/user/atkuser/models_ffdf82616a0144b7938044ad37c3c276.tar',
 u'title': u'arx_model'}