In [1]:
import sys
sys.path.append('..')
from lib import *
import pandas as pd
import numpy as np

forest = RNF(None, None, None, None, None, None, None)
forest.load_rnf('scen1.first600.64trees.10depth.42rs.20maxfeats.600maxin.pickle')

# Testing whole-forest importance

In [None]:
d = forest.get_feature_importances()

In [None]:
sorted([(d[key], key) for key in d.keys()], reverse=True)

# Testing individual predict

## Testing individual predict on fake forest

Forest consisting of two trees:
```
train_data = [.1,.99,1],[.3,.15,0],[.75,.1,0],[.9,.55,1]

        A<.5
       /    \
     B<.25  B<.5
     /  \   / \
    1   0  0   1
    
        B<.25
       /   \
      1     0
```

In [91]:
train_data_dummy = pd.DataFrame([[.1,.99,.75,'1'],[.3,.15,.5,'0'],[.75,.1,.4,'0'],[.9,.55,.1,'1']], columns=["A","B","C","Label"])

In [92]:
train_data_dummy

Unnamed: 0,A,B,C,Label
0,0.1,0.99,0.75,1
1,0.3,0.15,0.5,0
2,0.75,0.1,0.4,0
3,0.9,0.55,0.1,1


In [93]:
forest_dummy = RNF(train_data_dummy, 2, 2, 43, 2, 4, [])

In [94]:
forest_dummy.fit()

fitting the 1th tree.
fitting the 2th tree.


In [95]:
[t.features for t in forest_dummy.trees]

[array(['C', 'A'], dtype=object), array(['B', 'A'], dtype=object)]

In [96]:
print("{}<{}".format(forest_dummy.trees[1].head.min_feature, forest_dummy.trees[1].head.min_break_point))

B<0.55


In [97]:
train_data_dummy.loc[0,"Label"]

'1'

In [98]:
[train_data_dummy.loc[x,"Label"] for x in forest_dummy.trees[0].head.left.right.rows]

['1', '1']

```
   A<.9
   /    \
  C<.75  [1]
  / \
 [0] [1,1]
 
   B<.55
   /    \
 [0]     [1,1,1]
```

In [102]:
test_data_dummy = pd.DataFrame([[.95,.65,.3,'1'], [.1,.6,.4,'0']], columns=["A","B","C","Label"])

In [103]:
test_data_dummy

Unnamed: 0,A,B,C,Label
0,0.95,0.65,0.3,1
1,0.1,0.6,0.4,0


In [106]:
predictions_dummy = forest_dummy.predict_with_feat_imp(test_data_dummy)

In [107]:
predictions_dummy

([array([ 1.,  0.]), array([ 0.5,  0.5])],
 ['1', '0'],
 [{'A_high': 0.125, 'B_high': 0.125},
  {'A_low': -0.041666666666666685,
   'B_high': 0.125,
   'C_low': -0.3333333333333333}])

In [113]:
play_with = predictions_dummy[2][1]
print((play_with['A_low'] + play_with['C_low']) * 2)
print((play_with['B_high']) * 2)

-0.75
0.25


## Testing individual predict on real data

In [2]:
lsa_np = np.load('../data/parsed/lsa_output.npy')

metadata = pd.read_pickle('../data/parsed/pickles/pickled_data_test.pickle')
metadata = metadata.loc[metadata['Scenario'] == '401']
metadata = metadata.reset_index(drop=True)

lsa_df = pd.DataFrame(lsa_np)

df = pd.concat([metadata, lsa_df], axis=1, join_axes=[metadata.index])
df = df.loc[df['Label'] != '-1']
df = df.reset_index(drop=True)

cat_features = ['To','From']
features = list(range(100))
features.extend(cat_features + ['Date'])

df = df[features + ['Label', 'ID']]

test_df = df[600:].reset_index(drop=True)

In [3]:
predictions = forest.predict_with_feat_imp(test_df)

In [None]:
metadata.loc[metadata['ID']=='3.407765.PS41KHHMGYEA31NWW4S5W0SLOQLAU3WYA']['Message-Contents']

In [None]:
test_df.loc[1,'ID']

In [5]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,To,From,Date,Label,ID
0,0.093741,-0.126488,-0.065405,-0.004043,0.045035,-0.050174,-0.06459,0.004562,-0.055888,0.03465,...,-0.013066,0.010517,-0.001265,-0.07677,0.034415,"[dana.davis@enron.com, mike.carson@enron.com,...",[m..presto@enron.com],2001-09-28 08:12:58-07:00,0,3.407765.PS41KHHMGYEA31NWW4S5W0SLOQLAU3WYA
1,0.090775,-0.059612,-0.064171,0.011959,0.138535,0.034246,-0.023969,-0.045525,0.022183,0.05427,...,-0.002251,0.086652,-0.003936,0.00092,0.011898,[adam.johnson@enron.com],[m..presto@enron.com],2001-05-31 17:50:54-07:00,1,3.409003.ILYTPFB35EUPQ0GB5TZWNWSDGEKRNVOPA
2,0.096363,-0.090933,-0.030963,-0.010002,0.083191,-0.064575,-0.005209,0.03519,-0.057231,-0.031811,...,0.065175,0.028856,-0.014752,-0.025459,0.030946,[bryan.garrett@enron.com],[m..presto@enron.com],2001-06-14 11:13:42-07:00,0,3.409320.FHLHSXTTKLIOOBE35ASNSLLMDGHIVKOHB
3,0.233509,-0.282491,-0.005837,-0.064291,-0.048876,-0.016049,0.191338,0.451776,0.194354,0.289672,...,-0.006734,-0.021149,0.026081,0.016384,-0.021385,[joe.quenet@enron.com],[rebecca.quenet@eds.com],2001-10-22 10:44:43-07:00,0,3.273672.EOX4B3B2UPTKB4QXBY22HLFHGLAL3D3GA
4,0.098869,-0.104493,-0.062955,0.027021,0.083626,-0.074546,-0.072641,-0.109721,0.150989,0.04998,...,-0.048538,-0.078042,0.014199,-0.018502,0.049561,[orig.dl-eol@enron.com],[david.forster@enron.com],2002-01-11 16:03:53-08:00,1,3.306937.FKZCZAUZJJEVTK2QMSRY1BPRE1UCDVRHB


In [6]:
predictions[1]

['1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1']

In [13]:
doc_to_examine = 4
sorted([(predictions[2][doc_to_examine][key], key) for key in predictions[2][doc_to_examine].keys()], reverse=predictions[1][doc_to_examine]=='1')

[(0.0030398955759813155, '74_low'),
 (0.0025636834600656954, '82_low'),
 (0.0020039419756239605, '3_low'),
 (0.001918029884959923, '50_low'),
 (0.0018675938507064574, '19_low'),
 (0.0018610204795643723, '59_low'),
 (0.0015205381643968296, '34_low'),
 (0.0012352524552462574, '36_low'),
 (0.0011957949272733536, '93_low'),
 (0.001153266815479322, '20_low'),
 (0.0011394838616557096, 'Date_low'),
 (0.0011004612914014885, '66_low'),
 (0.001018006670097783, '60_low'),
 (0.0009687223541931999, '80_low'),
 (0.0009673333568589095, '49_low'),
 (0.0009171713467908803, '17_low'),
 (0.000869513225824365, '22_low'),
 (0.0008339467713016392, '11_low'),
 (0.0008283539683156919, '32_low'),
 (0.0008155226698578766, '2_low'),
 (0.0007864133267148025, '13_low'),
 (0.0007130121235558173, '42_low'),
 (0.0007113941537520846, '35_low'),
 (0.0006978306160016993, '96_low'),
 (0.0006839871919648976, '78_low'),
 (0.0006625941735492556, '67_low'),
 (0.0006299882094524013, '12_low'),
 (0.0006290787201151029, '55_low

In [14]:
import re


docstring = metadata.loc[metadata['ID'] == test_df.loc[doc_to_examine,'ID'], 'Message-Contents'].values[0]
important_words_70 = ['image','joe','deals','tracy','daily','rod','elizabeth','week','wordsmith','org','geaccone','parks','scott','hayslett','sager','email','deal','plan','good','office']
important_words_74 = ['letter','thank','need','feedback','pep','help','password','ca','access','isda','gas','eol','file','org','energy','like','rod','kevin','jpg','regards']
important_words_82 = ['today','tomorrow','credit','number','feedback','enronxgate','questions','price','pep','dan','contract','rod','comments','wordsmith','conference','time','november','ll','going','nymex']
important_words_3 = ['schedules','final','detected','variances','hourahead','hour','westdesk','parsing','awarded','txt','ancillary','iso','log','messages','scheduling','portland','start','file','california','schedule']
important_words_50 = ['gerald','hpl','nyiso','market','nemec','list','berkeley','teco','haas','tap','et','edu','mark','sellers','htm','meeting','power','prices','information','iferc']
for word in important_words_70:
    if word in docstring:
        print(word)
        docstring = docstring.replace(word, '***'+word+'***')
print('-----------')
print(docstring)

rod
week
plan
-----------
We are getting ready for the launch of "NetcoOnline" and have been preparing new documentation, a refreshed site design and cleaned p***rod***ucts to reflect our Gas and Power focus.

The following timetable could change, but the general sequence should apply in any case:

Week of Jan 14:

Upon approval by the new buyer of the PA and ETA formats, we will contact the Master Users at our Top 50 customers and send them copies of the final PA and ETA forms with our new buyer's name filled in. During this ***week***, we will be working hard to get as many of the PA's returned as possible. Once we have an approved PA, we can send the customer their new User ID, which will allow them to start trading on the day we launch "NetCoOnline".

A Splash page will be available within two business days of knowing our new company name, on a web page reached via an automatic redirect from www.enrononline.com. This page will tell customers about the pending launch of "NetCoOnline

In [None]:
metadata.loc[metadata['ID'] == '3.438120.GTSTTLNTZ2LVIWQFALPJFBLRPP2UC0G4', 'Message-Contents']