## Get static program features of bowtie and cd-hit-dup

In [1]:
header='ft1 ft2 ft3 ft4 ft5 ft6 ft7 ft8 ft9 ft10 ft11 ft12 ft13 ft14 ft15 ft16 ft17 ft18 ft24 ft25 ft19 ft39 ft20 ft33 ft21 ft35 ft22 ft23 ft34 ft36 ft37 ft38 ft40 ft41 ft42 ft43 ft44 ft45 ft46 ft48 ft47 ft49 ft51 ft50 ft52 ft53 ft54 ft55 ft26 ft27 ft28 ft29 ft30 ft31 ft32'

In [2]:
import pandas as pd

def value(item):
    return item[item.find('=')+1:]

df = pd.read_table('cd-hit-dup_features.txt', header=None, delimiter=',',
                   converters={i:value for i in range(55)},
                   names=header.split())

bowtie_ft = pd.read_table('bowtie_features.txt', header=None, delimiter=',',
                   converters={i:value for i in range(55)},
                   names=header.split())

bowtie_ft = bowtie_ft.astype(float)
bowtie_ft

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,4267.0,1648.0,2153.0,15.0,2843.0,847.0,357.0,1291.0,1403.0,245.0,...,219.0,3626.0,18.0,79.11,210.87,595.0,529.0,2923.0,9.0,1115.0


In [3]:
df = df.astype(float)

In [4]:
df

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,341.0,106.0,94.0,0.0,168.0,68.0,11.0,94.0,46.0,9.0,...,1.0,160.0,0.0,19.89,51.2,55.0,20.0,172.0,5.0,70.0
1,415.0,154.0,164.0,1.0,242.0,75.0,34.0,126.0,107.0,26.0,...,5.0,214.0,0.0,11.95,42.52,84.0,11.0,256.0,14.0,81.0
2,352.0,142.0,172.0,0.0,213.0,103.0,18.0,97.0,108.0,40.0,...,3.0,267.0,1.0,9.27,19.65,61.0,51.0,222.0,2.0,110.0
3,1950.0,797.0,945.0,0.0,1218.0,518.0,86.0,588.0,580.0,179.0,...,65.0,1497.0,2.0,42.99,83.77,502.0,143.0,1177.0,7.0,638.0


In [5]:
rowsum = df.sum(axis=0)

In [6]:
data = [d for d in rowsum.values]
cols = [c for c in rowsum.index]

In [7]:
dup_ft = pd.DataFrame(columns=cols) #create empty dataframe

In [8]:
dup_ft.loc[0] = data #add one row

In [9]:
dup_ft

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,841.0,254.0,...,74.0,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0


## Bowtie: DF1 = Append to CPU, DataSize
## DUP:      DF2 = Append to CPU, DataSize
## Add totaltime column to both DF1, DF2
## DF1 U DF2
## Model Training to predict totaltime (57 features)

In [10]:
# read DUP ppn, size, Ytime from csv
dup_ppn_size_time = pd.read_csv('22July_dup_ppn_sizeGB_Ytime.csv')
del dup_ppn_size_time['Unnamed: 0']
dup_ppn_size_time.head(2)

Unnamed: 0,ppn,sizeGB,Y_time
0,11,1.2,45.82
1,11,1.7,67.97


In [11]:
rows = dup_ppn_size_time.shape[0]
rows

32

In [12]:
# create dataframe from DUP static program features with same #ROWs
frames = [dup_ft for i in range(rows)]
DUP_program_ft = pd.concat(frames)
DUP_program_ft.reset_index(inplace=True)
del DUP_program_ft['index']
DUP_program_ft.shape

(32, 55)

In [13]:
# read BOWTIE ppn, size, YTime from csv
bowtie_ppn_size_time = pd.read_csv('22July_BOWTIE_ppn_sizeGB_Ytime.csv')
del bowtie_ppn_size_time['Unnamed: 0']
bowtie_ppn_size_time.head(2)

Unnamed: 0,ppn,sizeGB,Y_time
0,11,0.56,841.52
1,11,0.56,828.67


In [14]:
rows = bowtie_ppn_size_time.shape[0]
rows

64

In [15]:
# create dataframe from BOWTIE static program features with same #ROWS
frames = [bowtie_ft for i in range(rows)]
BOWTIE_program_ft = pd.concat(frames)
BOWTIE_program_ft.reset_index(inplace=True)
del BOWTIE_program_ft['index']
BOWTIE_program_ft.shape

(64, 55)

In [16]:
DUP_ft_concat = pd.concat([dup_ppn_size_time, DUP_program_ft], axis=1, join_axes=[dup_ppn_size_time.index])
DUP_ft_concat['y_time']=DUP_ft_concat['Y_time']
del DUP_ft_concat['Y_time']
DUP_ft_concat.shape

(32, 58)

In [17]:
BOWTIE_ft_concat = pd.concat([bowtie_ppn_size_time, BOWTIE_program_ft], axis=1, 
                            join_axes=[bowtie_ppn_size_time.index])

BOWTIE_ft_concat['y_time']=BOWTIE_ft_concat['Y_time']
del BOWTIE_ft_concat['Y_time']
BOWTIE_ft_concat.shape

(64, 58)

In [18]:
# union DUP_ft_concat, BOWTIE_ft_concat
frames = [DUP_ft_concat, BOWTIE_ft_concat]
dup_bowtie = pd.concat(frames)
dup_bowtie.reset_index(inplace=True)
del dup_bowtie['index']
dup_bowtie.head(2)

Unnamed: 0,ppn,sizeGB,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,...,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32,y_time
0,11,1.2,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,...,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0,45.82
1,11,1.7,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,...,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0,67.97


In [19]:
dup_bowtie.columns[0:57]

Index(['ppn', 'sizeGB', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
       'ft9', 'ft10', 'ft11', 'ft12', 'ft13', 'ft14', 'ft15', 'ft16', 'ft17',
       'ft18', 'ft24', 'ft25', 'ft19', 'ft39', 'ft20', 'ft33', 'ft21', 'ft35',
       'ft22', 'ft23', 'ft34', 'ft36', 'ft37', 'ft38', 'ft40', 'ft41', 'ft42',
       'ft43', 'ft44', 'ft45', 'ft46', 'ft48', 'ft47', 'ft49', 'ft51', 'ft50',
       'ft52', 'ft53', 'ft54', 'ft55', 'ft26', 'ft27', 'ft28', 'ft29', 'ft30',
       'ft31', 'ft32'],
      dtype='object')

In [20]:
ft0 = dup_bowtie.copy(deep=True)

In [21]:
import numpy as np
ft1=ft0.iloc[np.random.permutation(len(ft0))]
ft=ft1.reset_index(drop=True)
ft.shape

(96, 58)

### ft_all

In [22]:
ft_all = ft.copy(deep=True) #store a copy - do not change this later

In [23]:
features = ft.columns[0:57]
target = ft.columns[57]
features

Index(['ppn', 'sizeGB', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
       'ft9', 'ft10', 'ft11', 'ft12', 'ft13', 'ft14', 'ft15', 'ft16', 'ft17',
       'ft18', 'ft24', 'ft25', 'ft19', 'ft39', 'ft20', 'ft33', 'ft21', 'ft35',
       'ft22', 'ft23', 'ft34', 'ft36', 'ft37', 'ft38', 'ft40', 'ft41', 'ft42',
       'ft43', 'ft44', 'ft45', 'ft46', 'ft48', 'ft47', 'ft49', 'ft51', 'ft50',
       'ft52', 'ft53', 'ft54', 'ft55', 'ft26', 'ft27', 'ft28', 'ft29', 'ft30',
       'ft31', 'ft32'],
      dtype='object')

In [24]:
target

'y_time'

In [25]:
from sklearn.ensemble.forest import RandomForestRegressor
import numpy as  np
from sklearn import cross_validation

model = RandomForestRegressor(n_estimators=1500, n_jobs=-1)
rmse = np.array([  297.59290466,  1367.76369731,   164.79490009,    33.22493526, 389.47117031])

i=0
while (rmse.mean() > 175):
    scoresMSE = cross_validation.cross_val_score(model, ft_all[features], ft_all[target], cv=5,scoring='mean_squared_error')
    rmse = np.sqrt(-scoresMSE)
    i=i+1
    if (i%100==0):
        print i

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-25-8c25636fed93>, line 14)

In [30]:
rmse.mean()


244.88048933110252

In [32]:
rmse

array([  29.68628167,  261.51468058,  308.89622362,  286.50767671,
        337.79758408])

In [36]:
from sklearn.ensemble.forest import RandomForestRegressor
import numpy as  np
from sklearn import cross_validation

model = RandomForestRegressor(n_estimators=3000, n_jobs=-1)
rmse = np.array([  297.59290466,  1367.76369731,   164.79490009,    33.22493526, 389.47117031])

i=0
t = 150

while (rmse.mean() > t):
    scoresMSE = cross_validation.cross_val_score(model, ft_all[features], ft_all[target], cv=5,scoring='mean_squared_error')
    rmse = np.sqrt(-scoresMSE)
    i=i+1
    if (i%4==0):
        print(i, t)
        t = t + 15

4 150
8 165
12 180
16 195
20 210
24 225
28 240


In [37]:
rmse.mean()

243.58065595557645

In [38]:
rmse

array([  27.76887049,  262.02043021,  294.08722985,  299.18960044,
        334.83714878])

In [41]:
from sklearn.ensemble.forest import RandomForestRegressor
import numpy as  np
from sklearn import cross_validation

model = RandomForestRegressor(n_estimators=3000, n_jobs=-1)
rmse = np.array([  297.59290466,  1367.76369731,   164.79490009,    33.22493526, 389.47117031])

i=0
t = 150

while (rmse.mean() > t):
    #shuffle
    ft1  =ft_all.iloc[np.random.permutation(len(ft_all))]
    ft2  =ft1.reset_index(drop=True)
    ft_all = ft2.copy(deep=True)
    
    scoresMSE = cross_validation.cross_val_score(model, ft_all[features], ft_all[target], cv=5,scoring='mean_squared_error')
    rmse = np.sqrt(-scoresMSE)
    i=i+1
    
    if (i%10==0):
        print(i, t, rmse)
        t = t + 10

10 150 [ 212.12756407  268.13847117  313.45873095   77.26900547  299.58019251]
20 160 [ 711.07764465  264.29868959  171.42333532  143.92965824   54.75061631]
30 170 [ 234.10047799  610.35237841  309.40218011   50.03503771  280.60003581]
40 180 [ 315.76773624  119.68425449  161.46130039  327.49612894  225.39668151]
50 190 [  225.85806273   435.43196329  1088.25845117    67.84286167   228.49646437]
60 200 [  253.78530296   125.7434684    608.02822981   222.11689018  1082.20028704]


Exception in thread Thread-117804:
Traceback (most recent call last):
  File "/data/home/a1singh/anaconda3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/data/home/a1singh/anaconda3/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/data/home/a1singh/anaconda3/lib/python3.5/multiprocessing/pool.py", line 445, in _handle_results
    cache[job]._set(i, obj)
  File "/data/home/a1singh/anaconda3/lib/python3.5/multiprocessing/pool.py", line 613, in _set
    self._callback(self._value)
  File "/data/home/a1singh/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 224, in __call__
    self.parallel.dispatch_next()
  File "/data/home/a1singh/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 592, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/data/home/a1singh/anaconda3/lib/python3.5/site-packages/sklearn/exte

KeyboardInterrupt: 

In [44]:
from sklearn.ensemble.forest import RandomForestRegressor
import numpy as  np
from sklearn import cross_validation

model = RandomForestRegressor(n_estimators=3000, n_jobs=-1, oob_score=True)
rmse = np.array([  297.59290466,  1367.76369731,   164.79490009,    33.22493526, 389.47117031])

i=0
t = 150

while (rmse.mean() > t):
    scoresMSE = cross_validation.cross_val_score(model, ft_all[features], ft_all[target], cv=5,scoring='mean_squared_error')
    rmse = np.sqrt(-scoresMSE)
    i=i+1
    
    if (i%100==0):
        print(i, t, rmse, rmse.mean())
        t = t + 10
        
        #shuffle
        ft1  =ft_all.iloc[np.random.permutation(len(ft_all))]
        ft2  =ft1.reset_index(drop=True)
        ft_all = ft2.copy(deep=True)

100 150 [ 312.79991495  146.10953306  268.40379207  218.33896932  327.9671775 ] 254.72387738
200 160 [ 284.87499507  277.49094813  316.84825633  186.06220894  212.60389078] 255.57605985
300 170 [  74.94879852   98.295299     80.89735038  321.07505205  584.6771049 ] 231.97872097
400 180 [  163.18525782    97.53691885   249.18454574  1096.94546058   221.17635339] 365.605707273
500 190 [ 194.92131381  228.54892607  374.5793043   347.99384698   78.2206241 ] 244.852803054


KeyboardInterrupt: 

In [None]:
from sklearn.ensemble.forest import RandomForestRegressor
import numpy as  np
from sklearn import cross_validation
from numpy.random import RandomState

rmse3 = np.array([  297.59290466,  1367.76369731,   164.79490009,    33.22493526, 389.47117031])

i = 0
t = 150

while (rmse3.mean() > t):
    np.random.seed() #randomly initialize
    model = RandomForestRegressor(n_estimators=6000, n_jobs=-1, oob_score=True, random_state=RandomState())
    
    #shuffle
    ft1  =ft_all.iloc[np.random.permutation(len(ft_all))]
    ft2  =ft1.reset_index(drop=True)
    ft_all = ft2.copy(deep=True)
    
    scoresMSE3 = cross_validation.cross_val_score(model, ft_all[features], ft_all[target], cv=5,scoring='mean_squared_error')
    rmse3 = np.sqrt(-scoresMSE3)
    i=i+1
    if (i%100==0):
        print(i, t, rmse3, rmse3.mean())
        t = t + 10

In [None]:
rmse3, rmse3.mean()

In [None]:
ft_all[target].mean()