## Get static program features of bowtie and cd-hit-dup

In [1]:
# Space-separated names for the 55 static program features. They are assigned
# positionally to the columns of the *_features.txt files when those are read
# (the order here is deliberately not numeric).
header='ft1 ft2 ft3 ft4 ft5 ft6 ft7 ft8 ft9 ft10 ft11 ft12 ft13 ft14 ft15 ft16 ft17 ft18 ft24 ft25 ft19 ft39 ft20 ft33 ft21 ft35 ft22 ft23 ft34 ft36 ft37 ft38 ft40 ft41 ft42 ft43 ft44 ft45 ft46 ft48 ft47 ft49 ft51 ft50 ft52 ft53 ft54 ft55 ft26 ft27 ft28 ft29 ft30 ft31 ft32'

In [2]:
import pandas as pd

def value(item):
    """Return the part of ``item`` that follows its first '='.

    If ``item`` contains no '=', it is returned unchanged.
    """
    _, separator, remainder = item.partition('=')
    return remainder if separator else item

def _read_features(path):
    """Read one headerless, comma-separated feature file.

    Every field is passed through ``value`` (which keeps the text after '='),
    so the returned frame holds strings; columns are named from ``header``.
    """
    names = header.split()
    return pd.read_table(path, header=None, delimiter=',',
                         converters={position: value for position in range(len(names))},
                         names=names)


# cd-hit-dup features stay as strings here; they are converted to float in a later cell.
df = _read_features('cd-hit-dup_features.txt')

# bowtie features are converted to float straight away.
bowtie_ft = _read_features('bowtie_features.txt').astype(float)
bowtie_ft

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,4267.0,1648.0,2153.0,15.0,2843.0,847.0,357.0,1291.0,1403.0,245.0,...,219.0,3626.0,18.0,79.11,210.87,595.0,529.0,2923.0,9.0,1115.0


In [3]:
# Convert the cd-hit-dup feature values (strings left by the value() converter) to floats.
df = df.astype(float)

In [4]:
df

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,341.0,106.0,94.0,0.0,168.0,68.0,11.0,94.0,46.0,9.0,...,1.0,160.0,0.0,19.89,51.2,55.0,20.0,172.0,5.0,70.0
1,415.0,154.0,164.0,1.0,242.0,75.0,34.0,126.0,107.0,26.0,...,5.0,214.0,0.0,11.95,42.52,84.0,11.0,256.0,14.0,81.0
2,352.0,142.0,172.0,0.0,213.0,103.0,18.0,97.0,108.0,40.0,...,3.0,267.0,1.0,9.27,19.65,61.0,51.0,222.0,2.0,110.0
3,1950.0,797.0,945.0,0.0,1218.0,518.0,86.0,588.0,580.0,179.0,...,65.0,1497.0,2.0,42.99,83.77,502.0,143.0,1177.0,7.0,638.0


In [5]:
# Add up each feature column over all rows of df: one total per feature,
# returned as a Series indexed by feature name.
rowsum = df.sum(axis=0)

In [6]:
# Split the per-feature totals into two parallel lists: the labels and the values.
cols = list(rowsum.index)
data = list(rowsum.values)

In [7]:
dup_ft = pd.DataFrame(columns=cols) # empty frame with one column per feature name; no rows yet

In [8]:
dup_ft.loc[0] = data # store the per-feature totals as the row labelled 0

In [9]:
dup_ft

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,841.0,254.0,...,74.0,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0


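## Plan: build the training set
1. Bowtie: DF1 = static program features appended to CPU (`ppn`) and data size (`sizeGB`)
2. DUP: DF2 = static program features appended to CPU (`ppn`) and data size (`sizeGB`)
3. Add the total-time column (`y_time`) to both DF1 and DF2
4. Take the union DF1 ∪ DF2
5. Train a model to predict the total time from the 57 features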

In [10]:
# Load the DUP runs (ppn, sizeGB, Y_time) from CSV and discard the unnamed
# leading column (presumably a row index written when the file was saved).
dup_ppn_size_time = (
    pd.read_csv('22July_dup_ppn_sizeGB_Ytime.csv')
    .drop('Unnamed: 0', axis=1)
)
dup_ppn_size_time.head(2)

Unnamed: 0,ppn,sizeGB,Y_time
0,11,1.2,45.82
1,11,1.7,67.97


In [11]:
# Number of DUP runs; the static features are repeated this many times below.
rows = len(dup_ppn_size_time)
rows

32

In [12]:
# Repeat the single-row DUP static-feature frame once per run, numbering the
# resulting rows 0..rows-1 so they line up with the run table.
frames = [dup_ft] * rows
DUP_program_ft = pd.concat(frames, ignore_index=True)
DUP_program_ft.shape

(32, 55)

In [13]:
# Load the BOWTIE runs (ppn, sizeGB, Y_time) from CSV and discard the unnamed
# leading column (presumably a row index written when the file was saved).
bowtie_ppn_size_time = (
    pd.read_csv('22July_BOWTIE_ppn_sizeGB_Ytime.csv')
    .drop('Unnamed: 0', axis=1)
)
bowtie_ppn_size_time.head(2)

Unnamed: 0,ppn,sizeGB,Y_time
0,11,0.56,841.52
1,11,0.56,828.67


In [14]:
# Number of BOWTIE runs; the static features are repeated this many times below.
rows = len(bowtie_ppn_size_time)
rows

64

In [15]:
# Repeat the single-row BOWTIE static-feature frame once per run, numbering the
# resulting rows 0..rows-1 so they line up with the run table.
frames = [bowtie_ft] * rows
BOWTIE_program_ft = pd.concat(frames, ignore_index=True)
BOWTIE_program_ft.shape

(64, 55)

In [16]:
# Put the run columns (ppn, sizeGB, Y_time) and the static program features
# side by side, keeping exactly the rows of the run table.
# pd.concat's `join_axes` argument was removed in pandas 1.0; reindexing the
# concatenated result on the run table's index is the documented replacement
# and also works on older pandas.
DUP_ft_concat = pd.concat([dup_ppn_size_time, DUP_program_ft], axis=1).reindex(dup_ppn_size_time.index)
# Re-add the target under the name 'y_time' so it becomes the last column,
# then drop the original 'Y_time'.
DUP_ft_concat['y_time']=DUP_ft_concat['Y_time']
del DUP_ft_concat['Y_time']
DUP_ft_concat.shape

(32, 58)

In [17]:
# Put the run columns (ppn, sizeGB, Y_time) and the static program features
# side by side, keeping exactly the rows of the run table.
# pd.concat's `join_axes` argument was removed in pandas 1.0; reindexing the
# concatenated result on the run table's index is the documented replacement
# and also works on older pandas.
BOWTIE_ft_concat = pd.concat([bowtie_ppn_size_time, BOWTIE_program_ft],
                             axis=1).reindex(bowtie_ppn_size_time.index)

# Re-add the target under the name 'y_time' so it becomes the last column,
# then drop the original 'Y_time'.
BOWTIE_ft_concat['y_time']=BOWTIE_ft_concat['Y_time']
del BOWTIE_ft_concat['Y_time']
BOWTIE_ft_concat.shape

(64, 58)

In [18]:
# Stack the DUP rows on top of the BOWTIE rows and renumber the index from 0.
frames = [DUP_ft_concat, BOWTIE_ft_concat]
dup_bowtie = pd.concat(frames, ignore_index=True)
dup_bowtie.head(2)

Unnamed: 0,ppn,sizeGB,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,...,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32,y_time
0,11,1.2,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,...,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0,45.82
1,11,1.7,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,...,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0,67.97


In [19]:
dup_bowtie.columns[0:57]

Index(['ppn', 'sizeGB', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
       'ft9', 'ft10', 'ft11', 'ft12', 'ft13', 'ft14', 'ft15', 'ft16', 'ft17',
       'ft18', 'ft24', 'ft25', 'ft19', 'ft39', 'ft20', 'ft33', 'ft21', 'ft35',
       'ft22', 'ft23', 'ft34', 'ft36', 'ft37', 'ft38', 'ft40', 'ft41', 'ft42',
       'ft43', 'ft44', 'ft45', 'ft46', 'ft48', 'ft47', 'ft49', 'ft51', 'ft50',
       'ft52', 'ft53', 'ft54', 'ft55', 'ft26', 'ft27', 'ft28', 'ft29', 'ft30',
       'ft31', 'ft32'],
      dtype='object')

In [20]:
# Take an independent (deep) copy so later steps do not modify dup_bowtie.
ft0 = dup_bowtie.copy(deep=True)

In [21]:
import numpy as np

# Shuffle the rows: draw a random ordering of the row positions, reorder by it,
# then renumber the index from 0. No seed is set, so each run gives a new order.
shuffled_positions = np.random.permutation(len(ft0))
ft1 = ft0.iloc[shuffled_positions]
ft = ft1.reset_index(drop=True)
ft.shape

(96, 58)

### ft_all

In [22]:
ft_all = ft.copy(deep=True) # deep copy of the shuffled data. NOTE: the cross-validation cells below reshuffle and reassign ft_all, so it does not stay fixed

In [23]:
# The first 57 columns are the model inputs; the column at position 57 is the target.
features, target = ft.columns[:57], ft.columns[57]
features

Index(['ppn', 'sizeGB', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
       'ft9', 'ft10', 'ft11', 'ft12', 'ft13', 'ft14', 'ft15', 'ft16', 'ft17',
       'ft18', 'ft24', 'ft25', 'ft19', 'ft39', 'ft20', 'ft33', 'ft21', 'ft35',
       'ft22', 'ft23', 'ft34', 'ft36', 'ft37', 'ft38', 'ft40', 'ft41', 'ft42',
       'ft43', 'ft44', 'ft45', 'ft46', 'ft48', 'ft47', 'ft49', 'ft51', 'ft50',
       'ft52', 'ft53', 'ft54', 'ft55', 'ft26', 'ft27', 'ft28', 'ft29', 'ft30',
       'ft31', 'ft32'],
      dtype='object')

In [24]:
target

'y_time'

In [31]:
# Public import paths: `sklearn.ensemble.forest` is a private module and
# `sklearn.cross_validation` was removed in scikit-learn 0.20.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Random forest scored by 5-fold cross-validation below. `oob_score=True` was
# dropped: this estimator is only handed to cross_val_score and its
# out-of-bag score was never read.
model = RandomForestRegressor(n_estimators=3000, n_jobs=-1)
# Start above any threshold so the loop body runs at least once (this replaces
# an arbitrary hard-coded array of fold RMSEs that served the same purpose).
rmse = np.array([np.inf])

i = 0
t = 150  # acceptance threshold on the mean fold RMSE; raised by 10 every 15 attempts

# NOTE(review): the loop repeats CV until one run scores below the threshold,
# so the final `rmse` is a best-of-many figure and is optimistically biased.
while rmse.mean() > t:
    # 'neg_mean_squared_error' yields the negated MSE per fold (the old name
    # 'mean_squared_error' was removed), hence the sign flip before the sqrt.
    scoresMSE = cross_val_score(model, ft_all[features], ft_all[target], cv=5,
                                scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scoresMSE)
    i = i + 1
    if i % 15 == 0:
        print(i, t, rmse, rmse.mean())
        t = t + 10

        # Reshuffle the rows so the next attempts see a different fold partition.
        # This reassigns ft_all.
        ft1 = ft_all.iloc[np.random.permutation(len(ft_all))]
        ft2 = ft1.reset_index(drop=True)
        ft_all = ft2.copy(deep=True)

15 150 [ 810.12038316   95.6302104    81.61226999  397.58895212  447.03812134] 366.397987401
30 160 [   54.46909249   307.76398729   281.50043095   249.0628519   1082.2154795 ] 395.002368427
45 170 [ 164.83091449  309.3276651   275.99198328  317.75342574  132.89381487] 240.159560696
60 180 [ 228.4219548   307.84691136  255.02689114   65.80560127  330.43677383] 237.507626482
75 190 [ 215.13791915  280.37359884  129.43599669  348.18326483  128.37720007] 220.301595915


In [36]:
rmse.mean(), ft_all[target].mean()

(193.86163685668643, 1389.7422916666667)

In [37]:
rmse

array([ 315.50889632,  160.4556462 ,   96.88628657,  360.62298207,
         35.83437313])

In [38]:
# Public import paths: `sklearn.ensemble.forest` is a private module and
# `sklearn.cross_validation` was removed in scikit-learn 0.20.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Same experiment as the previous training cell, but the rows are reshuffled
# (and the threshold relaxed) only every 50 attempts instead of every 15.
# `oob_score=True` was dropped: this estimator is only handed to
# cross_val_score and its out-of-bag score was never read.
model2 = RandomForestRegressor(n_estimators=3000, n_jobs=-1)
# Start above any threshold so the loop body runs at least once (this replaces
# an arbitrary hard-coded array of fold RMSEs that served the same purpose).
rmse2 = np.array([np.inf])

i = 0
t = 150  # acceptance threshold on the mean fold RMSE; raised by 10 every 50 attempts

# NOTE(review): the loop repeats CV until one run scores below the threshold,
# so the final `rmse2` is a best-of-many figure and is optimistically biased.
while rmse2.mean() > t:
    # 'neg_mean_squared_error' yields the negated MSE per fold (the old name
    # 'mean_squared_error' was removed), hence the sign flip before the sqrt.
    scoresMSE2 = cross_val_score(model2, ft_all[features], ft_all[target], cv=5,
                                 scoring='neg_mean_squared_error')
    rmse2 = np.sqrt(-scoresMSE2)
    i = i + 1
    if i % 50 == 0:
        print(i, t, rmse2, rmse2.mean())
        t = t + 10

        # Reshuffle the rows so the next attempts see a different fold partition.
        # This reassigns ft_all.
        ft1 = ft_all.iloc[np.random.permutation(len(ft_all))]
        ft2 = ft1.reset_index(drop=True)
        ft_all = ft2.copy(deep=True)

50 150 [ 325.33207748  160.25886202   99.00764685  370.37591484   35.22679165] 198.040258568
100 160 [ 801.81354557   99.26344748  280.67636937  409.8515728    94.24411086] 337.169809218
150 170 [  169.31288194   330.69467237    74.14084072  1096.24676066    27.61107244] 339.601245625
200 180 [ 1061.60601848   234.77644589   180.8074913    251.60246642    22.80487831] 350.319460079
250 190 [ 1051.98614453   297.24912943   319.08799148   130.78198478   235.29204788] 406.879459619
300 200 [ 1055.18230441   231.62893436   211.28613647   112.11429128   217.09493214] 365.461319732
350 210 [ 152.64680223  241.4765278   309.61115097  425.90451172  132.65750554] 252.459299653
400 220 [ 170.99272275  256.76410983  303.93104758   84.2064806   347.90211162] 232.759294476
450 230 [ 366.35091439  312.58961125  135.82212934  296.53342151  269.96157078] 276.251529454
500 240 [ 345.48064336  570.0648403   279.7545394    82.95986906  256.17051602] 306.886081629


In [39]:
rmse2

array([ 295.0124828 ,  120.50976346,   93.13358771,  259.61305756,
        350.73711393])

In [40]:
rmse2.mean()

223.80120109331148

In [31]:
# Public import paths: `sklearn.ensemble.forest` is a private module and
# `sklearn.cross_validation` was removed in scikit-learn 0.20.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
from numpy.random import RandomState

# Start above any threshold so the loop body runs at least once (this replaces
# an arbitrary hard-coded array of fold RMSEs that served the same purpose).
rmse3 = np.array([np.inf])

i = 0
t = 150  # acceptance threshold on the mean fold RMSE; raised by 10 every 15 attempts

# Variant of the training loop that reshuffles the rows and builds a freshly
# seeded forest on every attempt.
# NOTE(review): the loop repeats CV until one run scores below the threshold,
# so the final `rmse3` is a best-of-many figure and is optimistically biased.
while rmse3.mean() > t:
    np.random.seed()  # re-seed the global generator with no fixed seed

    # `oob_score=True` was dropped: this estimator is only handed to
    # cross_val_score and its out-of-bag score was never read.
    model = RandomForestRegressor(n_estimators=6000, n_jobs=-1, random_state=RandomState())

    # Shuffle the rows before every CV run. This reassigns ft_all.
    ft1 = ft_all.iloc[np.random.permutation(len(ft_all))]
    ft2 = ft1.reset_index(drop=True)
    ft_all = ft2.copy(deep=True)

    # 'neg_mean_squared_error' yields the negated MSE per fold (the old name
    # 'mean_squared_error' was removed), hence the sign flip before the sqrt.
    scoresMSE3 = cross_val_score(model, ft_all[features], ft_all[target], cv=5,
                                 scoring='neg_mean_squared_error')
    rmse3 = np.sqrt(-scoresMSE3)
    i = i + 1
    if i % 15 == 0:
        print(i, t, rmse3, rmse3.mean())
        t = t + 10

15 150 [  473.88530326    81.76295899    19.59024196  1085.66686014   240.59800096] 380.300673061
30 160 [  133.44418997    71.38830606  1084.38285289    75.45528827   600.74013942] 393.082155322
45 170 [ 335.03866809   87.79320712  156.15266749  328.17757898  220.10973952] 225.45437224
60 180 [ 292.90910666   70.48572501  291.78380781  392.51820657  257.63995896] 261.067361002
75 190 [  117.8783393   1084.70348882   104.91099823   145.83886909   116.98385864] 314.063110819
90 200 [ 360.77605274  231.73246086  292.51379501  537.64898214   92.78454555] 303.091167257


In [32]:
rmse3, rmse3.mean()

(array([ 343.06205591,   46.80086024,  337.6488976 ,  142.9993495 ,
          79.75583334]), 190.0533993186931)

In [33]:
ft_all[target].mean()

1389.7422916666667