## Get static program features of bowtie and cd-hit-dup

In [1]:
header='ft1 ft2 ft3 ft4 ft5 ft6 ft7 ft8 ft9 ft10 ft11 ft12 ft13 ft14 ft15 ft16 ft17 ft18 ft24 ft25 ft19 ft39 ft20 ft33 ft21 ft35 ft22 ft23 ft34 ft36 ft37 ft38 ft40 ft41 ft42 ft43 ft44 ft45 ft46 ft48 ft47 ft49 ft51 ft50 ft52 ft53 ft54 ft55 ft26 ft27 ft28 ft29 ft30 ft31 ft32'

In [11]:
import pandas as pd

def value(item):
    return item[item.find('=')+1:]

df = pd.read_table('cd-hit-dup_features.txt', header=None, delimiter=',',
                   converters={i:value for i in range(55)},
                   names=header.split())

bowtie_ft = pd.read_table('bowtie_features.txt', header=None, delimiter=',',
                   converters={i:value for i in range(55)},
                   names=header.split())

bowtie_ft = bowtie_ft.astype(float)
bowtie_ft

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,4267.0,1648.0,2153.0,15.0,2843.0,847.0,357.0,1291.0,1403.0,245.0,...,219.0,3626.0,18.0,79.11,210.87,595.0,529.0,2923.0,9.0,1115.0


In [12]:
df = df.astype(float)

In [13]:
df

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,341.0,106.0,94.0,0.0,168.0,68.0,11.0,94.0,46.0,9.0,...,1.0,160.0,0.0,19.89,51.2,55.0,20.0,172.0,5.0,70.0
1,415.0,154.0,164.0,1.0,242.0,75.0,34.0,126.0,107.0,26.0,...,5.0,214.0,0.0,11.95,42.52,84.0,11.0,256.0,14.0,81.0
2,352.0,142.0,172.0,0.0,213.0,103.0,18.0,97.0,108.0,40.0,...,3.0,267.0,1.0,9.27,19.65,61.0,51.0,222.0,2.0,110.0
3,1950.0,797.0,945.0,0.0,1218.0,518.0,86.0,588.0,580.0,179.0,...,65.0,1497.0,2.0,42.99,83.77,502.0,143.0,1177.0,7.0,638.0


In [14]:
rowsum = df.sum(axis=0)

In [15]:
data = [d for d in rowsum.values]
cols = [c for c in rowsum.index]

In [16]:
dup_ft = pd.DataFrame(columns=cols) #create empty dataframe

In [17]:
dup_ft.loc[0] = data #add one row

In [18]:
dup_ft

Unnamed: 0,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,ft9,ft10,...,ft53,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32
0,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,841.0,254.0,...,74.0,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0


## Bowtie: DF1 = Append to CPU, DataSize
## DUP:      DF2 = Append to CPU, DataSize
## Add totaltime column to both DF1, DF2
## DF1 U DF2
## Model Training to predict totaltime (57 features)

In [56]:
# read DUP ppn, size, Ytime from csv
dup_ppn_size_time = pd.read_csv('22July_dup_ppn_sizeGB_Ytime.csv')
del dup_ppn_size_time['Unnamed: 0']
dup_ppn_size_time.head(2)

Unnamed: 0,ppn,sizeGB,Y_time
0,11,1.2,45.82
1,11,1.7,67.97


In [57]:
rows = dup_ppn_size_time.shape[0]
rows

32

In [58]:
# create dataframe from DUP static program features with same #ROWs
frames = [dup_ft for i in range(rows)]
DUP_program_ft = pd.concat(frames)
DUP_program_ft.reset_index(inplace=True)
del DUP_program_ft['index']
DUP_program_ft.shape

(32, 55)

In [59]:
# read BOWTIE ppn, size, YTime from csv
bowtie_ppn_size_time = pd.read_csv('22July_BOWTIE_ppn_sizeGB_Ytime.csv')
del bowtie_ppn_size_time['Unnamed: 0']
bowtie_ppn_size_time.head(2)

Unnamed: 0,ppn,sizeGB,Y_time
0,11,0.56,841.52
1,11,0.56,828.67


In [60]:
rows = bowtie_ppn_size_time.shape[0]
rows

64

In [66]:
# create dataframe from BOWTIE static program features with same #ROWS
frames = [bowtie_ft for i in range(rows)]
BOWTIE_program_ft = pd.concat(frames)
BOWTIE_program_ft.reset_index(inplace=True)
del BOWTIE_program_ft['index']
BOWTIE_program_ft.shape

(64, 55)

In [78]:
DUP_ft_concat = pd.concat([dup_ppn_size_time, DUP_program_ft], axis=1, join_axes=[dup_ppn_size_time.index])
DUP_ft_concat['y_time']=DUP_ft_concat['Y_time']
del DUP_ft_concat['Y_time']
DUP_ft_concat.shape

(32, 58)

In [79]:
BOWTIE_ft_concat = pd.concat([bowtie_ppn_size_time, BOWTIE_program_ft], axis=1, 
                            join_axes=[bowtie_ppn_size_time.index])

BOWTIE_ft_concat['y_time']=BOWTIE_ft_concat['Y_time']
del BOWTIE_ft_concat['Y_time']
BOWTIE_ft_concat.shape

(64, 58)

In [83]:
# union DUP_ft_concat, BOWTIE_ft_concat
frames = [DUP_ft_concat, BOWTIE_ft_concat]
dup_bowtie = pd.concat(frames)
dup_bowtie.reset_index(inplace=True)
del dup_bowtie['index']
dup_bowtie.head(2)

Unnamed: 0,ppn,sizeGB,ft1,ft2,ft3,ft4,ft5,ft6,ft7,ft8,...,ft54,ft55,ft26,ft27,ft28,ft29,ft30,ft31,ft32,y_time
0,11,1.2,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,...,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0,45.82
1,11,1.7,3058.0,1199.0,1375.0,1.0,1841.0,764.0,149.0,905.0,...,2138.0,3.0,84.1,197.14,702.0,225.0,1827.0,28.0,899.0,67.97


In [89]:
dup_bowtie.columns[0:57]

Index(['ppn', 'sizeGB', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
       'ft9', 'ft10', 'ft11', 'ft12', 'ft13', 'ft14', 'ft15', 'ft16', 'ft17',
       'ft18', 'ft24', 'ft25', 'ft19', 'ft39', 'ft20', 'ft33', 'ft21', 'ft35',
       'ft22', 'ft23', 'ft34', 'ft36', 'ft37', 'ft38', 'ft40', 'ft41', 'ft42',
       'ft43', 'ft44', 'ft45', 'ft46', 'ft48', 'ft47', 'ft49', 'ft51', 'ft50',
       'ft52', 'ft53', 'ft54', 'ft55', 'ft26', 'ft27', 'ft28', 'ft29', 'ft30',
       'ft31', 'ft32'],
      dtype='object')

In [113]:
ft0 = dup_bowtie.copy(deep=True)

In [134]:
ft1=ft0.iloc[np.random.permutation(len(ft0))]
ft=ft1.reset_index(drop=True)
ft.shape

(96, 58)

### ft_all

In [136]:
ft_all = ft.copy(deep=True) #store a copy - do not change this later

In [137]:
features = ft.columns[0:57]
target = ft.columns[57]
features

Index(['ppn', 'sizeGB', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
       'ft9', 'ft10', 'ft11', 'ft12', 'ft13', 'ft14', 'ft15', 'ft16', 'ft17',
       'ft18', 'ft24', 'ft25', 'ft19', 'ft39', 'ft20', 'ft33', 'ft21', 'ft35',
       'ft22', 'ft23', 'ft34', 'ft36', 'ft37', 'ft38', 'ft40', 'ft41', 'ft42',
       'ft43', 'ft44', 'ft45', 'ft46', 'ft48', 'ft47', 'ft49', 'ft51', 'ft50',
       'ft52', 'ft53', 'ft54', 'ft55', 'ft26', 'ft27', 'ft28', 'ft29', 'ft30',
       'ft31', 'ft32'],
      dtype='object')

In [138]:
target

'y_time'

In [139]:
from sklearn.ensemble.forest import RandomForestRegressor
import numpy as  np

model = RandomForestRegressor(n_estimators=1500, n_jobs=-1)

np.random.seed()
ft['is_train']=np.random.uniform(0,1,len(ft)) <= .6
train, test = ft[ft['is_train']==True], ft[ft['is_train']==False]
del ft['is_train']
len(train)*1.0 / (len(test)+len(train))

0.6041666666666666

In [140]:
X_train = train[features]
Y_train = train[target]

X_test = test[features]
Y_test = test[target].as_matrix()

In [141]:
model.fit(X_train, Y_train.values)
Y_prediction = model.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(Y_test, Y_prediction))
print(rmse)

778.084800082


In [142]:
Y_test #Print Y values of test dataset

array([ 3362.83,   203.98,  2858.61,   435.69,  2942.53,  1697.59,
         861.08,  2116.14,  3356.98,    46.36,  2566.79,   855.2 ,
        3396.63,   426.53,  2872.17,  1665.97,  3428.28,   164.02,
        1301.71,   841.52,  1273.04,  2150.65,  1274.14,   120.67,
        6329.84,   203.5 ,   204.44,  2499.76,   142.07,  2537.84,
        6267.29,  1773.46,  2473.27,    21.58,    97.83,   427.76,
        3373.84,  3391.35])

In [143]:
Y_prediction #Print Y predicted values of test dataset

array([ 3181.35843983,   183.61883333,  2992.75749956,   440.1532519 ,
        2958.94299949,  1712.36939048,  1002.01039833,  2142.59211472,
        3237.99846089,    45.29972667,  2501.77166783,   925.27203667,
        3191.71467594,   436.3055119 ,  2992.75749956,  1676.52188944,
        3191.71467594,   179.72670667,  1214.48484194,  1058.34413989,
        1171.15230528,  2149.13736206,  1171.15230528,   122.95219333,
        2942.72059671,   179.93324   ,   180.50694667,  2498.35199363,
         140.57876667,  2501.77166783,  2942.72059671,  1417.652935  ,
        2477.7685773 ,    24.83914667,    96.69653333,   450.43184207,
        3223.76772083,  3181.35843983])

In [144]:
ft[target].mean(), ft[target].std() * 2

(1389.7422916666662, 2681.7670396685194)

### take out 10% for validation. minimize over train-test, finally validate over the 10%

In [149]:
from sklearn.cross_validation import train_test_split

ft_subset, validationSet = train_test_split(ft_all, test_size = 0.1)
ft = ft_subset.copy(deep=True)

ft.shape, validationSet.shape, ft_all.shape

((86, 58), (10, 58), (96, 58))

In [150]:
while(rmse > 1000):
    np.random.seed()
    ft['is_train']=np.random.uniform(0,1,len(ft)) <= .8
    train, test = ft[ft['is_train']==True], ft[ft['is_train']==False]
    del ft['is_train']
    len(train)*1.0 / (len(test)+len(train))
    features = ft.columns[0:2]
    target = ft.columns[2]
    X_train = train[features]
    Y_train = train[target]
    X_test = test[features]
    Y_test = test[target].as_matrix()
    model.fit(X_train, Y_train.values)
    Y_prediction = model.predict(X_test)
    rmse_testSet = np.sqrt(mean_squared_error(Y_test, Y_prediction))


In [151]:
rmse_testSet

577.21092546484215

In [152]:
Y_test #Print Y values of test dataset

array([ 3362.83,   203.98,  2858.61,   435.69,  2942.53,  1697.59,
         861.08,  2116.14,  3356.98,    46.36,  2566.79,   855.2 ,
        3396.63,   426.53,  2872.17,  1665.97,  3428.28,   164.02,
        1301.71,   841.52,  1273.04,  2150.65,  1274.14,   120.67,
        6329.84,   203.5 ,   204.44,  2499.76,   142.07,  2537.84,
        6267.29,  1773.46,  2473.27,    21.58,    97.83,   427.76,
        3373.84,  3391.35])

In [153]:
Y_prediction #Print Y predicted values of test dataset

array([ 3181.35843983,   183.61883333,  2992.75749956,   440.1532519 ,
        2958.94299949,  1712.36939048,  1002.01039833,  2142.59211472,
        3237.99846089,    45.29972667,  2501.77166783,   925.27203667,
        3191.71467594,   436.3055119 ,  2992.75749956,  1676.52188944,
        3191.71467594,   179.72670667,  1214.48484194,  1058.34413989,
        1171.15230528,  2149.13736206,  1171.15230528,   122.95219333,
        2942.72059671,   179.93324   ,   180.50694667,  2498.35199363,
         140.57876667,  2501.77166783,  2942.72059671,  1417.652935  ,
        2477.7685773 ,    24.83914667,    96.69653333,   450.43184207,
        3223.76772083,  3181.35843983])

In [154]:
import pickle as cPickle
# save the regression model
with open('dup_bowtie_rf.pkl', 'wb') as fid:
    cPickle.dump(model, fid)

In [156]:
#check accuracy on validation set
X_validation = validationSet[features]
Y_validation = validationSet[target].as_matrix()
Y_prediction = model.predict(X_validation)
rmse_validationSet = np.sqrt(mean_squared_error(Y_validation, Y_prediction))
rmse_validationSet

94.572848704301691

In [157]:
validationSet[target].mean(), validationSet[target].std() * 2

(1227.412, 2845.1174905292673)