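# Training data
This notebook generates the training data for the predictive model. For every virus in `./data/flutype_test.csv`, synthetic samples are drawn per feature from a log-normal distribution whose mean and standard deviation match the measured values, and the result is written to `./data/flutype_train.csv`.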

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the measured reference data from a tab-separated file; the sampling
# further down relies on its `virus` column and treats the remaining
# columns as features.
dtest = pd.read_csv(
    './data/flutype_test.csv',
    delimiter='\t',
)
# Bare last expression: render the loaded table as the cell output.
dtest

Unnamed: 0,virus,P1,P3,P4,P5,P6,P7,P8,P10,P13
0,X31,1569,2185,3988,3104,161,394,788,4396,443
1,X31,1840,2203,5003,2975,148,613,726,4284,486
2,X31,2039,2269,5163,3067,126,689,692,3493,372
3,X31,1168,1578,4211,1948,172,568,258,3472,314
4,H1,510,892,2609,2874,484,849,1650,2791,1445
5,H1,524,971,3227,2727,573,1384,1527,2950,1482
6,H1,580,657,3020,2175,732,1951,1947,2907,1174
7,H1,504,1007,1469,1993,521,1598,1802,2952,1325
8,H5,702,1504,2476,2942,374,379,627,1279,1697
9,H5,1187,1974,2864,3707,521,903,590,1328,1736


In [3]:
# Seed NumPy's global random generator so the sampling is reproducible.
np.random.seed(seed=1234)


def _lognormal_params(mean, std):
    """Return ``(mu, sigma)`` for ``np.random.lognormal`` matching mean/std.

    ``np.random.lognormal`` is parameterised by the mean ``mu`` and standard
    deviation ``sigma`` of the *underlying normal* distribution, not by the
    moments of the log-normal samples themselves.  For a log-normal variable
    with mean ``m`` and standard deviation ``s`` the conversion is::

        sigma**2 = log(1 + s**2 / m**2)
        mu       = log(m / sqrt(1 + s**2 / m**2))

    Parameters
    ----------
    mean : float
        Desired mean of the samples; must be strictly positive.
    std : float
        Desired standard deviation of the samples.

    Returns
    -------
    tuple of float
        ``(mu, sigma)`` to pass to ``np.random.lognormal``.

    Raises
    ------
    ValueError
        If ``mean`` is zero, negative or NaN.  Without this check the
        logarithm / division below yields NaN or inf and the bad values
        would silently end up in the generated training data.
    """
    if not mean > 0:
        raise ValueError(
            "cannot match a log-normal distribution: mean must be > 0, "
            "got %r" % (mean,)
        )
    rel_var = std**2 / mean**2
    mu = np.log(mean / np.sqrt(1 + rel_var))
    sigma = np.sqrt(np.log(1 + rel_var))
    return mu, sigma


# Create sample data from the measured mean and standard deviation.
Nsample = 50  # number of synthetic samples drawn per virus
names = dtest.virus.unique().tolist()
# All columns after the first are treated as features; this relies on
# `virus` being the first column of dtest.
features = dtest.columns[1:]
dfs = []
for name in names:
    print(name)
    # measurements belonging to the current virus only
    dv = dtest[dtest.virus == name]
    # Empty frame with the same columns as the measurements; every column
    # is overwritten below.
    df = pd.DataFrame(index=range(Nsample), columns=dv.columns)
    df['virus'] = [name]*Nsample

    for feature in features:
        values = dv[feature].values
        # np.std uses ddof=0 (population standard deviation) by default.
        # NOTE(review): with only a few measurements per virus this
        # underestimates the spread -- confirm this is intended.
        m, std = np.mean(values), np.std(values)

        # Draw log-normal samples whose mean/std match the measurements and
        # round them to whole numbers like the measured values.
        mu, sigma = _lognormal_params(m, std)
        df[feature] = np.round(np.random.lognormal(mu, sigma, Nsample))

        # Compare the target moments with those of the drawn samples.
        print(m, std)
        print(np.mean(df[feature]), np.std(df[feature]))
        print('-'*80)
    dfs.append(df)


X31
(1654.0, 326.43605805731693)
(1675.1400000000001, 305.09087236428428)
--------------------------------------------------------------------------------
(2058.75, 279.31735982570075)
(2057.9200000000001, 273.16733626112767)
--------------------------------------------------------------------------------
(4591.25, 501.23266802952895)
(4647.6800000000003, 495.37563282826102)
--------------------------------------------------------------------------------
(2773.5, 478.91152627599183)
(2669.1799999999998, 436.71371354698715)
--------------------------------------------------------------------------------
(151.75, 17.122718826167766)
(156.40000000000001, 16.644518617250547)
--------------------------------------------------------------------------------
(566.0, 108.31204919121417)
(567.62, 100.56736846512391)
--------------------------------------------------------------------------------
(616.0, 209.53758612716717)
(636.65999999999997, 183.57686237649884)
--------------------------------

In [4]:
# Stack the per-virus sample frames into one training table.
# ignore_index=True gives every row a unique label; without it each
# per-virus frame keeps its own 0..Nsample-1 index, so labels repeat across
# viruses and label-based lookups such as dtrain.loc[0] return several rows.
dtrain = pd.concat(dfs, ignore_index=True)
# Write a tab-separated file; index=False leaves the row labels out of it.
dtrain.to_csv('./data/flutype_train.csv', sep='\t', index=False)

In [5]:
# Display the generated training data as the cell output.
dtrain

Unnamed: 0,virus,P1,P3,P4,P5,P6,P7,P8,P10,P13
0,X31,1779.0,2285.0,4711.0,2976.0,145.0,792.0,694.0,3709.0,384.0
1,X31,1286.0,1678.0,4854.0,3133.0,141.0,402.0,738.0,4583.0,343.0
2,X31,2147.0,1688.0,4821.0,2520.0,153.0,426.0,543.0,3737.0,422.0
3,X31,1526.0,2012.0,4708.0,2707.0,141.0,569.0,1396.0,3697.0,324.0
4,X31,1409.0,1894.0,4811.0,3449.0,170.0,556.0,328.0,3357.0,386.0
5,X31,1930.0,2001.0,5294.0,2381.0,138.0,437.0,565.0,3870.0,370.0
6,X31,1920.0,2140.0,4192.0,1899.0,142.0,474.0,936.0,4254.0,468.0
7,X31,1433.0,2030.0,4337.0,2581.0,152.0,617.0,710.0,4894.0,499.0
8,X31,1628.0,2202.0,5215.0,2348.0,187.0,505.0,617.0,4476.0,475.0
9,X31,1047.0,2514.0,3970.0,2894.0,165.0,479.0,327.0,4005.0,428.0
