In [1]:
import pandas as pd
import numpy as np


In [2]:
from preprocessing import df
from empirical_formulas import apply_formulas


In [3]:
from IPython import display

In [4]:
df.head()

Unnamed: 0,sex,age,Cys,Scr,rGFR,FAScr,FAScys,FAScrcys,EPIcr,EPIcys,EPIcrcys
0,1,38,3.35,8.69,83.7,11.4,26.9,16.0,7.1,17.1,10.0
1,1,38,3.35,8.69,63.0,11.4,26.9,16.0,7.1,17.1,10.0
2,1,59,8.89,6.72,11.7,11.4,7.9,9.3,8.4,4.3,5.0
3,2,43,11.48,6.55,9.9,11.1,7.4,8.9,7.1,3.0,4.0
4,1,32,7.38,6.4,10.5,16.6,13.1,14.7,10.7,6.1,7.0


In [5]:
df.corr()

Unnamed: 0,sex,age,Cys,Scr,rGFR,FAScr,FAScys,FAScrcys,EPIcr,EPIcys,EPIcrcys
sex,1.0,-0.007743,-0.095576,-0.060239,0.091701,-0.083213,0.261361,0.081258,-0.100085,0.200084,0.074212
age,-0.007743,1.0,0.155757,0.30877,-0.440609,-0.680546,-0.536314,-0.652213,-0.600863,-0.418263,-0.517674
Cys,-0.095576,0.155757,1.0,0.854291,-0.656045,-0.540841,-0.617365,-0.626052,-0.599314,-0.717181,-0.691067
Scr,-0.060239,0.30877,0.854291,1.0,-0.729777,-0.725042,-0.671348,-0.75079,-0.786709,-0.768627,-0.808486
rGFR,0.091701,-0.440609,-0.656045,-0.729777,1.0,0.740067,0.773793,0.812752,0.774939,0.822765,0.844125
FAScr,-0.083213,-0.680546,-0.540841,-0.725042,0.740067,1.0,0.759255,0.938154,0.978269,0.752017,0.896489
FAScys,0.261361,-0.536314,-0.617365,-0.671348,0.773793,0.759255,1.0,0.927412,0.757601,0.940487,0.915956
FAScrcys,0.081258,-0.652213,-0.626052,-0.75079,0.812752,0.938154,0.927412,1.0,0.931711,0.903734,0.972063
EPIcr,-0.100085,-0.600863,-0.599314,-0.786709,0.774939,0.978269,0.757601,0.931711,1.0,0.791709,0.929671
EPIcys,0.200084,-0.418263,-0.717181,-0.768627,0.822765,0.752017,0.940487,0.903734,0.791709,1.0,0.957124


In [6]:
from sklearn.svm import SVR


这里为了和经验公式公平对决，使用所有样本拟合的得出的模型，其超参数在之前的测试时已保证其不会过拟合，它们的测试集分数甚至比训练集高或者至多低一个百分点。

In [7]:
class ModelWithTransform:
    
    def __init__(self, base_model, *, scale_method, sex_method, normalize, normalize_y):
        self.base_model = base_model
        
        assert sex_method != 'sex2'
        
        self.scale_method = scale_method
        self.sex_method = sex_method
        self.normalize = normalize
        self.normalize_y = normalize_y
        
    def encode_X(self, df):
        if self.scale_method == 'linear_scale':
            #y = df['rGFR']
            if self.sex_method == 'dummy':
                X = df[['age', 'Scr', 'Cys', 'sex']]
            else:
                X = df[['age', 'Scr', 'Cys']]

        else:
            #y = np.log(df['rGFR'])
            if self.sex_method == 'dummy':
                X = np.stack([df['age'], np.log(df['Scr']), np.log(df['Cys']), df['sex']], 1)
            else:
                X = np.stack([df['age'], np.log(df['Scr']), np.log(df['Cys'])], 1)

        if self.normalize:
            mean = X.mean(axis=0)
            std = X.std(axis=0)
            X = (X - mean)/std
        if self.normalize_y:
            raise NotImplementedError
        
        return X
    
    def encode_y(self, df):
        if self.scale_method == 'linear_scale':
            y = df['rGFR']
        else:
            y = np.log(df['rGFR'])
        
        return y
    
    def decode_y(self, y):
        if self.scale_method == 'log_scale':
            return np.exp(y)
        return y
    
    def fit(self, df):
        X = self.encode_X(df)
        y = self.encode_y(df)
        
        self.base_model.fit(X, y)
        
    def predict(self, df):
        X = self.encode_X(df)
        y_raw = self.base_model.predict(X)
        y = self.decode_y(y_raw)
        return y

In [8]:
model_t = ModelWithTransform(SVR(kernel='rbf', C=100.0, gamma=0.1, epsilon=0.05), 
                    scale_method = 'log_scale',
                    sex_method = 'dummy',
                    normalize = True,
                    normalize_y = False)

In [9]:
model_t.fit(df)

In [10]:
model_t.predict(df)

array([ 66.23581572,  66.23581572,   8.82708882, ..., 123.49917747,
       120.52058624,  74.58182062])

In [11]:
df['SVR'] = model_t.predict(df)

In [12]:
df.corr()

Unnamed: 0,sex,age,Cys,Scr,rGFR,FAScr,FAScys,FAScrcys,EPIcr,EPIcys,EPIcrcys,SVR
sex,1.0,-0.007743,-0.095576,-0.060239,0.091701,-0.083213,0.261361,0.081258,-0.100085,0.200084,0.074212,0.093741
age,-0.007743,1.0,0.155757,0.30877,-0.440609,-0.680546,-0.536314,-0.652213,-0.600863,-0.418263,-0.517674,-0.504135
Cys,-0.095576,0.155757,1.0,0.854291,-0.656045,-0.540841,-0.617365,-0.626052,-0.599314,-0.717181,-0.691067,-0.753731
Scr,-0.060239,0.30877,0.854291,1.0,-0.729777,-0.725042,-0.671348,-0.75079,-0.786709,-0.768627,-0.808486,-0.844822
rGFR,0.091701,-0.440609,-0.656045,-0.729777,1.0,0.740067,0.773793,0.812752,0.774939,0.822765,0.844125,0.865759
FAScr,-0.083213,-0.680546,-0.540841,-0.725042,0.740067,1.0,0.759255,0.938154,0.978269,0.752017,0.896489,0.860934
FAScys,0.261361,-0.536314,-0.617365,-0.671348,0.773793,0.759255,1.0,0.927412,0.757601,0.940487,0.915956,0.892676
FAScrcys,0.081258,-0.652213,-0.626052,-0.75079,0.812752,0.938154,0.927412,1.0,0.931711,0.903734,0.972063,0.941364
EPIcr,-0.100085,-0.600863,-0.599314,-0.786709,0.774939,0.978269,0.757601,0.931711,1.0,0.791709,0.929671,0.901047
EPIcys,0.200084,-0.418263,-0.717181,-0.768627,0.822765,0.752017,0.940487,0.903734,0.791709,1.0,0.957124,0.948592


In [13]:
apply_formulas(df)
df.corr()

Unnamed: 0,sex,age,Cys,Scr,rGFR,FAScr,FAScys,FAScrcys,EPIcr,EPIcys,EPIcrcys,SVR,CKD_EPI_Cr,CKD_EPI_Cys,CKD_EPI_Cr_Cys,FAS_cr,FAS_Cys,FAS_Cr_Cys
sex,1.0,-0.007743,-0.095576,-0.060239,0.091701,-0.083213,0.261361,0.081258,-0.100085,0.200084,0.074212,0.093741,-0.081871,0.200089,0.074111,-0.090783,0.274252,0.083872
age,-0.007743,1.0,0.155757,0.30877,-0.440609,-0.680546,-0.536314,-0.652213,-0.600863,-0.418263,-0.517674,-0.504135,-0.602757,-0.418232,-0.517767,-0.623763,-0.476405,-0.593327
Cys,-0.095576,0.155757,1.0,0.854291,-0.656045,-0.540841,-0.617365,-0.626052,-0.599314,-0.717181,-0.691067,-0.753731,-0.601871,-0.717176,-0.691028,-0.566568,-0.646483,-0.658845
Scr,-0.060239,0.30877,0.854291,1.0,-0.729777,-0.725042,-0.671348,-0.75079,-0.786709,-0.768627,-0.808486,-0.844822,-0.789165,-0.768638,-0.808378,-0.755141,-0.696687,-0.784444
rGFR,0.091701,-0.440609,-0.656045,-0.729777,1.0,0.740067,0.773793,0.812752,0.774939,0.822765,0.844125,0.865759,0.778504,0.822771,0.844254,0.753606,0.78698,0.831135
FAScr,-0.083213,-0.680546,-0.540841,-0.725042,0.740067,1.0,0.759255,0.938154,0.978269,0.752017,0.896489,0.860934,0.978391,0.752014,0.896506,0.987302,0.73551,0.922982
FAScys,0.261361,-0.536314,-0.617365,-0.671348,0.773793,0.759255,1.0,0.927412,0.757601,0.940487,0.915956,0.892676,0.765593,0.940484,0.915898,0.741709,0.988484,0.916626
FAScrcys,0.081258,-0.652213,-0.626052,-0.75079,0.812752,0.938154,0.927412,1.0,0.931711,0.903734,0.972063,0.941364,0.935844,0.903728,0.972045,0.922504,0.909042,0.987055
EPIcr,-0.100085,-0.600863,-0.599314,-0.786709,0.774939,0.978269,0.757601,0.931711,1.0,0.791709,0.929671,0.901047,0.999743,0.791709,0.929721,0.982411,0.749873,0.934564
EPIcys,0.200084,-0.418263,-0.717181,-0.768627,0.822765,0.752017,0.940487,0.903734,0.791709,1.0,0.957124,0.948592,0.797619,1.0,0.957136,0.762055,0.957963,0.922992


In [14]:
def desc(df, key_obs, key_pred):
    rd = {}
    for key,_df in zip(["All subjects", "Age<60 y", "Age>=60y"],[df, df[df['age']<60], df[df['age']>=60]]):
        y_pred = _df[key_pred]
        rd[key] = '{}({})'.format(y_pred.mean(), y_pred.std())
    return rd

In [15]:
def batch_apply(df, f):
    key_obs = 'rGFR'
    record = []
    keys = ['CKD_EPI_Cys', 'CKD_EPI_Cr_Cys', 'FAS_Cys', 'FAS_Cr_Cys', 'SVR']
    for key_pred in keys:
        r = f(df, key_obs, key_pred)
        record.append(r)
    rdf = pd.DataFrame(record)
    rdf.index = keys
    return rdf

In [16]:
batch_apply(df, desc)

Unnamed: 0,All subjects,Age<60 y,Age>=60y
CKD_EPI_Cys,69.3962085948395(39.1517378356909),80.90386106107096(39.14754940746409),51.993331519253275(32.12753816195782)
CKD_EPI_Cr_Cys,64.27641958959802(35.48006790892703),77.0652300118479(34.95017038821175),44.936063563511276(26.36529880065677)
FAS_Cys,71.88361925596095(42.466317375049705),86.3526842984172(44.56617013166695),50.00223544884381(27.080307487568675)
FAS_Cr_Cys,63.847679107672455(33.528004610544365),78.02836674241843(32.74794994389674),42.40240480181842(20.953145653776453)
SVR,65.30995460395278(25.639187542415552),74.75305834407989(24.30807419863831),51.0292684185053(20.488751506424602)


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>sex</th>
      <th>age</th>
      <th>Cys</th>
      <th>Scr</th>
      <th>rGFR</th>
      <th>FAScr</th>
      <th>FAScys</th>
      <th>FAScrcys</th>
      <th>EPIcr</th>
      <th>EPIcys</th>
      <th>EPIcrcys</th>
      <th>SVR</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>38</td>
      <td>3.35</td>
      <td>8.69</td>
      <td>83.7</td>
      <td>11.4</td>
      <td>26.9</td>
      <td>16.0</td>
      <td>7.1</td>
      <td>17.1</td>
      <td>10.0</td>
      <td>66.235816</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>38</td>
      <td>3.35</td>
      <td>8.69</td>
      <td>63.0</td>
      <td>11.4</td>
      <td>26.9</td>
      <td>16.0</td>
      <td>7.1</td>
      <td>17.1</td>
      <td>10.0</td>
      <td>66.235816</td>
    </tr>
    <tr>
      <th>2</th>
      <td>1</td>
      <td>59</td>
      <td>8.89</td>
      <td>6.72</td>
      <td>11.7</td>
      <td>11.4</td>
      <td>7.9</td>
      <td>9.3</td>
      <td>8.4</td>
      <td>4.3</td>
      <td>5.0</td>
      <td>8.827089</td>
    </tr>
    <tr>
      <th>3</th>
      <td>2</td>
      <td>43</td>
      <td>11.48</td>
      <td>6.55</td>
      <td>9.9</td>
      <td>11.1</td>
      <td>7.4</td>
      <td>8.9</td>
      <td>7.1</td>
      <td>3.0</td>
      <td>4.0</td>
      <td>10.407108</td>
    </tr>
    <tr>
      <th>4</th>
      <td>1</td>
      <td>32</td>
      <td>7.38</td>
      <td>6.40</td>
      <td>10.5</td>
      <td>16.6</td>
      <td>13.1</td>
      <td>14.7</td>
      <td>10.7</td>
      <td>6.1</td>
      <td>7.0</td>
      <td>12.091119</td>
    </tr>
    <tr>
      <th>5</th>
      <td>2</td>
      <td>59</td>
      <td>11.49</td>
      <td>6.38</td>
      <td>16.2</td>
      <td>9.4</td>
      <td>6.1</td>
      <td>7.4</td>
      <td>6.6</td>
      <td>2.8</td>
      <td>4.0</td>
      <td>15.405799</td>
    </tr>
    <tr>
      <th>6</th>
      <td>2</td>
      <td>73</td>
      <td>8.47</td>
      <td>6.35</td>
      <td>13.9</td>
      <td>7.9</td>
      <td>8.1</td>
      <td>8.0</td>
      <td>6.0</td>
      <td>4.0</td>
      <td>4.0</td>
      <td>14.526554</td>
    </tr>
    <tr>
      <th>7</th>
      <td>2</td>
      <td>39</td>
      <td>8.15</td>
      <td>6.29</td>
      <td>11.5</td>
      <td>12.1</td>
      <td>10.9</td>
      <td>11.5</td>
      <td>7.7</td>
      <td>4.9</td>
      <td>5.0</td>
      <td>10.795628</td>
    </tr>
    <tr>
      <th>8</th>
      <td>1</td>
      <td>56</td>
      <td>3.59</td>
      <td>6.25</td>
      <td>14.3</td>
      <td>12.7</td>
      <td>20.2</td>
      <td>15.6</td>
      <td>9.3</td>
      <td>14.5</td>
      <td>11.0</td>
      <td>25.880579</td>
    </tr>
    <tr>
      <th>9</th>
      <td>1</td>
      <td>29</td>
      <td>16.14</td>
      <td>6.16</td>
      <td>3.3</td>
      <td>17.9</td>
      <td>6.2</td>
      <td>9.2</td>
      <td>11.5</td>
      <td>2.2</td>
      <td>4.0</td>
      <td>3.468565</td>
    </tr>
  </tbody>
</table>

In [17]:
def fitness(df, key_obs, key_pred):
    R = np.corrcoef(df[key_obs],df[key_pred])[0,1]
    return dict(R=R)

In [18]:
batch_apply(df, fitness)

Unnamed: 0,R
CKD_EPI_Cys,0.822771
CKD_EPI_Cr_Cys,0.844254
FAS_Cys,0.78698
FAS_Cr_Cys,0.831135
SVR,0.865759


In [19]:
for key, _df in zip(["All subjects", "Age<60 y", "Age>=60y"],[df, df[df['age']<60], df[df['age']>=60]]):
    print(key)
    display.display(batch_apply(_df, fitness))

All subjects


Unnamed: 0,R
CKD_EPI_Cys,0.822771
CKD_EPI_Cr_Cys,0.844254
FAS_Cys,0.78698
FAS_Cr_Cys,0.831135
SVR,0.865759


Age<60 y


Unnamed: 0,R
CKD_EPI_Cys,0.79232
CKD_EPI_Cr_Cys,0.806745
FAS_Cys,0.74316
FAS_Cr_Cys,0.794031
SVR,0.833056


Age>=60y


Unnamed: 0,R
CKD_EPI_Cys,0.798121
CKD_EPI_Cr_Cys,0.832341
FAS_Cys,0.78258
FAS_Cr_Cys,0.833953
SVR,0.852883


In [20]:
def Bias_Precision_Accuracy(df, key_obs, key_pred):
    # Median difference, IQR of the difference, P30, RMSE
    diff = df[key_pred] - df[key_obs]
    
    r = (df[key_obs].max() - df[key_obs].min())*0.15
    left = df[key_obs] - r
    right = df[key_obs] + r
    p30 = np.mean((df[key_pred] > left) & (df[key_pred] < right))
    
    return dict(Bias = diff.median(),
                IQR = np.quantile(diff, 0.75) - np.quantile(diff, 0.25),
                P30 = p30 *100,
                RMSE = np.sqrt(np.mean(diff**2)))

In [21]:
Bias_Precision_Accuracy(df, 'rGFR', 'SVR')

{'Bias': -0.17644451961754726,
 'IQR': 16.219300632067856,
 'P30': 87.2836719337848,
 'RMSE': 14.615718515667071}

In [22]:
for key, _df in zip(["All subjects", "rGFR>=60", "rGFR<60"],[df, df[df['rGFR']>=60], df[df['rGFR']<60]]):
    print(key)
    display.display(batch_apply(_df, Bias_Precision_Accuracy))

All subjects


Unnamed: 0,Bias,IQR,P30,RMSE
CKD_EPI_Cys,0.393093,29.607459,69.149737,22.736234
CKD_EPI_Cr_Cys,-2.828487,21.887123,78.781038,19.0837
FAS_Cys,1.461448,26.308302,71.256584,27.220508
FAS_Cr_Cys,-2.943543,20.67221,79.382995,18.784712
SVR,-0.176445,16.219301,87.283672,14.615719


rGFR>=60


Unnamed: 0,Bias,IQR,P30,RMSE
CKD_EPI_Cys,6.520864,32.004656,40.179718,24.340743
CKD_EPI_Cr_Cys,-0.470286,27.331665,48.138639,20.206874
FAS_Cys,4.132352,34.735986,42.618742,32.247556
FAS_Cr_Cys,-3.709919,25.95067,50.192555,20.807367
SVR,-3.311359,16.78299,68.421053,15.112729


rGFR<60


Unnamed: 0,Bias,IQR,P30,RMSE
CKD_EPI_Cys,-4.284529,20.989222,39.272727,20.247239
CKD_EPI_Cr_Cys,-5.783487,15.710296,46.545455,17.369032
FAS_Cys,-0.462415,18.209322,47.272727,17.819535
FAS_Cr_Cys,-2.645289,15.351126,50.727273,15.473895
SVR,3.836448,13.74704,54.909091,13.881353


In [23]:
class HTMLDocument:
    
    def __init__(self):
        self.head = '''<head>
<link rel="stylesheet" href="css/style.min.css" type="text/css">
</head>
'''
        self.header = '''<html>'''+self.head+'''<body>'''
        self.footer = '''</body></html>'''
        self.el_list = []
        
    def add(self, el):
        self.el_list.append(el)
        
    def render(self):
        return '\n'.join([self.header] + self.el_list + [self.footer])
    
    def add_p(self, s):
        self.add('<p>{}</p>'.format(s))

In [57]:
doc = HTMLDocument()

doc.add(batch_apply(df, desc).to_html())

#doc.add(batch_apply(df, fitness).to_html())

for key, _df in zip(["All subjects", "Age<60 y", "Age>=60y"],[df, df[df['age']<60], df[df['age']>=60]]):
    doc.add_p(key)
    doc.add(batch_apply(_df, fitness).to_html())
    
for key, _df in zip(["All subjects", "rGFR>=60", "rGFR<60"],[df, df[df['rGFR']>=60], df[df['rGFR']<60]]):
    doc.add_p(key)
    doc.add(batch_apply(_df, Bias_Precision_Accuracy).to_html())

In [58]:
display.HTML(doc.render())

Unnamed: 0,All subjects,Age<60 y,Age>=60y
CKD_EPI_Cys,69.3962085948395(39.1517378356909),80.90386106107096(39.14754940746409),51.993331519253275(32.12753816195782)
CKD_EPI_Cr_Cys,64.27641958959802(35.48006790892703),77.0652300118479(34.95017038821175),44.936063563511276(26.36529880065677)
FAS_Cys,71.88361925596095(42.466317375049705),86.3526842984172(44.56617013166695),50.00223544884381(27.080307487568675)
FAS_Cr_Cys,63.847679107672455(33.528004610544365),78.02836674241843(32.74794994389674),42.40240480181842(20.953145653776453)
SVR,65.30995460395278(25.639187542415552),74.75305834407989(24.30807419863831),51.0292684185053(20.488751506424602)

Unnamed: 0,R
CKD_EPI_Cys,0.822771
CKD_EPI_Cr_Cys,0.844254
FAS_Cys,0.78698
FAS_Cr_Cys,0.831135
SVR,0.865759

Unnamed: 0,R
CKD_EPI_Cys,0.79232
CKD_EPI_Cr_Cys,0.806745
FAS_Cys,0.74316
FAS_Cr_Cys,0.794031
SVR,0.833056

Unnamed: 0,R
CKD_EPI_Cys,0.798121
CKD_EPI_Cr_Cys,0.832341
FAS_Cys,0.78258
FAS_Cr_Cys,0.833953
SVR,0.852883

Unnamed: 0,Bias,IQR,P30,RMSE
CKD_EPI_Cys,0.393093,29.607459,69.149737,22.736234
CKD_EPI_Cr_Cys,-2.828487,21.887123,78.781038,19.0837
FAS_Cys,1.461448,26.308302,71.256584,27.220508
FAS_Cr_Cys,-2.943543,20.67221,79.382995,18.784712
SVR,-0.176445,16.219301,87.283672,14.615719

Unnamed: 0,Bias,IQR,P30,RMSE
CKD_EPI_Cys,6.520864,32.004656,40.179718,24.340743
CKD_EPI_Cr_Cys,-0.470286,27.331665,48.138639,20.206874
FAS_Cys,4.132352,34.735986,42.618742,32.247556
FAS_Cr_Cys,-3.709919,25.95067,50.192555,20.807367
SVR,-3.311359,16.78299,68.421053,15.112729

Unnamed: 0,Bias,IQR,P30,RMSE
CKD_EPI_Cys,-4.284529,20.989222,39.272727,20.247239
CKD_EPI_Cr_Cys,-5.783487,15.710296,46.545455,17.369032
FAS_Cys,-0.462415,18.209322,47.272727,17.819535
FAS_Cr_Cys,-2.645289,15.351126,50.727273,15.473895
SVR,3.836448,13.74704,54.909091,13.881353


In [59]:
with open('table.html', 'w') as f:
    f.write(doc.render())

In [28]:
model_t.base_model

SVR(C=100.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.05, gamma=0.1,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [29]:
model_t.base_model.support_vectors_.shape

(1066, 4)

In [30]:
model_t.base_model.dual_coef_.shape

(1, 1066)

In [31]:
model_t.base_model.gamma

0.1

In [32]:
model_t.base_model.intercept_

array([5.0567044])

In [33]:
np.log(df[['Scr', 'Cys']]).describe()

Unnamed: 0,Scr,Cys
count,1329.0,1329.0
mean,0.346305,0.25382
std,0.554502,0.692911
min,-1.049822,-1.386294
25%,-0.051293,-0.210721
50%,0.19062,0.076961
75%,0.698135,0.536493
max,2.162173,2.781301


In [34]:
df[['age', 'sex']].describe()

Unnamed: 0,age,sex
count,1329.0,1329.0
mean,53.899172,1.389014
std,16.325116,0.48771
min,14.0,1.0
25%,41.0,1.0
50%,55.0,1.0
75%,67.0,2.0
max,91.0,2.0


In [35]:
df.head()

Unnamed: 0,sex,age,Cys,Scr,rGFR,FAScr,FAScys,FAScrcys,EPIcr,EPIcys,EPIcrcys,SVR,CKD_EPI_Cr,CKD_EPI_Cys,CKD_EPI_Cr_Cys,FAS_cr,FAS_Cys,FAS_Cr_Cys
0,1,38,3.35,8.69,83.7,11.4,26.9,16.0,7.1,17.1,10.0,66.235816,6.961372,17.051025,10.316914,11.112773,26.264478,15.617584
1,1,38,3.35,8.69,63.0,11.4,26.9,16.0,7.1,17.1,10.0,66.235816,6.961372,17.051025,10.316914,11.112773,26.264478,15.617584
2,1,59,8.89,6.72,11.7,11.4,7.9,9.3,8.4,4.3,5.0,8.827089,8.196237,4.28859,5.414697,11.424966,7.868533,9.318965
3,2,43,11.48,6.55,9.9,11.1,7.4,8.9,7.1,3.0,4.0,10.407108,7.129646,3.034728,4.112984,11.059291,7.391669,8.860961
4,1,32,7.38,6.4,10.5,16.6,13.1,14.7,10.7,6.1,7.0,12.091119,10.509999,6.118925,7.287303,15.089062,11.922222,13.32


In [47]:
import math

# prevent numpy to make transfering it to JavaScript easily

def rbf(X,Y, gamma):    
    # X,Y is a Python/JavaScript list instead of numpy.array
    
    norm = 0.0
    for i in range(len(X)):
        norm += (X[i] - Y[i])**2
    
    return math.exp(-gamma * norm)

def predict(X, support_vectors, dual_coef, intercept, gamma):
    '''
    kernel_mat = rbf(X, support_vectors, gamma)
    pred = kernel_mat @ dual_coef.transpose() + intercept
    return pred
    '''
    kernel_mat = []
    pred = intercept
    for i in range(len(support_vectors)):
        w = rbf(X, support_vectors[i], gamma)
        pred += w * dual_coef[i]
    return pred

In [37]:
log_Scr_mean = 0.346305
log_Scr_std = 0.554502
log_Cys_mean = 0.253820
log_Cys_std = 0.692911

age_mean = 53.899172
age_std = 16.325116
sex_mean = 1.389014
sex_std = 0.487710



In [38]:
support_vectors = model_t.base_model.support_vectors_.tolist()
dual_coef = model_t.base_model.dual_coef_[0].tolist()
intercept = model_t.base_model.intercept_[0]
gamma = model_t.base_model.gamma

In [39]:
#support_vectors,dual_coef,intercept,gamma

In [40]:
np.array(support_vectors).shape, np.array(dual_coef).shape,intercept,gamma

((1066, 4), (1066,), 5.056704398903536, 0.1)

In [50]:
key_list = ['age', 'Scr', 'Cys', 'sex']
scale_map = dict(age = 'linear', Scr = 'log', Cys = 'log', sex = 'linear')

mean_map = {}
std_map = {}

for key in key_list:
    if scale_map[key] == 'log':
        x = np.log(df[key])
    else:
        x = df[key]
        
    mean_map[key] = x.mean()
    std_map[key] = x.std()
    
y_log = True

In [42]:
mean_map, std_map

({'age': 53.89917231000752,
  'Scr': 0.34630462846000515,
  'Cys': 0.2538197312345424,
  'sex': 1.3890142964635064},
 {'age': 16.32511575563543,
  'Scr': 0.5545015820484048,
  'Cys': 0.6929106463771294,
  'sex': 0.4877101097484836})

In [43]:
inp = dict(age = 38, Scr = 8.69, Cys = 3.35, sex = 1)

inp_scaled = {}
for key in inp:
    if scale_map[key] == 'log':
        x = math.log(inp[key])
    else:
        x = inp[key]
    
    x_scaled = (x - mean_map[key])/std_map[key]
    inp_scaled[key] = x_scaled
    
inp_scaled

{'age': -0.973908702884335,
 'Scr': 3.2747757077792805,
 'Cys': 1.3784470185244921,
 'sex': -0.7976342681600849}

In [45]:
inp_vec = []
for key in key_list:
    inp_vec.append(inp_scaled[key])
    
inp_vec

[-0.973908702884335,
 3.2747757077792805,
 1.3784470185244921,
 -0.7976342681600849]

In [49]:
y_raw = predict(inp_vec, support_vectors, dual_coef, intercept, gamma)

In [51]:
if y_log:
    y = math.exp(y_raw)
else:
    y = y_raw

In [52]:
y

66.1966952064643

In [53]:
import json

In [54]:
frozen = dict(
    support_vectors = support_vectors,
    dual_coef = dual_coef,
    intercept = intercept,
    gamma = gamma,
    
    key_list = key_list,
    scale_map = scale_map,
    mean_map = mean_map,
    std_map = std_map,
    
    y_log = y_log
)

In [56]:
jsons = json.dumps(frozen)
jsonps = 'var frozen='+jsons+';'
with open('frozen_model.js', 'w') as f:
    f.write(jsonps)

In [61]:
df.to_excel("df.xlsx")