In [1]:
import pandas as pd
import numpy as np
import pickle
import gc; gc.enable()

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [3]:
# Prefixes of the per-network prediction pickles (`<prefix>_train` /
# `<prefix>_test`). Each prefix is also the name of the label column that is
# encoded below.
file_prefix = ['image_top_1_resnet', 'image_top_1_inception', 'image_top_1_xception']

N_COMPONENTS = 3  # number of SVD components kept per network
SEED = 0          # pins TruncatedSVD's randomized solver so re-runs reproduce the same features

# NOTE(review): the original comment says the one-hot matrix is "weighted by
# the confidence", but the weights actually applied are the label codes
# themselves (so every row whose code is 0 becomes all-zero). Which column of
# the pickles, if any, holds the confidence is not visible here -- TODO confirm
# and register it in this map. Until then the legacy label-code weighting is
# kept so that already-consumed outputs keep their meaning.
confidence_cols = {}  # {label column name: confidence column name}

for fp in file_prefix:
    # NOTE(review): read_pickle can execute arbitrary code; these files are
    # presumably produced by an earlier step of this pipeline -- do not point
    # this at untrusted input.
    merged_train = pd.read_pickle(fp+'_train')
    merged_test = pd.read_pickle(fp+'_test')

    # Fit the encoders on train and test together so both share one code space.
    merge_all = pd.concat([merged_train, merged_test], axis=0)
    merge_ids = merge_all['image'].values
    del merged_train, merged_test; gc.collect()

    # Keep the integer codes in a plain ndarray instead of writing them back
    # with `.loc[:, fp] = ...`: on pandas >= 2.0 that assignment is in-place and
    # would leave an object-dtype column as object, which csr_matrix rejects.
    codes = LabelEncoder().fit_transform(merge_all[fp])
    weight_col = confidence_cols.get(fp)
    weights = codes if weight_col is None else merge_all[weight_col].to_numpy()
    del merge_all; gc.collect()

    # One row per image with a single non-zero entry at its label's column,
    # scaled by that row's weight (an (n, 1) sparse column broadcasts row-wise).
    ohe_merge_all = OneHotEncoder().fit_transform(codes.reshape(-1, 1))
    ohe_merge_all = ohe_merge_all.multiply(csr_matrix(weights.reshape(-1, 1)))

    svd_merge_all = TruncatedSVD(n_components=N_COMPONENTS,
                                 random_state=SEED).fit_transform(ohe_merge_all)
    del ohe_merge_all; gc.collect()

    # Build the output frame in one shot: image id first, then one column per
    # SVD component.
    df = pd.DataFrame(
        svd_merge_all,
        columns=[fp+'_tsvd_{}'.format(i) for i in range(svd_merge_all.shape[1])],
    )
    df.insert(0, 'image', [sid.replace('.jpg', '') for sid in merge_ids])
    df.to_pickle(fp+'_tsvd.csv_pkl')
    print(df.head(3))

                                               image  \
0  00000acce78ccf00d3ca0c550cae60fb319d45861444b3...   
1  00001d464b8eb4f0f90b13b9194dc214c492cbe0c484fa...   
2  00002821738c1efaa7e73310f7a6e34d34ada6c68e0800...   

   image_top_1_resnet_tsvd_0  image_top_1_resnet_tsvd_1  \
0              -3.839561e-04              -8.169812e-03   
1               1.499953e-10               4.053217e-10   
2               4.757338e-01              -1.544728e+00   

   image_top_1_resnet_tsvd_2  
0               8.767541e-02  
1               3.851567e-10  
2               9.996041e-02  
                                               image  \
0  00000acce78ccf00d3ca0c550cae60fb319d45861444b3...   
1  00001d464b8eb4f0f90b13b9194dc214c492cbe0c484fa...   
2  00002821738c1efaa7e73310f7a6e34d34ada6c68e0800...   

   image_top_1_inception_tsvd_0  image_top_1_inception_tsvd_1  \
0                      0.006571                     -0.016791   
1                      0.000156                      0.0000