# Support Vector Regression (SVR) Model for Deconvolution of Human Immune Cell Types
### Non-Negative Least Squares Problem


$$ \min_A \|AS-X\|^2 \quad \text{s.t.} \quad \sum_{i=1}^{N} a_{ki}=1, \; a_{ki}\geq 0 $$

$$x_{kj}=\sum_{i=1}^{N} a_{ki}s_{ij}$$


x_kj: gene expression level of gene j in sample k

a_ki: proportion of cell type i in sample k

s_ij: gene expression level of gene j in cell type i

N: number of cell types
 
More generally, the matrix form of the problem is:
$$ X=AS$$
X: mixture data

A: proportion matrix (the desired matrix)

S: signature matrix






In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.svm import NuSVR
from sklearn.metrics import r2_score
# Load the coarse signature matrix and index its rows by the 'gene' column.
signature_coarse = pd.read_csv('signature_coarse.csv').set_index('gene')
# Preview the first rows (displayed as the notebook cell output).
signature_coarse.head()

Unnamed: 0_level_0,CD8,CD4,B_cell,NK,Neutrophils,monocyte,Fibroblast,Endothelial
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000000971,0.0,4.4e-05,0.0,4.5e-05,3e-06,1e-05,0.006356,0.009847
ENSG00000003436,0.0,0.0,7e-06,0.0002,9.5e-05,2.2e-05,0.003832,0.039887
ENSG00000008517,0.05493,0.023485,3.1e-05,0.018015,0.000198,5.7e-05,9.7e-05,0.01484
ENSG00000008988,0.009017,0.010958,0.166875,0.00567,0.000231,0.057349,0.018695,0.120254
ENSG00000009790,0.010526,0.003494,0.008991,0.021176,0.002923,0.006066,2.7e-05,8.7e-05


In [3]:
# Dimensions of the signature matrix as (number of rows, number of columns).
signature_coarse.shape

(355, 8)

In [4]:
def Nu_SVR(mixture,signature):
    """Estimate cell-type fractions for each mixture sample with a linear Nu-SVR.

    Both inputs are first restricted to the genes they have in common. One
    ``NuSVR(kernel='linear', nu=1)`` model is then fitted per mixture column,
    using the signature rows as predictors and the sample's expression values
    as the response. Negative coefficients are set to zero and the remainder
    is rescaled to sum to 1.

    Parameters
    ----------
    mixture : pandas.DataFrame
        Expression values, indexed by gene, one column per sample.
    signature : pandas.DataFrame
        Signature matrix, indexed by gene, one column per cell type.

    Returns
    -------
    pandas.DataFrame
        One row per mixture column (in the same order) and one column per
        signature column, holding the normalised fractions. A sample whose
        clipped coefficients are all zero gets a row of NaN.
    """
    # Keep only the shared genes and put BOTH frames in the same (signature)
    # order, so row i of the predictors and row i of every response refer to
    # the same gene. Filtering each frame separately does not guarantee that.
    # NOTE(review): assumes gene identifiers are unique within each index.
    shared_genes = signature.index.intersection(mixture.index)
    signature = signature.loc[shared_genes]
    mixture = mixture.loc[shared_genes]

    cell_types = signature.columns.values.tolist()
    train_x = np.asanyarray(signature)  # identical for every sample

    sample_names = []
    fraction_rows = []
    for column in mixture:
        # support vector regression
        regr_nusvr = NuSVR(kernel='linear', nu=1)
        train_y = np.asanyarray(mixture[column])
        model = regr_nusvr.fit(train_x, train_y)

        # make the negative values zero, and normalise to sum up to 1
        result = np.ravel(model.coef_)
        result = np.where(result < 0, 0, result)
        total = np.sum(result)
        if total > 0:
            fractions = result / total
        else:
            # Nothing to normalise: report NaN explicitly instead of relying
            # on a divide-by-zero to produce it.
            fractions = np.full(result.shape, np.nan)

        sample_names.append(column)
        fraction_rows.append(fractions)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    values = np.reshape(
        np.asarray(fraction_rows, dtype=float),
        (len(fraction_rows), len(cell_types)),
    )
    return pd.DataFrame(values, index=sample_names, columns=cell_types)

In [5]:
def Linear_SVR(mixture,signature,cell_number=None):
    """Estimate the proportion matrix with a linear support vector regression.

    One ``LinearSVR(fit_intercept=False, max_iter=2000)`` model is fitted per
    mixture column, using the signature rows as predictors and the sample's
    expression values as the response. Negative coefficients are set to zero
    and the remainder is rescaled to sum to 1.

    The rows of ``mixture`` and ``signature`` are used positionally; no gene
    matching is performed here, so the caller must pass frames whose rows
    already correspond.

    Parameters
    ----------
    mixture : pandas.DataFrame
        Expression values, one row per gene and one column per sample.
    signature : pandas.DataFrame
        Signature matrix, one row per gene and one column per cell type.
    cell_number : int, optional
        Number of cell types. Defaults to the number of signature columns;
        when given, it must equal that number.

    Returns
    -------
    pandas.DataFrame
        One row per mixture column (in the same order) and one column per
        signature column, holding the normalised fractions. A sample whose
        clipped coefficients are all zero gets a row of NaN.

    Raises
    ------
    ValueError
        If ``cell_number`` does not match the number of signature columns.
    """
    cell_types = signature.columns.values.tolist()
    if cell_number is None:
        cell_number = len(cell_types)
    elif cell_number != len(cell_types):
        raise ValueError(
            "cell_number ({}) does not match the number of signature "
            "columns ({})".format(cell_number, len(cell_types))
        )

    train_x = np.asanyarray(signature)  # identical for every sample

    sample_names = []
    fraction_rows = []
    for column in mixture:
        # support vector regression
        regr_svr = LinearSVR(fit_intercept=False, max_iter=2000)
        train_y = np.asanyarray(mixture[column])
        model = regr_svr.fit(train_x, train_y)

        # make the negative values zero, and normalise to sum up to 1
        result = np.reshape(model.coef_, (cell_number,))
        result = np.where(result < 0, 0, result)
        total = np.sum(result)
        if total > 0:
            fractions = result / total
        else:
            # Nothing to normalise: report NaN explicitly instead of relying
            # on a divide-by-zero to produce it.
            fractions = np.full(result.shape, np.nan)

        sample_names.append(column)
        fraction_rows.append(fractions)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    values = np.reshape(
        np.asarray(fraction_rows, dtype=float),
        (len(fraction_rows), cell_number),
    )
    return pd.DataFrame(values, index=sample_names, columns=cell_types)

In [6]:
def generate_test_data_score_NUSVR(samples,cell_number,gene,signature_matrix):
    """Score the Nu_SVR model on randomly generated synthetic mixtures.

    Random cell-type fractions are drawn for every sample, scaled to sum to
    1, and multiplied with the signature matrix to obtain synthetic mixture
    profiles. ``Nu_SVR`` is run once on all of them and each sample's
    estimate is compared with the fractions that generated it.

    Parameters
    ----------
    samples : int
        Number of synthetic mixture samples to generate.
    cell_number : int
        Number of cell types (columns of ``signature_matrix``).
    gene : int
        Number of genes (rows of ``signature_matrix``).
    signature_matrix : pandas.DataFrame
        Signature matrix with one row per gene and one column per cell type.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(samples, 1)`` with the R^2 score of each sample.

    Raises
    ------
    ValueError
        If ``signature_matrix`` does not have shape ``(gene, cell_number)``.
    """
    signature_values = np.asarray(signature_matrix, dtype=float)
    if signature_values.shape != (gene, cell_number):
        raise ValueError(
            "signature_matrix has shape {}, expected (gene, cell_number) = "
            "({}, {})".format(signature_values.shape, gene, cell_number)
        )

    # generating synthetic mixture data: random fractions, each row sums to 1
    test_data = np.random.random((samples, cell_number))
    test_data /= test_data.sum(axis=1, keepdims=True)

    # One mixture profile per sample, stored genes x samples like real input.
    test_mixture = np.matmul(test_data, signature_values.T)
    test_mixture_df = pd.DataFrame(
        test_mixture.T,
        index=signature_matrix.index,
        columns=range(samples),
    ).fillna(0)

    # using synthetic mixture data in the model: fit every sample exactly
    # once instead of refitting all earlier samples on each iteration.
    result_test = Nu_SVR(test_mixture_df, signature_matrix)

    score_data = np.empty(shape=[samples, 1])
    for i in range(samples):
        score_data[i] = r2_score(test_data[i], result_test.iloc[i])
    return score_data


In [7]:
def generate_test_data_score_LINEAR_SVR(samples,cell_number, gene,signature_matrix):
    """Score the Linear_SVR model on randomly generated synthetic mixtures.

    Random cell-type fractions are drawn for every sample, scaled to sum to
    1, and multiplied with the signature matrix to obtain synthetic mixture
    profiles. ``Linear_SVR`` is run once on all of them and each sample's
    estimate is compared with the fractions that generated it.

    Parameters
    ----------
    samples : int
        Number of synthetic mixture samples to generate.
    cell_number : int
        Number of cell types (columns of ``signature_matrix``).
    gene : int
        Number of genes (rows of ``signature_matrix``).
    signature_matrix : pandas.DataFrame
        Signature matrix with one row per gene and one column per cell type.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(samples, 1)`` with the R^2 score of each sample.

    Raises
    ------
    ValueError
        If ``signature_matrix`` does not have shape ``(gene, cell_number)``.
    """
    signature_values = np.asarray(signature_matrix, dtype=float)
    if signature_values.shape != (gene, cell_number):
        raise ValueError(
            "signature_matrix has shape {}, expected (gene, cell_number) = "
            "({}, {})".format(signature_values.shape, gene, cell_number)
        )

    # generating synthetic mixture data: random fractions, each row sums to 1
    test_data = np.random.random((samples, cell_number))
    test_data /= test_data.sum(axis=1, keepdims=True)

    # One mixture profile per sample, stored genes x samples like real input.
    test_mixture = np.matmul(test_data, signature_values.T)
    test_mixture_df = pd.DataFrame(
        test_mixture.T,
        index=signature_matrix.index,
        columns=range(samples),
    ).fillna(0)

    # using synthetic mixture data in the model: fit every sample exactly
    # once instead of refitting all earlier samples on each iteration.
    result_test = Linear_SVR(test_mixture_df, signature_matrix, cell_number)

    score_data = np.empty(shape=[samples, 1])
    for i in range(samples):
        score_data[i] = r2_score(test_data[i], result_test.iloc[i])
    return score_data


In [8]:
# Take the gene and cell-type counts from the signature matrix itself so this
# cell keeps working when the signature file changes.
n_genes, n_cell_types = signature_coarse.shape
# R^2 score of each of 100 synthetic mixtures under the Linear SVR model.
b=generate_test_data_score_LINEAR_SVR(100,n_cell_types,n_genes,signature_coarse)
average_score_l=np.mean(b)
average_score_l

0.9999999999999956

In [9]:
# Take the gene and cell-type counts from the signature matrix itself so this
# cell keeps working when the signature file changes.
n_genes, n_cell_types = signature_coarse.shape
# R^2 score of each of 100 synthetic mixtures under the Nu-SVR model.
a=generate_test_data_score_NUSVR(100,n_cell_types,n_genes,signature_coarse)
average_score=np.mean(a)
average_score

0.9998813439572684