In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import MiniBatchNMF
from pathlib import Path

In [2]:
pd.set_option('display.expand_frame_repr', False)

In [3]:
df = pd.read_parquet("data/df_nes.parquet")

In [4]:
df.head(2)

Unnamed: 0,sequence,input_count,rep1_count,rep2_count,log2_fold_change,CV,NES,VNES
0,CDNRVESEC,2,361,330,7.432542,22.290323,8.535944,0.382944
1,CGNNVWDLC,2,321,360,7.411511,17.461538,8.511791,0.487459


In [5]:
df.shape

(45244632, 8)

In [6]:
df = df.sample(1000000)

In [7]:
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
one_hot_dict = {}
for i, aa in enumerate(amino_acids):
    encoding = [0] * len(amino_acids)
    encoding[i] = 1
    one_hot_dict[aa] = encoding

In [8]:
one_hot_dict

{'A': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'C': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'D': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'E': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'F': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'G': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'H': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'I': [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'K': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'L': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'M': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'N': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 'P': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 'Q': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 'R': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
df.shape

(1000000, 8)

In [10]:
def encode_sequence(sequence, one_hot_dict):
    encoding = np.zeros((len(sequence), len(amino_acids)))
    for i, aa in enumerate(sequence):
        encoding[i] = one_hot_dict.get(aa, [0] * len(amino_acids))
    return encoding.flatten()

df["OHE"] = df["sequence"].apply(lambda seq: encode_sequence(seq, one_hot_dict))

In [11]:
df.head(2)

Unnamed: 0,sequence,input_count,rep1_count,rep2_count,log2_fold_change,CV,NES,VNES,OHE
40018119,CQVLNKDQC,4,2,5,-0.192645,2.333333,-0.221244,-0.094819,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
38529993,CTGGGFNRC,7,1,6,-1.0,1.4,-1.148456,-0.820325,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
df.reset_index(inplace=True)

In [13]:
df["OHE"][0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
df.to_parquet("data/df_nes_ohe_1000000.parquet")

In [15]:
df.dtypes

index                 int64
sequence             object
input_count           int64
rep1_count            int64
rep2_count            int64
log2_fold_change    float64
CV                  float64
NES                 float64
VNES                float64
OHE                  object
dtype: object

In [16]:
# Define the NMF model
n_components = 1 # set the number of components to use
nmf = MiniBatchNMF(n_components=n_components, random_state=0)
W = nmf.fit_transform(np.vstack(df["OHE"].values))

In [17]:
W.shape

(1000000, 1)

In [18]:
W.squeeze(-1).shape

(1000000,)

In [19]:
nmf.reconstruction_err_

2568.1614028263434

In [20]:
2568/(1000000**2)

2.568e-09

In [21]:
df["COHE"] = W.squeeze(-1).tolist()

In [22]:
df.head(2)

Unnamed: 0,index,sequence,input_count,rep1_count,rep2_count,log2_fold_change,CV,NES,VNES,OHE,COHE
0,40018119,CQVLNKDQC,4,2,5,-0.192645,2.333333,-0.221244,-0.094819,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.038567
1,38529993,CTGGGFNRC,7,1,6,-1.0,1.4,-1.148456,-0.820325,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.04098


In [23]:
df.to_parquet("data/df_nes_ohe_cohe_1000000.parquet")

In [24]:
df.describe()

Unnamed: 0,index,input_count,rep1_count,rep2_count,log2_fold_change,CV,NES,VNES,COHE
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,22600390.0,3.80375,5.14048,5.251291,0.497538,4.162813,0.571401,0.203724,0.039298
std,13058890.0,5.002988,7.722459,7.952456,0.870886,5.490293,1.000173,0.448041,0.002072
min,119.0,1.0,1.0,1.0,-5.169925,1.051282,-5.937429,-2.614679,0.005184
25%,11287200.0,1.0,1.0,1.0,0.222392,2.333333,0.255408,0.036487,0.038668
50%,22593760.0,1.0,2.0,2.0,0.584963,3.0,0.671803,0.223934,0.039461
75%,33921690.0,5.0,6.0,6.0,0.784271,3.0,0.900701,0.223934,0.040251
max,45244630.0,120.0,658.0,701.0,5.918863,335.0,6.797552,4.0818,0.044168
