In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import logit
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Location of the Ghesquiere 2011 Met CSV. The default is the original
# absolute path; set the GHESQUIERE_MET_CSV environment variable to run the
# notebook on another machine without editing this cell.
met_csv_path = os.environ.get(
    'GHESQUIERE_MET_CSV',
    '/home/xiaome6/gnn-protein/sda-patch-gnn/data/ghesquire_2011/Ghesquiere2011_Met.csv',
)
# Rows without a sequence cannot be one-hot encoded later, so drop them at load time.
features = pd.read_csv(met_csv_path).dropna(subset=['sequence'])

In [None]:
# Keep only the model input (sequence) and the regression target (%ox_fwd).
model_columns = ['sequence', '%ox_fwd']
features = features.loc[:, model_columns]

In [None]:
# Encoding alphabet: 20 residue letters followed by "-", the character
# that padding() appends to short sequences.
alphabet = "ACDEFGHIKLMNPQRSTVWY-"
# Row i of the identity matrix is the one-hot vector for alphabet[i].
oh_matrix = np.identity(len(alphabet))
# Lookup table: character -> its one-hot row.
oh_mapping = dict(zip(alphabet, oh_matrix))

In [None]:
# Length of the longest sequence; every sequence is padded up to this length.
sequence_lengths = features["sequence"].str.len()
maxlen_proteins = sequence_lengths.max()

In [None]:
def encoder(protein_seq: str) -> np.ndarray:
    """One-hot encode a sequence into a single flat vector.

    Each character is looked up in the module-level ``oh_mapping`` and the
    resulting vectors are concatenated in sequence order.

    Args:
        protein_seq: Sequence to encode; every character must be a key of
            ``oh_mapping``.

    Returns:
        1-D array of length ``len(protein_seq) * len(alphabet)``.

    Raises:
        KeyError: If a character is not present in ``oh_mapping``.
        ValueError: If ``protein_seq`` is empty (nothing to concatenate).
    """
    one_hot_rows = [oh_mapping[residue] for residue in protein_seq]
    return np.concatenate(one_hot_rows)

In [None]:
def padding(protein_seq: str, length: int) -> str:
    """Right-pad a sequence with "-" up to ``length`` characters.

    Args:
        protein_seq: Sequence to pad.
        length: Target length.

    Returns:
        ``protein_seq`` followed by enough "-" characters to reach ``length``.
        A sequence that is already at least ``length`` long is returned
        unchanged (it is never truncated).
    """
    return protein_seq.ljust(length, "-")

In [None]:
# Pad every sequence to the common maximum length so that all encoded
# vectors end up with the same number of features.
padded_proteins = [
    padding(sequence, maxlen_proteins)
    for sequence in features['sequence']
]
features = features.assign(padded_sequence=padded_proteins)

In [None]:
# One row per sequence: the flattened one-hot encoding of its padded form.
encoded_rows = [encoder(padded) for padded in features["padded_sequence"]]
encoded_features = np.vstack(encoded_rows)
# Displayed as the cell output: (number of sequences, features per sequence).
encoded_features.shape

In [None]:
# Regression target: logit of the oxidation fraction.
# %ox_fwd is divided by 100 (presumably a percentage; confirm against the data
# source) and clipped to [0.01, 0.99] so that logit() stays finite at 0 and 1.
# `logit` is imported in the notebook's top import cell.
ox_fraction = features['%ox_fwd'].to_numpy() / 100
labels = logit(np.clip(ox_fraction, 0.01, 0.99))

In [None]:
# Distribution of the logit-transformed labels.
fig, ax = plt.subplots()
ax.hist(labels)
ax.set_xlabel("logit(%ox_fwd / 100)")
ax.set_ylabel("count")
ax.set_title("Label distribution");  # trailing ';' hides the text repr

In [None]:
# Hold out 25% of the samples for evaluation; the fixed seed keeps the
# split identical across runs.
(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(
    encoded_features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Constant baseline: predict the mean training label for every test sample.
# The mean has to come from the training split; taking it from the test
# labels would let the baseline see the values it is scored against.
baseline_preds = train_labels.mean()

In [None]:
# Absolute error of the constant baseline on each test sample.
baseline_errors = np.abs(baseline_preds - test_labels)

In [None]:
# Random forest with 300 trees; the fixed seed makes training repeatable.
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
)

In [None]:
# Train the forest on the training split only.
rf.fit(X=train_features, y=train_labels)

In [None]:
# Model predictions for the held-out samples (on the logit scale of `labels`).
predictions = rf.predict(X=test_features)

In [None]:
# Absolute error of the random forest on each test sample.
errors = np.abs(predictions - test_labels)

In [None]:
# Predicted vs. observed labels on the held-out split.
fig, ax = plt.subplots()
ax.scatter(predictions, test_labels, alpha=0.1)
# Identity line: a point on it is predicted exactly.
ax.plot([-5, 5], [-5, 5], color="gray", linestyle="--", linewidth=1)
ax.set_aspect("equal")
# Labels are clipped to logit(0.01)..logit(0.99), roughly +/-4.6, so +/-5 shows them all.
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.set_xlabel("predicted logit(%ox_fwd / 100)")
ax.set_ylabel("observed logit(%ox_fwd / 100)")
ax.set_title("Random forest: test-set predictions");  # trailing ';' hides the text repr