In [10]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
from sklearn import linear_model
# from surprise import SVD, Reader, Dataset
# from surprise.model_selection import train_test_split
# from surprise import accuracy, BaselineOnly
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [11]:
# Read the raw interaction table and discard the 'Unnamed: 0' column
# (it looks like a row index that was saved into the CSV).
df = pd.read_csv('protein_data.csv').drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Gene A Sequence,Gene B Sequence,Interaction Type,Detection Method
0,MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPL...,MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...,Physical association,Two hybrid
1,MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPL...,MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...,Physical association,Pull down
2,MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPL...,MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...,Physical association,Enzyme linked immunosorbent assay
3,MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...,MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...,Direct interaction,Molecular sieving
4,MTAKMETTFYDDALNASFLPSESGPYGYSNPKILKQSMTLNLADPV...,MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...,Association,Anti bait coimmunoprecipitation


In [12]:
# Keep only the rows in which every column has a value.
df = df.dropna(axis=0, how='any')

In [13]:
# Build three lookups from the cleaned interaction table in a single pass:
#   counts  - how many times each sequence appears (a row counts once per column)
#   mapping - sequence -> set of sequences it is paired with in some row
#   allData - one (geneA, geneB, 1) tuple per row; every recorded row is a positive
counts = {}
mapping = {}
allData = []

# Zipping the two columns yields the same values as DataFrame.iterrows()
# without building a Series object for every row.
for aSeq, bSeq in zip(df['Gene A Sequence'], df['Gene B Sequence']):
    counts[aSeq] = counts.get(aSeq, 0) + 1
    counts[bSeq] = counts.get(bSeq, 0) + 1

    # Record the pair in both directions so either gene can be looked up.
    mapping.setdefault(aSeq, set()).add(bSeq)
    mapping.setdefault(bSeq, set()).add(aSeq)

    allData.append((aSeq, bSeq, 1))

In [14]:
# Positional 90/10 split: the first 90% of the recorded pairs are used for
# training and the trailing 10% are held out for testing.
# NOTE(review): the split follows file order and is not shuffled, and allData
# holds only label-1 pairs, so the held-out part contains positives only.
nTrain = int(0.9 * len(allData))
nTest = len(allData) - nTrain
interactionsTrain, interactionsTest = allData[:nTrain], allData[nTrain:]

In [15]:
# Every distinct sequence that occurs in either column of the table.
allGenes = set(df["Gene A Sequence"]) | set(df["Gene B Sequence"])

In [16]:
def getAllGenesInteracted(gene):
  """Return the set of genes paired with `gene` in the interaction table.

  The returned object is the set stored in the module-level `mapping`
  (not a copy). Raises KeyError when `gene` has no entry in `mapping`.
  """
  partners = mapping[gene]
  return partners


In [17]:
def getAllGenesNotInteracted(gene):
  """Return a new set with every gene in `allGenes` that is not a recorded partner of `gene`.

  `gene` itself is part of the result unless the table pairs it with itself.
  Raises KeyError (from getAllGenesInteracted) for a gene without recorded pairs.
  """
  return allGenes.difference(getAllGenesInteracted(gene))

In [18]:
# Accumulator for the (geneA, geneB, 0) non-interaction pairs sampled below.
negativeSamples = list()

In [19]:
# Negative sampling: for every positive training pair draw two label-0 pairs,
# one partner for geneA and one for geneB, each chosen uniformly among the
# genes that have no recorded interaction with it.
#
# The per-pair set difference + list conversion over all genes (the reason this
# used to take minutes) is replaced by rejection sampling from one shared list.
negativeSamples = []  # rebuilt from scratch so re-running the cell cannot append duplicates

negativeRng = random.Random(42)    # dedicated seeded generator -> same negatives on every run
candidateGenes = sorted(allGenes)  # fixed order; set iteration order is not stable across kernels


def sampleNonInteractingGene(gene):
  """Return a random gene with no recorded interaction with `gene`, or None if there is none.

  `gene` itself is never returned, so a sequence is not paired with itself
  as a "non-interaction".
  """
  interacted = getAllGenesInteracted(gene)
  # Most genes interact with a small share of all genes, so a few random
  # draws almost always hit a valid candidate.
  for _ in range(32):
    candidate = negativeRng.choice(candidateGenes)
    if candidate != gene and candidate not in interacted:
      return candidate
  # Rare fallback for genes that interact with nearly everything: enumerate
  # the remaining candidates explicitly so the loop above cannot spin forever.
  remaining = [g for g in candidateGenes if g != gene and g not in interacted]
  return negativeRng.choice(remaining) if remaining else None


for geneA, geneB, label in interactionsTrain:
  # Only positives seed new negatives; this matters if the cell is re-run
  # after the negatives were already merged into interactionsTrain.
  if label != 1:
    continue

  for gene in (geneA, geneB):
    randomGene = sampleNonInteractingGene(gene)
    if randomGene is not None:
      negativeSamples.append((gene, randomGene, 0))


In [20]:
# Preview the first ten sampled negative pairs.
negativeSamples[0:10]

[('MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPLFGMEDDMMEFTPSLGIQTLKLQYKCVVNINAINPFRDFREAISAMQFWEADYSGYIGKKPFYRAIILHTARQLKTSNPGILDRGVVEYHATTQGRALVFHSLGPSPSMMFVPETFTREWNILTNKGTINVKIWLGETDTLSELEPILNPVNFRDDREMIEGAAIMGLEIKKQKDNTWLISKSH',
  'MAVALAAAAGKLRRAIGRSCPWQPFSTEPGPPHGAAVRDAFLSFFRDRHGHRLVPSATVRPRGDPSLLFVNAGMNQFKPIFLGTVDPRSEMAGFRRVVNSQKCVRAGGRHNDLEDVGRDLSHHTFFEMLGNWAFGGEYFKEEACSMAWELLTQVYGIPEDRLWVSYFSGDSQTGLDPDLETRDIWLSLGVPASRVLSFGPQENFWEMGDTGPCGPCTEIHYDLAGGVGSPQLVELWNLVFMQHYREADGSLQLLPQRHVDTGMGLERLVAVLQGKRSTYDTDLFSPLLDAIHQSCGAPPYSGRVGAADEGRIDTAYRVVADHIRTLSVCIADGVSPGMSGAPLVLRRILRRAVRYSTEVLQAPPGFLGSLVPVVVETLGSAYPELEKNSVKIASLVSEDEAAFLASLQRGRRIIDRTVKRLGPSDLFPAEVAWSLSLSGNLGIPLDLVELMLEEKGVKLDTAGLEQLAQKEAQHRAQQAEADQEDRLCLDVHALEELHRQGIPTTDDSPKYNYTLHPNGDYEFGLCEARVLQLYSETGTAVASVGAGQRCGLLLDRTNFYAEQGGQASDRGYLVRTGQQDMLFPVAGAQLCGGFILHEAMAPERLQVGDQVQLYVDKAWRMGCMVKHTATHLLSWALRQTLGPTTEQRGSHLNPERLRFDVATQTLLTTEQLRTVESYVQEVVGQDKPVFMEEVPLAHTARIPGLRSLDEVYPDPVRVVSVGVPVAHALGPASQAAMHTSVELCCGTHLLSTGAVGDLVIIG

In [21]:
# Append the sampled negatives to the positive training pairs.
# Starting again from the label-1 tuples keeps this cell idempotent: running
# it more than once does not stack additional copies of the negatives.
interactionsTrain = [pair for pair in interactionsTrain if pair[2] == 1] + negativeSamples

In [22]:
# Turn both splits into DataFrames with a shared column layout and persist
# them next to the notebook (no index column is written).
pairColumns = ["GeneA", "GeneB", "Interaction"]
dfTrain = pd.DataFrame(interactionsTrain, columns=pairColumns)
dfTest = pd.DataFrame(interactionsTest, columns=pairColumns)

for splitFrame, csvPath in ((dfTrain, 'train.csv'), (dfTest, 'test.csv')):
    splitFrame.to_csv(csvPath, index=False)


In [23]:
# Reload the persisted splits so the cells below work from the CSV files.
dfTrain, dfTest = (pd.read_csv(csvPath) for csvPath in ('train.csv', 'test.csv'))

In [24]:
# Character-level TF-IDF restricted to trigrams, i.e. 3-mers of each sequence.
tfidf = TfidfVectorizer(
    ngram_range=(3, 3),
    analyzer="char",
)

In [25]:
# Both columns are encoded with the same vectorizer, so learn its 3-mer
# vocabulary and IDF weights from the sequences of BOTH columns. Fitting on
# GeneA alone drops every 3-mer that only occurs in GeneB and computes
# document frequencies from half of the sequences.
tfidf.fit(list(dfTrain["GeneA"]) + list(dfTrain["GeneB"]))

gene1_vec_train = tfidf.transform(dfTrain["GeneA"])
gene2_vec_train = tfidf.transform(dfTrain["GeneB"])

In [26]:
# Feature row = dense TF-IDF vector of GeneA followed by that of GeneB.
denseTrainA = gene1_vec_train.toarray()
denseTrainB = gene2_vec_train.toarray()
X = np.concatenate((denseTrainA, denseTrainB), axis=1)

# Target: 1 for a recorded interaction, 0 for a sampled negative pair.
y = dfTrain["Interaction"]

In [27]:
# Encode the test sequences with the vectorizer as fitted on the training
# data. Calling fit_transform here would re-learn the vocabulary and IDF
# weights from the test set, so the test columns would no longer line up
# with (or even have the same count as) the features the model is trained on.
gene1_vec_test = tfidf.transform(dfTest["GeneA"])
gene2_vec_test = tfidf.transform(dfTest["GeneB"])

In [28]:
# Same layout as the training matrix: GeneA features, then GeneB features.
denseTestA = gene1_vec_test.toarray()
denseTestB = gene2_vec_test.toarray()
X_test = np.concatenate((denseTestA, denseTestB), axis=1)

y_test = dfTest["Interaction"]

In [29]:
# Fit a random forest on the training features and score the held-out pairs.
# Fitting and predicting are slow on this many features; n_jobs=-1 lets
# scikit-learn use all available cores for both.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X, y)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # probability of the "interaction" class

print("Classification Report:\n", classification_report(y_test, y_pred))

# ROC AUC is only defined when y_test contains both classes. The test split
# built in this notebook holds recorded (label 1) pairs only, in which case
# roc_auc_score cannot produce a meaningful value, so report that explicitly.
if len(set(y_test)) < 2:
    print("AUC-ROC Score: undefined (y_test contains a single class)")
else:
    print("AUC-ROC Score:", roc_auc_score(y_test, y_proba))