In [26]:
import warnings
warnings.filterwarnings("ignore")

In [27]:
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.model_selection import train_test_split

In [28]:
data = pd.read_csv('protein_data.csv')
data = data.dropna()
encoder = LabelEncoder()
data['Interaction'] = encoder.fit_transform(data['Interaction Type'])
labels = np.array(data['Interaction'])

# Remove minorities
unique_labels, counts = np.unique(labels, return_counts=True)
label_counts = dict(zip(unique_labels, counts))
threshold = 1000
label_mapping = {label: -1 if count < threshold else label for label, count in label_counts.items()}
# Relabel the dataset
labels_relabel = [label_mapping[label] for label in labels]
labels = labels_relabel

X = data['Gene A Sequence'] + ' ' + data['Gene B Sequence']
y = labels

X.head(), y

(0    MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPL...
 1    MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPL...
 2    MQRLKKFIAKREKGDKGKMKWNSSMDYDSPPSYQDVRRGIFPTAPL...
 3    MSSTLAKIAEIEAEMARTQKNKATAHHLGLLKARLAKLRRELITPK...
 4    MTAKMETTFYDDALNASFLPSESGPYGYSNPKILKQSMTLNLADPV...
 dtype: object,
 [23,
  23,
  23,
  13,
  4,
  23,
  23,
  4,
  23,
  23,
  4,
  4,
  26,
  26,
  26,
  4,
  13,
  13,
  13,
  26,
  23,
  4,
  26,
  4,
  26,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  23,
  23,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  4,
  23,
  23,
  23,
  6,
  13,
  13,
  23,
  23,
  23,
  4,
  4,
  4,
  23,
  23,
  26,
  4,
  26,
  23,
  13,
  13,
  13,
  4,
  4,
  -1,
  -1,
  23,
  23,
  4,
  23,
  23,
  23,
  4,
  23,
  13,
  4,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
  23,
 

In [29]:
tfidf = TfidfVectorizer(max_features=50, analyzer='char', ngram_range=(3, 3))  

In [30]:
X = tfidf.fit_transform(X)


In [31]:
X

<99860x50 sparse matrix of type '<class 'numpy.float64'>'
	with 2302258 stored elements in Compressed Sparse Row format>

In [32]:
X.shape

(99860, 50)

In [33]:
y

[23,
 23,
 23,
 13,
 4,
 23,
 23,
 4,
 23,
 23,
 4,
 4,
 26,
 26,
 26,
 4,
 13,
 13,
 13,
 26,
 23,
 4,
 26,
 4,
 26,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 23,
 23,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 4,
 23,
 23,
 23,
 6,
 13,
 13,
 23,
 23,
 23,
 4,
 4,
 4,
 23,
 23,
 26,
 4,
 26,
 23,
 13,
 13,
 13,
 4,
 4,
 -1,
 -1,
 23,
 23,
 4,
 23,
 23,
 23,
 4,
 23,
 13,
 4,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 6,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 23,
 4,
 26,
 4,
 4,
 26,
 23,
 23,
 4,
 26,
 4,
 26,
 13,
 23,
 4,
 23,
 4,
 4,

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
X_test, y_test

(<19972x50 sparse matrix of type '<class 'numpy.float64'>'
 	with 459600 stored elements in Compressed Sparse Row format>,
 [4,
  4,
  6,
  23,
  23,
  4,
  26,
  26,
  23,
  4,
  13,
  4,
  -1,
  4,
  4,
  4,
  4,
  23,
  23,
  23,
  4,
  23,
  23,
  4,
  4,
  23,
  4,
  23,
  4,
  23,
  4,
  4,
  4,
  23,
  4,
  13,
  4,
  23,
  4,
  23,
  23,
  4,
  23,
  4,
  23,
  23,
  13,
  23,
  4,
  23,
  23,
  23,
  -1,
  4,
  23,
  4,
  4,
  4,
  23,
  6,
  23,
  4,
  23,
  4,
  23,
  26,
  23,
  4,
  4,
  23,
  23,
  13,
  23,
  4,
  13,
  23,
  23,
  23,
  4,
  4,
  26,
  4,
  23,
  4,
  4,
  13,
  23,
  4,
  4,
  23,
  4,
  4,
  4,
  23,
  23,
  23,
  23,
  23,
  4,
  4,
  23,
  4,
  23,
  23,
  4,
  23,
  23,
  23,
  4,
  23,
  23,
  23,
  23,
  23,
  4,
  23,
  4,
  23,
  23,
  23,
  23,
  4,
  13,
  4,
  23,
  23,
  4,
  23,
  13,
  23,
  23,
  4,
  23,
  4,
  23,
  -1,
  23,
  4,
  4,
  23,
  23,
  26,
  23,
  23,
  23,
  23,
  13,
  23,
  23,
  4,
  23,
  23,
  4,
  4,
  4,
  13,
  4

In [36]:
X_test.shape, len(y_test)

((19972, 50), 19972)

In [38]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [39]:
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

          -1       0.48      0.32      0.39       259
           4       0.77      0.71      0.74      7654
           6       0.27      0.23      0.24       562
          13       0.43      0.45      0.44      1200
          23       0.72      0.78      0.75      9250
          26       0.70      0.68      0.69      1047

    accuracy                           0.71     19972
   macro avg       0.56      0.53      0.54     19972
weighted avg       0.71      0.71      0.70     19972



In [41]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=5, 
                                 cv=5)

rand_search.fit(X_resampled, y_resampled)

KeyboardInterrupt: 

In [47]:
# Train the logistic regression model
from sklearn.linear_model import LogisticRegression


model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.03      0.35      0.05       259
           4       0.55      0.30      0.39      7654
           6       0.05      0.25      0.08       562
          13       0.11      0.29      0.15      1200
          23       0.60      0.21      0.32      9250
          26       0.16      0.48      0.24      1047

    accuracy                           0.27     19972
   macro avg       0.25      0.31      0.20     19972
weighted avg       0.50      0.27      0.32     19972

