In [56]:
import warnings
warnings.filterwarnings("ignore")

In [57]:
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [58]:
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')

In [59]:
tfidf = TfidfVectorizer(max_features=50, analyzer='char', ngram_range=(3, 3))  

In [60]:
gene1_vec_train = tfidf.fit_transform(dfTrain["GeneA"])
gene2_vec_train = tfidf.transform(dfTrain["GeneB"])

In [61]:
X = np.hstack([gene1_vec_train.toarray(), gene2_vec_train.toarray()])
y = dfTrain["Interaction"]

In [62]:
X.shape

(239664, 100)

In [63]:
y

0         1
1         0
2         0
3         0
4         0
         ..
239659    0
239660    0
239661    0
239662    0
239663    0
Name: Interaction, Length: 239664, dtype: int64

In [64]:
gene1_vec_test = tfidf.fit_transform(dfTest["GeneA"])
gene2_vec_test = tfidf.transform(dfTest["GeneB"])

In [65]:
X_test = np.hstack([gene1_vec_test.toarray(), gene2_vec_test.toarray()])
y_test = dfTest["Interaction"]

In [66]:
X_test.shape

(59916, 100)

In [67]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

X_resampled, y_resampled = smote.fit_resample(X, y)

In [68]:
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.89      0.81     40038
           1       0.63      0.37      0.46     19878

    accuracy                           0.72     59916
   macro avg       0.68      0.63      0.64     59916
weighted avg       0.70      0.72      0.69     59916



In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=5, 
                                 cv=5)

rand_search.fit(X_resampled, y_resampled)