In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [3]:
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')

In [4]:
tfidf = TfidfVectorizer(max_features=50, analyzer='char', ngram_range=(3, 3))  

In [5]:
gene1_vec_train = tfidf.fit_transform(dfTrain["GeneA"])
gene2_vec_train = tfidf.transform(dfTrain["GeneB"])

In [6]:
X = np.hstack([gene1_vec_train.toarray(), gene2_vec_train.toarray()])
y = dfTrain["Interaction"]

In [7]:
X.shape

(213991, 100)

In [8]:
y

0         0
1         0
2         0
3         1
4         1
         ..
213986    0
213987    0
213988    0
213989    0
213990    0
Name: Interaction, Length: 213991, dtype: int64

In [9]:
gene1_vec_test = tfidf.fit_transform(dfTest["GeneA"])
gene2_vec_test = tfidf.transform(dfTest["GeneB"])

In [10]:
X_test = np.hstack([gene1_vec_test.toarray(), gene2_vec_test.toarray()])
y_test = dfTest["Interaction"]

In [11]:
X_test.shape

(53498, 100)

In [12]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

X_resampled, y_resampled = smote.fit_resample(X, y)

In [13]:
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.32      0.48     35733
           1       0.42      0.98      0.59     17765

    accuracy                           0.54     53498
   macro avg       0.69      0.65      0.53     53498
weighted avg       0.79      0.54      0.51     53498



In [14]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=5, 
                                 cv=5)

rand_search.fit(X_resampled, y_resampled)

KeyboardInterrupt: 