In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [4]:
dfTrain = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')

In [5]:
tfidf = TfidfVectorizer(max_features=50, analyzer='char', ngram_range=(3, 3))  

In [6]:
gene1_vec_train = tfidf.fit_transform(dfTrain["GeneA"])
gene2_vec_train = tfidf.transform(dfTrain["GeneB"])

In [7]:
X = np.hstack([gene1_vec_train.toarray(), gene2_vec_train.toarray()])
y = dfTrain["Interaction"]

In [8]:
X.shape

(213991, 100)

In [9]:
y

0         0
1         0
2         0
3         1
4         1
         ..
213986    0
213987    0
213988    0
213989    0
213990    0
Name: Interaction, Length: 213991, dtype: int64

In [10]:
gene1_vec_test = tfidf.fit_transform(dfTest["GeneA"])
gene2_vec_test = tfidf.transform(dfTest["GeneB"])

In [11]:
X_test = np.hstack([gene1_vec_test.toarray(), gene2_vec_test.toarray()])
y_test = dfTest["Interaction"]

In [12]:
X_test.shape

(53498, 100)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

lr_classifier = LogisticRegression(
    max_iter=2000,          
    C=1.0,
    solver='saga',
    random_state=42                  
)

lr_classifier.fit(X, y)

# Make predictions
y_pred = lr_classifier.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.87      0.83     35733
           1       0.68      0.56      0.62     17765

    accuracy                           0.77     53498
   macro avg       0.74      0.72      0.72     53498
weighted avg       0.76      0.77      0.76     53498



In [14]:
def calcConfusionMatrix(y, yPred):
    tnfp, fntp = confusion_matrix(y, yPred)
    tn, fp = tnfp
    fn, tp = fntp

    # tp: number of times model predicts male correctly
    # tn: number of times model predicts female correctly

    # fp: number of times model predicts male when female
    # fn: number of times model predicts female when male

    # balanced ber rate:
    fpRate = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnRate = fn / (fn + tp) if (fn + tp) > 0 else 0
    ber = (fpRate + fnRate) / 2

    return [tp, tn, fp, fn, ber]

In [16]:
tp, tn, fp, fn, ber = calcConfusionMatrix(y_test, y_pred)

print(f"True P: {tp}")
print(f"True N: {tn}")
print(f"False P: {fp}")
print(f"False N: {fn}")
print(f"Ber: {ber}")

True P: 9969
True N: 31084
False P: 4649
False N: 7796
Ber: 0.28447212107239145
