In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
def readCSV(path):
    """Lazily read a three-column, comma-separated file with a header row.

    The first line is treated as a header and discarded. Every remaining
    non-empty line must contain exactly three comma-separated fields; the
    third field is converted to ``int``.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Yields
    ------
    tuple
        ``(a, b, i)`` where ``a`` and ``b`` are the first two fields as
        strings and ``i`` is the third field as an ``int``.

    Raises
    ------
    ValueError
        If a non-empty line does not have exactly three fields or its third
        field is not an integer.
    """
    with open(path, 'r') as f:
        f.readline()  # discard the header row
        for l in f:
            l = l.strip()
            # Blank lines (typically a trailing newline at end of file) carry
            # no record; skipping them avoids a spurious unpacking error.
            if not l:
                continue
            a, b, i = l.split(',')
            yield a, b, int(i)

In [3]:
# Materialise every (geneA, geneB, interaction) row of the training file.
train = list(readCSV("train.csv"))

In [4]:
# Popularity baseline: count how many non-zero-labelled training pairs each
# gene takes part in. The rows are taken from the in-memory `train` list
# (loaded from train.csv in an earlier cell) instead of parsing the file again.
geneCount = defaultdict(int)
totalInteractions = 0

for geneA, geneB, interaction in train:
    if interaction == 0:
        continue
    geneCount[geneA] += 1
    geneCount[geneB] += 1
    totalInteractions += 1

# (count, gene) pairs, most frequent first; ties fall back to gene name order.
mostPopular = sorted(((n, g) for g, n in geneCount.items()), reverse=True)

# Collect the most popular genes until their cumulative count passes the cutoff.
# NOTE(review): every pair adds 2 to the gene counts (one per endpoint) but only
# 1 to totalInteractions, so this cutoff is a quarter of the summed counts
# rather than half of them - confirm that this is the intended threshold.
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalInteractions/2:
        break

In [5]:
# Materialise every (geneA, geneB, interaction) row of the evaluation file.
test = list(readCSV("test.csv"))

In [6]:
def getAccuracy(yPred, yTrue):
    """Return the fraction of positions where prediction and label agree.

    Parameters
    ----------
    yPred : sequence
        Predicted labels; its length determines how many positions are compared.
    yTrue : sequence
        Ground-truth labels, indexable at every position of ``yPred``.

    Returns
    -------
    float
        ``matches / len(yPred)``, or ``0.0`` when ``yPred`` is empty (there is
        nothing to score, and dividing would raise ``ZeroDivisionError``).

    Raises
    ------
    IndexError
        If ``yTrue`` is shorter than ``yPred``.
    """
    total = len(yPred)
    if total == 0:
        return 0.0

    correct = sum(1 for i in range(total) if yPred[i] == yTrue[i])
    return correct / total

In [7]:
# counts:  gene -> number of training rows (any label) the gene appears in.
# mapping: gene -> set of partner genes from rows labelled interaction == 1.
#          A gene that only appears in non-interacting rows gets no entry.
counts = {}
mapping = {}

for geneA, geneB, interaction in train:
    for gene in (geneA, geneB):
        counts[gene] = counts.get(gene, 0) + 1

    if interaction == 1:
        # Record the edge in both directions.
        mapping.setdefault(geneA, set()).add(geneB)
        mapping.setdefault(geneB, set()).add(geneA)

In [8]:
def getGenesInteractedWith(gene):
    """Return the set of genes recorded in ``mapping`` as partners of ``gene``.

    The stored set itself is returned when ``gene`` has an entry; otherwise a
    new empty set is returned.
    """
    return mapping.get(gene, set())

In [9]:
def Jaccard(geneA, geneB):
    """Jaccard similarity between the partner sets of two genes.

    Looks both genes up in ``mapping`` and returns
    ``|A & B| / |A | B|`` for their partner sets. Returns ``0`` when either
    gene has no entry in ``mapping`` or when the union is empty.
    """
    if not (geneA in mapping and geneB in mapping):
        return 0

    partnersA, partnersB = mapping[geneA], mapping[geneB]

    shared = len(partnersA & partnersB)
    combined = len(partnersA | partnersB)
    if combined > 0:
        return shared / combined
    return 0

In [10]:
# Ground-truth labels of the evaluation rows, in file order.
yTrue = [label for _, _, label in test]

In [11]:
def maxJaccard(threshold, pairs=None):
    """Accuracy of a max-Jaccard threshold classifier over labelled gene pairs.

    For each ``(geneA, geneB, interaction)`` row, every gene returned by
    ``getGenesInteractedWith(geneA)`` (other than ``geneB`` itself) is compared
    with ``geneB`` via ``Jaccard``. The pair is predicted as interacting when
    the largest of those similarities is strictly greater than ``threshold``.

    Parameters
    ----------
    threshold : float
        Similarity cutoff; a pair is predicted positive only above it.
    pairs : iterable of (geneA, geneB, interaction), optional
        Rows to score. Defaults to the module-level ``test`` list, which
        preserves the original single-argument behaviour. Passing a held-out
        validation split here lets the threshold be tuned without touching
        the evaluation data.

    Returns
    -------
    float
        Fraction of rows whose prediction equals ``interaction``; ``0.0`` when
        there are no rows to score.
    """
    if pairs is None:
        pairs = test

    totalPredictions = 0
    correct = 0
    for geneA, geneB, interaction in pairs:
        totalPredictions += 1

        # Highest similarity between geneB and any known partner of geneA.
        # Jaccard never returns a negative value, so default=0 matches a
        # running maximum that starts at 0.
        bestSimilarity = max(
            (
                Jaccard(gene, geneB)
                for gene in getGenesInteractedWith(geneA)
                if gene != geneB
            ),
            default=0,
        )

        # Predict an interaction only when the best similarity clears the cutoff.
        predicted = bestSimilarity > threshold
        correct += int(predicted == interaction)

    # Nothing to score: avoid dividing by zero.
    if totalPredictions == 0:
        return 0.0
    return correct / totalPredictions


In [12]:
# Sweep a handful of similarity cutoffs and report the accuracy of each.
thresholds = [.1, .2, .3, .4, .5]
accuracies = []
for threshold in thresholds:
    accuracy = maxJaccard(threshold)
    accuracies.append(accuracy)
    print(f"Threshold {threshold} -> Accuracy: {accuracy:.2f}")

# Highest accuracy seen across the sweep (-inf if no threshold was tried).
bestAccuracy = max(accuracies, default=float("-inf"))

print(f"Best accuracy: {bestAccuracy:.2f}")

Threshold 0.1 -> Accuracy: 0.83
Threshold 0.2 -> Accuracy: 0.78
Threshold 0.3 -> Accuracy: 0.73
Threshold 0.4 -> Accuracy: 0.70
Threshold 0.5 -> Accuracy: 0.68
Best accuracy: 0.83
