# Read in Support Values by Gene Family

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Load the table of support values; the first CSV column becomes the row index.
unordered_pivot = pd.read_csv(
    "unordered_pivot.csv",
    index_col=0,
)

# Perform Permutation Test on Gene Family Orderings

In [3]:
def permute_rows(A):
    """
    Return a copy of A whose entries along the first axis (the rows of a
    matrix) are in random order; A itself is left unmodified.

    Randomness comes from NumPy's global random state.
    """
    shuffled = np.random.permutation(A)
    return shuffled

def permute_cols(A):
    """
    Return A with its columns in random order.

    Each column is moved as a unit (the same reordering is applied to every
    row), so the values within a column stay together. Implemented by
    shuffling the rows of the transpose and transposing back; randomness
    comes from NumPy's global random state.
    """
    as_rows = A.T
    shuffled = np.random.permutation(as_rows)
    return shuffled.T

def check_has_window(R,n,k,t):
    """
    Report whether some window of n consecutive entries of R contains at
    least k values that are >= t.

    R: one row of values; must support slicing and elementwise `>=`
       (e.g. a 1-D NumPy array)
    n: window width
    k: minimum number of qualifying entries required inside one window
    t: threshold, inclusive

    Returns True at the first qualifying window, otherwise False. A row
    shorter than n has no full window and therefore yields False.
    """
    last_start = len(R) - n
    for start in range(last_start + 1):
        hits = np.sum(R[start:start + n] >= t)
        if hits >= k:
            return True
    return False

def find_grouped_transfers(A,n,k,t,print_rows=False):
    """
    Return the indices of the rows of A that contain a cluster of
    well-supported entries.

    A row qualifies when check_has_window finds at least one window of n
    consecutive columns in which at least k entries are >= t.

    A: (r x c) matrix (2-D array) of support values
    n: window width, in columns
    k: minimum number of entries >= t required inside one window
    t: support threshold, inclusive
    print_rows: when True, print the index and values of every qualifying row

    Returns a list of qualifying row indices in ascending order (empty if
    no row qualifies).
    """
    # Unpacking into two names also enforces that A is two-dimensional.
    num_rows, _ = A.shape

    found_indices = []
    for r in range(num_rows):
        row = A[r,:]
        if check_has_window(row,n,k,t):
            if print_rows: print(r, row)
            found_indices.append(r)
    return found_indices

In [4]:
# scipy.stats.binom_test was deprecated in SciPy 1.7 and removed in 1.12;
# binomtest is its replacement. Fall back to the old name on older SciPy.
try:
    from scipy.stats import binomtest
except ImportError:
    binomtest = None
    from scipy.stats import binom_test

# --- Parameters ---
n = 2                      # window width, in adjacent columns
k = 2                      # minimum number of entries >= t inside one window
t = 100                    # support threshold (inclusive)
DF = unordered_pivot
A = DF.iloc[:,:-1].values  # every column except the last one
NUM_BOOTS = 500000         # number of random rows drawn to estimate the null rate
preshuffle_rows = False    # True: shuffle all cells of A; False: shuffle whole columns
SEED = 0                   # fixed seed so the permutation estimate is reproducible

np.random.seed(SEED)

vals = []  # never populated in this cell; kept in case another cell refers to it

# --- Null distribution ---
# Among random rows that have at least k entries >= t, estimate how often
# those entries land inside a single window of width n purely by chance.
found_random = 0
checked_random = 0
for _ in tqdm(range(NUM_BOOTS)):
    if preshuffle_rows:
        permuted = np.random.permutation(A.ravel()).reshape(A.shape)
    else:
        permuted = permute_cols(A)
    rand_row = permuted[np.random.choice(permuted.shape[0]),:]

    # Rows with fewer than k supported entries can never contain a
    # qualifying window, so they contribute to neither counter.
    if np.sum(rand_row >= t) < k:
        continue
    checked_random += 1

    if check_has_window(rand_row,n,k,t):
        found_random += 1

if checked_random == 0:
    raise ValueError(
        f"None of the {NUM_BOOTS} random rows had at least {k} entries >= {t}; "
        "the null probability cannot be estimated."
    )
p_null = found_random / checked_random

print(f"Out of {checked_random} randomly generated rows with {k} significant transfers, {found_random} had at least one window \nof size {n} with {k} transfers with greater than {t} support (p={p_null})\n")

# --- Observed data ---
found_indices = find_grouped_transfers(A,n,k,t,print_rows=False)
num_eligible = int(((A >= t).sum(axis=1) >= k).sum())  # rows with at least k entries >= t
print(f"There are {len(found_indices)} such rows (out of {num_eligible} with at least {k} significant transfers):")

# One-sided test: are clustered rows more frequent than the null rate predicts?
if binomtest is not None:
    p_value = binomtest(len(found_indices), num_eligible, p_null, alternative='greater').pvalue
else:
    p_value = binom_test(len(found_indices), num_eligible, p_null, 'greater')
print(f"Binomial Test: {p_value}")

  0%|          | 0/500000 [00:00<?, ?it/s]

Out of 50733 randomly generated rows with 2 significant transfers, 14794 had at least one window 
of size 2 with 2 transfers with greater than 100 support (p=0.291605069678513)

There are 39 such rows (out of 85 with at least 2 significant transfers):
Binomial Test: 0.0008023747777268061


In [6]:
# Show the rows of DF whose positions were collected in found_indices.
DF.iloc[found_indices]

Unnamed: 0,ORF1ab,S,ORF3a,E,M,ORF6,ORF7a,ORF7b,ORF8,N,ORF10,Total
CpY11+LYRa11,0.0,0.0,0.0,60.0,123.0,1000.0,1000.0,200.0,863.0,0.0,0.0,3246.0
RpS11+n44,0.0,0.0,900.0,100.0,0.0,1000.0,0.0,800.0,15.0,0.0,0.0,2815.0
HeB2013+Jiyuan_84,1000.0,1000.0,0.0,0.0,0.0,0.0,151.0,0.0,46.0,500.0,65.0,2762.0
YNLF_31C+n40,0.0,900.0,900.0,0.0,672.0,0.0,0.0,0.0,0.0,0.0,0.0,2472.0
Rs7327+n14,0.0,0.0,0.0,0.0,600.0,767.0,0.0,0.0,0.0,600.0,0.0,1967.0
As6526+Rs4255,0.0,0.0,0.0,0.0,0.0,200.0,0.0,0.0,856.0,822.0,0.0,1878.0
GX2013+n44,0.0,1000.0,100.0,0.0,0.0,0.0,0.0,440.0,0.0,0.0,0.0,1540.0
Rs7327+Rs9401,240.0,777.0,388.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1505.0
n2+n46,0.0,28.0,100.0,400.0,200.0,0.0,300.0,400.0,0.0,0.0,0.0,1428.0
Rs4231+n21,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,276.0,0.0,1376.0
