In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine



In [5]:
def data_prep(df):
    df_removed_0=df[df["Blank_pipe-APW.raw filtered Peak height"]==0]
    df_new=df_removed_0.drop(["row ID", "row m/z","row retention time",
                              'Gshallon_pipe-APW.raw filtered Peak height'], axis=1)
    final_df=pd.DataFrame()
    for column in df_new.columns:
        final_df[column.split('_')[0]]=df_new[column]
    final_df.drop('Blank',axis=1, inplace=True)
    final_df.index = range(len(final_df.index))
    return final_df 
    

In [6]:
def pair_count(w1, w2, data):
    columns = data.columns 
    count = 0 
    for column in columns:
        if data[column][w1] > 0 and data[column][w2]>0:
            count +=1 
    return count         

def compute_pmi(w1, w2, data):
    count_1 = pair_count(w1,w1,data)
    count_2 = pair_count(w2,w2,data)
    count_3 = pair_count(w1, w2,data)
    if count_2 == 0 or count_1 ==0: 
        return 0 
    else:
        pmi = count_3/(count_2*count_1)
        return pmi 
    
def all_pairs_pmi(data):
    data_index = data.index
    result = [[0]* len(data_index) for j in data_index]
    for w1 in data_index:
        for w2 in data_index:
            if w1 >= w2:
                result[w1][w2] = compute_pmi(w1, w2, data)   
            else:
                result[w1][w2] = result[w2][w1]
    return result

def all_pairs_pmi_new(data):
    data_index = data.index
    result = {}
    for w1 in data_index:
        d = {}
        for w2 in data_index:
            d[w2] = compute_pmi(w1, w2, data)           
            result[w1] = d
    return result

def compute_vector(column, data):
    corpus = data.index
    v = []
    for w1 in corpus:
        all_pmi = [compute_pmi(w1, w2, data) for w2 in corpus] 
        average_pmi = sum(all_pmi)/len(all_pmi)
        v.append(average_pmi)
    return v    


def compute_vectors(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
            if data[column][w1] > 0:
                average_pmi = sum(all_pmi)/len(all_pmi)
            else:
                average_pmi = 0 
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_vectors_fast(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            if data[column][w1] == 0:
                average_pmi = 0 
            else: 
                all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
                average_pmi = sum(all_pmi)/len(all_pmi)
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_consine_similarity(data):
    vector_for_samples = compute_vectors(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity

def compute_consine_similarity_fast(data):
    vector_for_samples = compute_vectors_fast(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity


# Result of BCP2

In [7]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP2.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 16min 15s
Wall time: 57min 9s


Unnamed: 0,score
Ceramic,1.0
Nquadrivalvis,0.135746
Ntabacum,0.133161
Nglauca,0.097428
Nattenuata,0.070354
Nobtusifolia,0.059016
Nrustica,0.029807
Aludoviciana,0.029285
Csericea,0.025329
Linflata,0.022706


# Result for BCP3

In [8]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP3.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 17min 44s
Wall time: 57min 10s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.117975
Ntabacum,0.113572
Nobtusifolia,0.105772
Nattenuata,0.077032
Nquadrivalvis,0.07266
Nrustica,0.071756
Aludoviciana,0.056563
Auvaursi,0.035951
Linflata,0.033604


# Result for BCP4

In [9]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP4.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 13min 28s
Wall time: 57min 2s


Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.135115
Nglauca,0.102993
Nquadrivalvis,0.080387
Nobtusifolia,0.064209
Nattenuata,0.052797
Auvaursi,0.042789
Nrustica,0.036892
Csericea,0.034252
Aludoviciana,0.029447


# Result for BCP5

In [10]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP5.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 17min 53s
Wall time: 57min 34s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.133324
Nquadrivalvis,0.079288
Nattenuata,0.070182
Ntabacum,0.062967
Nobtusifolia,0.057533
Aludoviciana,0.036116
Csericea,0.030801
Linflata,0.023274
Nrustica,0.022977


# Result for BCP16

In [12]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP6.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 13min 24s
Wall time: 1h 24s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.121477
Ntabacum,0.095137
Nquadrivalvis,0.06936
Linflata,0.06653
Nobtusifolia,0.064059
Aludoviciana,0.052901
Nattenuata,0.052133
Auvaursi,0.035291
Nrustica,0.033823


# Result for BCP7

In [13]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP7.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 16min 17s
Wall time: 59min 30s


Unnamed: 0,score
Ceramic,1.0
Auvaursi,0.111974
Csericea,0.101575
Ntabacum,0.053712
Nglauca,0.039147
AmericanSpirit,0.02282
Nquadrivalvis,0.020811
Rglabra,0.015901
Nrustica,0.015137
Linflata,0.014862


# Result for BCP8

In [14]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-BCP8.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 17min 49s
Wall time: 58min 7s


Unnamed: 0,score
Ceramic,1.0
Csericea,0.104628
Auvaursi,0.096767
AmericanSpirit,0.077298
Ntabacum,0.069541
Nglauca,0.06468
Linflata,0.032849
Nquadrivalvis,0.027861
Nobtusifolia,0.02608
Nattenuata,0.022582
