In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine



In [4]:
def data_prep(df):
    df_removed_0=df[df["Blank_pipe-APW.raw filtered Peak height"]==0]
    df_new=df_removed_0.drop(["row ID", "row m/z","row retention time"], axis=1)
    final_df=pd.DataFrame()
    for column in df_new.columns:
        final_df[column.split('_')[0]]=df_new[column]
    final_df.drop('Blank',axis=1, inplace=True)
    final_df.index = range(len(final_df.index))
    return final_df 
    

In [5]:
def pair_count(w1, w2, data):
    columns = data.columns 
    count = 0 
    for column in columns:
        if data[column][w1] > 0 and data[column][w2]>0:
            count +=1 
    return count         

def compute_pmi(w1, w2, data):
    count_1 = pair_count(w1,w1,data)
    count_2 = pair_count(w2,w2,data)
    count_3 = pair_count(w1, w2,data)
    if count_2 == 0 or count_1 ==0: 
        return 0 
    else:
        pmi = count_3/(count_2*count_1)
        return pmi 
    
def all_pairs_pmi(data):
    data_index = data.index
    result = [[0]* len(data_index) for j in data_index]
    for w1 in data_index:
        for w2 in data_index:
            if w1 >= w2:
                result[w1][w2] = compute_pmi(w1, w2, data)   
            else:
                result[w1][w2] = result[w2][w1]
    return result

def all_pairs_pmi_new(data):
    data_index = data.index
    result = {}
    for w1 in data_index:
        d = {}
        for w2 in data_index:
            d[w2] = compute_pmi(w1, w2, data)           
            result[w1] = d
    return result

def compute_vector(column, data):
    corpus = data.index
    v = []
    for w1 in corpus:
        all_pmi = [compute_pmi(w1, w2, data) for w2 in corpus] 
        average_pmi = sum(all_pmi)/len(all_pmi)
        v.append(average_pmi)
    return v    


def compute_vectors(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
            if data[column][w1] > 0:
                average_pmi = sum(all_pmi)/len(all_pmi)
            else:
                average_pmi = 0 
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_vectors_fast(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            if data[column][w1] == 0:
                average_pmi = 0 
            else: 
                all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
                average_pmi = sum(all_pmi)/len(all_pmi)
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_consine_similarity(data):
    vector_for_samples = compute_vectors(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity

def compute_consine_similarity_fast(data):
    vector_for_samples = compute_vectors_fast(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity


# Result of CP1

In [6]:
cp1=pd.read_csv("Blind_Pipes-APW-CP1.csv")
data =data_prep(cp1)


In [42]:
%%time
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 37min 15s, sys: 19.3 s, total: 1h 37min 35s
Wall time: 1h 40min 41s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.110715
Ntabacum,0.097221
Nquadrivalvis,0.063363
Csericea,0.043515
Nobtusifolia,0.040696
AmericanSpirit,0.03492
Nrustica,0.02732
Nattenuata,0.025598
Auvaursi,0.023604


# Result of CP2


In [43]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP2.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 36min 22s, sys: 19.1 s, total: 1h 36min 41s
Wall time: 1h 39min 35s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.111924
Ntabacum,0.109833
Nquadrivalvis,0.065858
Csericea,0.051812
Nobtusifolia,0.048446
AmericanSpirit,0.041034
Linflata,0.032915
Nattenuata,0.028831
Nrustica,0.022511


# Result for CP3

In [44]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP3.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 35min 32s, sys: 17.9 s, total: 1h 35min 49s
Wall time: 1h 38min 59s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.096013
AmericanSpirit,0.062255
Ntabacum,0.060697
Nquadrivalvis,0.035138
Csericea,0.032393
Auvaursi,0.027184
Linflata,0.025329
Nobtusifolia,0.019011
Nattenuata,0.01465


# Result for CP4

In [45]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP4.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 35min 11s, sys: 18.4 s, total: 1h 35min 29s
Wall time: 1h 38min 31s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.108143
Ntabacum,0.103681
Nquadrivalvis,0.06252
Csericea,0.05509
AmericanSpirit,0.050166
Nobtusifolia,0.048554
Linflata,0.037154
Nattenuata,0.026602
Nrustica,0.018161


# Result for CP5

In [46]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP5.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 35min 58s, sys: 18.2 s, total: 1h 36min 16s
Wall time: 1h 38min 59s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.146243
Ntabacum,0.131216
Nquadrivalvis,0.087143
AmericanSpirit,0.061309
Nobtusifolia,0.050621
Csericea,0.049816
Nrustica,0.030892
Nattenuata,0.030567
Linflata,0.025057


# Result for CP6

In [47]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP6.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 36min 40s, sys: 18.2 s, total: 1h 36min 59s
Wall time: 1h 40min 16s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.141733
Ntabacum,0.109194
Linflata,0.071387
Csericea,0.069245
Nquadrivalvis,0.056523
AmericanSpirit,0.046649
Nobtusifolia,0.037754
Gshallon,0.03406
Nrustica,0.033902


# Result for CP7

In [48]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP7.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 48min 1s, sys: 22.9 s, total: 1h 48min 24s
Wall time: 3h 41min 10s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.069743
Ntabacum,0.068984
Csericea,0.043923
Nquadrivalvis,0.037862
AmericanSpirit,0.035512
Nattenuata,0.026591
Auvaursi,0.021797
Nobtusifolia,0.018528
Nrustica,0.018375


# Result for CP8

In [50]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP8.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 2h 3min 46s, sys: 33.1 s, total: 2h 4min 19s
Wall time: 4h 14min 4s


Unnamed: 0,score
Ceramic,1.0
Csericea,0.13767
Ntabacum,0.084632
Nglauca,0.084126
Auvaursi,0.061551
AmericanSpirit,0.043044
Nquadrivalvis,0.034902
Nobtusifolia,0.025718
Linflata,0.024108
Nattenuata,0.0197


# Result for CP9

In [51]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP9.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 2h 15min 15s, sys: 40.1 s, total: 2h 15min 55s
Wall time: 4h 37min 31s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.083895
Ntabacum,0.07508
Nquadrivalvis,0.049139
Nattenuata,0.047976
AmericanSpirit,0.035689
Nrustica,0.028298
Nobtusifolia,0.026031
Csericea,0.021418
Auvaursi,0.017903


# Result for CP10

In [6]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP10.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 2h 12min 50s, sys: 36.1 s, total: 2h 13min 26s
Wall time: 4h 33min 22s


Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.09571
Nglauca,0.074948
Nobtusifolia,0.05873
Nquadrivalvis,0.05649
Auvaursi,0.035171
Nattenuata,0.032629
AmericanSpirit,0.030637
Csericea,0.023064
Aludoviciana,0.019357
