In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine



In [4]:
def data_prep(df):
    df_removed_0=df[df["Blank_pipe-APW.raw filtered Peak height"]==0]
    df_new=df_removed_0.drop(["row ID", "row m/z","row retention time"], axis=1)
    final_df=pd.DataFrame()
    for column in df_new.columns:
        final_df[column.split('_')[0]]=df_new[column]
    final_df.drop('Blank',axis=1, inplace=True)
    final_df.index = range(len(final_df.index))
    return final_df 
    

In [5]:
def pair_count(w1, w2, data):
    columns = data.columns 
    count = 0 
    for column in columns:
        if data[column][w1] > 0 and data[column][w2]>0:
            count +=1 
    return count         

def compute_pmi(w1, w2, data):
    count_1 = pair_count(w1,w1,data)
    count_2 = pair_count(w2,w2,data)
    count_3 = pair_count(w1, w2,data)
    if count_2 == 0 or count_1 ==0: 
        return 0 
    else:
        pmi = count_3/(count_2*count_1)
        return pmi 
    
def all_pairs_pmi(data):
    data_index = data.index
    result = [[0]* len(data_index) for j in data_index]
    for w1 in data_index:
        for w2 in data_index:
            if w1 >= w2:
                result[w1][w2] = compute_pmi(w1, w2, data)   
            else:
                result[w1][w2] = result[w2][w1]
    return result

def all_pairs_pmi_new(data):
    data_index = data.index
    result = {}
    for w1 in data_index:
        d = {}
        for w2 in data_index:
            d[w2] = compute_pmi(w1, w2, data)           
            result[w1] = d
    return result

def compute_vector(column, data):
    corpus = data.index
    v = []
    for w1 in corpus:
        all_pmi = [compute_pmi(w1, w2, data) for w2 in corpus] 
        average_pmi = sum(all_pmi)/len(all_pmi)
        v.append(average_pmi)
    return v    


def compute_vectors(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
            if data[column][w1] > 0:
                average_pmi = sum(all_pmi)/len(all_pmi)
            else:
                average_pmi = 0 
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_vectors_fast(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            if data[column][w1] == 0:
                average_pmi = 0 
            else: 
                all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
                average_pmi = sum(all_pmi)/len(all_pmi)
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_consine_similarity(data):
    vector_for_samples = compute_vectors(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity

def compute_consine_similarity_fast(data):
    vector_for_samples = compute_vectors_fast(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity


# Result of CP11

In [5]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP11.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 46min 5s, sys: 22.8 s, total: 1h 46min 28s
Wall time: 3h 38min 52s


Unnamed: 0,score
Ceramic,1.0
Nquadrivalvis,0.132261
Ntabacum,0.130071
Nglauca,0.099191
Nattenuata,0.069572
Nobtusifolia,0.057996
Aludoviciana,0.028659
Nrustica,0.028394
Csericea,0.026486
AmericanSpirit,0.021152


# Result for CP12

In [6]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP12.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 53min 16s, sys: 32.7 s, total: 1h 53min 48s
Wall time: 3h 54min 43s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.119484
Ntabacum,0.112775
Nobtusifolia,0.105134
Nattenuata,0.07687
Nquadrivalvis,0.071562
Nrustica,0.070633
Aludoviciana,0.052675
Auvaursi,0.035129
Linflata,0.032037


# Result for CP13

In [7]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP13.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 2h 1min 42s, sys: 32.8 s, total: 2h 2min 15s
Wall time: 4h 11min 48s


Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.137327
Nglauca,0.10237
Nquadrivalvis,0.080251
Nobtusifolia,0.064591
Nattenuata,0.05312
Auvaursi,0.041788
Csericea,0.037052
Nrustica,0.036299
Aludoviciana,0.027956


# Result for CP14

In [8]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP14.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 2h 13min 54s, sys: 40.2 s, total: 2h 14min 34s
Wall time: 4h 36min 8s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.132508
Nquadrivalvis,0.078617
Nattenuata,0.069518
Ntabacum,0.063891
Nobtusifolia,0.057036
Aludoviciana,0.036168
Csericea,0.034172
Linflata,0.022941
Nrustica,0.022706


# Result for CP15

In [6]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP15.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 2h 18min 52s, sys: 36.5 s, total: 2h 19min 29s
Wall time: 4h 39min 35s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.069624
Ntabacum,0.0499
Nobtusifolia,0.03952
Nquadrivalvis,0.038631
Nattenuata,0.035959
Nrustica,0.034031
Csericea,0.025857
Linflata,0.020501
Aludoviciana,0.014755


# Result for CP16

In [7]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP16.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 50min 58s, sys: 23.2 s, total: 1h 51min 21s
Wall time: 1h 55min 54s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.127258
Ntabacum,0.096299
Nquadrivalvis,0.069444
Nobtusifolia,0.064372
Linflata,0.06435
Nattenuata,0.052732
Aludoviciana,0.048906
Auvaursi,0.03414
Nrustica,0.033465


# Result for CP17

In [8]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP17.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 51min 24s, sys: 27.1 s, total: 1h 51min 51s
Wall time: 1h 57min 9s


Unnamed: 0,score
Ceramic,1.0
Csericea,0.113384
Auvaursi,0.111127
Ntabacum,0.055527
Nglauca,0.03918
Gshallon,0.030344
AmericanSpirit,0.022829
Nquadrivalvis,0.021126
Rglabra,0.015702
Nrustica,0.015384


# Result for CP18

In [9]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP18.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: user 1h 50min 6s, sys: 23.9 s, total: 1h 50min 30s
Wall time: 1h 55min 58s


Unnamed: 0,score
Ceramic,1.0
Csericea,0.110601
Auvaursi,0.097831
AmericanSpirit,0.077407
Ntabacum,0.071229
Nglauca,0.065579
Linflata,0.033601
Nquadrivalvis,0.027689
Nobtusifolia,0.026564
Gshallon,0.023606
