In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine



In [2]:
def data_prep(df):
    df_removed_0=df[df["Blank_pipe-APW.raw filtered Peak height"]==0]
    df_new=df_removed_0.drop(["row ID", "row m/z","row retention time",
                              'Gshallon_pipe-APW.raw filtered Peak height'], axis=1)

    final_df=pd.DataFrame()
    for column in df_new.columns:
        final_df[column.split('_')[0]]=df_new[column]
    final_df.drop('Blank',axis=1, inplace=True)
    final_df.index = range(len(final_df.index))
    return final_df 
    

In [3]:
def pair_count(w1, w2, data):
    columns = data.columns 
    count = 0 
    for column in columns:
        if data[column][w1] > 0 and data[column][w2]>0:
            count +=1 
    return count         

def compute_pmi(w1, w2, data):
    count_1 = pair_count(w1,w1,data)
    count_2 = pair_count(w2,w2,data)
    count_3 = pair_count(w1, w2,data)
    if count_2 == 0 or count_1 ==0: 
        return 0 
    else:
        pmi = count_3/(count_2*count_1)
        return pmi 
    
def all_pairs_pmi(data):
    data_index = data.index
    result = [[0]* len(data_index) for j in data_index]
    for w1 in data_index:
        for w2 in data_index:
            if w1 >= w2:
                result[w1][w2] = compute_pmi(w1, w2, data)   
            else:
                result[w1][w2] = result[w2][w1]
    return result

def all_pairs_pmi_new(data):
    data_index = data.index
    result = {}
    for w1 in data_index:
        d = {}
        for w2 in data_index:
            d[w2] = compute_pmi(w1, w2, data)           
            result[w1] = d
    return result

def compute_vector(column, data):
    corpus = data.index
    v = []
    for w1 in corpus:
        all_pmi = [compute_pmi(w1, w2, data) for w2 in corpus] 
        average_pmi = sum(all_pmi)/len(all_pmi)
        v.append(average_pmi)
    return v    


def compute_vectors(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
            if data[column][w1] > 0:
                average_pmi = sum(all_pmi)/len(all_pmi)
            else:
                average_pmi = 0 
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_vectors_fast(data):
    corpus = data.index
    #all_pairs = all_pairs_pmi(data)
    all_pairs = all_pairs_pmi(data)
    columns = data.columns
    vector_for_samples ={}
    for column in columns:
        v = []
        for w1 in corpus:
            if data[column][w1] == 0:
                average_pmi = 0 
            else: 
                all_pmi = [all_pairs[w1][w2] for w2 in corpus] 
                average_pmi = sum(all_pmi)/len(all_pmi)
            v.append(average_pmi)
            vector_for_samples[column] = v        
    return vector_for_samples 

def compute_consine_similarity(data):
    vector_for_samples = compute_vectors(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity

def compute_consine_similarity_fast(data):
    vector_for_samples = compute_vectors_fast(data)
    sample_names  = data.columns 
    d = {}
    for sample in sample_names:
        d[sample] = [1 -cosine(vector_for_samples[sample], vector_for_samples['Ceramic'])]
    similarity=pd.DataFrame.from_dict(d)
    similarity.index=['score']
    similarity.sort_values(by='score', ascending=False, axis=1, inplace=True)    
    return similarity


# Result of CP1

In [4]:
cp1=pd.read_csv("Blind_Pipes-APW-CP1.csv")
data =data_prep(cp1)


In [5]:
%%time
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 20min 39s
Wall time: 1h 4min 27s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.115453
Ntabacum,0.101201
Nquadrivalvis,0.066381
Csericea,0.042478
Nobtusifolia,0.042366
AmericanSpirit,0.036323
Nrustica,0.028719
Nattenuata,0.027153
Linflata,0.024028


# Result of CP2


In [6]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP2.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 17min 14s
Wall time: 57min 27s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.1173
Ntabacum,0.114076
Nquadrivalvis,0.069422
Csericea,0.051835
Nobtusifolia,0.050707
AmericanSpirit,0.042603
Linflata,0.035246
Nattenuata,0.030235
Nrustica,0.023838


# Result for CP3

In [7]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP3.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 16min 14s
Wall time: 1h 12min


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.093054
AmericanSpirit,0.062854
Ntabacum,0.061254
Nquadrivalvis,0.035766
Csericea,0.031454
Auvaursi,0.029596
Linflata,0.024466
Nobtusifolia,0.019443
Aludoviciana,0.015294


# Result for CP4

In [8]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP4.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 17min 2s
Wall time: 57min 41s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.112336
Ntabacum,0.10602
Nquadrivalvis,0.065158
Csericea,0.054744
AmericanSpirit,0.052223
Nobtusifolia,0.05037
Linflata,0.039405
Nattenuata,0.027334
Nrustica,0.018671


# Result for CP5

In [9]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP5.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 19min 57s
Wall time: 57min 57s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.15128
Ntabacum,0.132654
Nquadrivalvis,0.089034
AmericanSpirit,0.064265
Nobtusifolia,0.052842
Csericea,0.048403
Nattenuata,0.031727
Nrustica,0.031708
Linflata,0.025823


# Result for CP6

In [10]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP6.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 18min 12s
Wall time: 57min 26s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.138477
Ntabacum,0.102701
Linflata,0.067544
Csericea,0.062974
Nquadrivalvis,0.054616
AmericanSpirit,0.047117
Aludoviciana,0.038787
Nobtusifolia,0.037977
Nrustica,0.032867


# Result for CP7

In [11]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP7.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 20min 11s
Wall time: 57min 53s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.072509
Ntabacum,0.069928
Csericea,0.04373
Nquadrivalvis,0.038769
AmericanSpirit,0.036626
Nattenuata,0.027118
Auvaursi,0.021053
Linflata,0.019756
Nobtusifolia,0.019108


# Result for CP8

In [12]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP8.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 13min 47s
Wall time: 1h


Unnamed: 0,score
Ceramic,1.0
Csericea,0.134674
Nglauca,0.083832
Ntabacum,0.082937
Auvaursi,0.061383
AmericanSpirit,0.043824
Nquadrivalvis,0.034821
Nobtusifolia,0.025641
Linflata,0.023426
Nattenuata,0.020389


# Result for CP9

In [13]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP9.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 12min 12s
Wall time: 1h 9min 50s


Unnamed: 0,score
Ceramic,1.0
Nglauca,0.088804
Ntabacum,0.07857
Nquadrivalvis,0.051769
Nattenuata,0.049452
AmericanSpirit,0.035947
Nrustica,0.029848
Nobtusifolia,0.027635
Csericea,0.021442
Linflata,0.017362


# Result for CP10

In [14]:
%%time 
cp=pd.read_csv("Blind_Pipes-APW-CP10.csv")
data =data_prep(cp)
similarity= compute_consine_similarity_fast(data)
similarity.transpose()

CPU times: total: 18min 51s
Wall time: 57min 58s


Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.094536
Nglauca,0.073392
Nobtusifolia,0.058098
Nquadrivalvis,0.057732
Auvaursi,0.034773
Nattenuata,0.032168
AmericanSpirit,0.030725
Csericea,0.021384
Aludoviciana,0.019915
