In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cosine



In [2]:
def cosine_similarity(data, reference="Ceramic"):
    """Rank every column of ``data`` by cosine similarity to a reference column.

    The values are first passed through ``TfidfTransformer(use_idf=False)``;
    with the transformer's default settings this only L2-normalises each row
    of ``data``. The similarity (``1 - cosine distance``) is then computed
    between the normalised ``reference`` column and every column in turn,
    including the reference itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table whose columns are the items to compare.
    reference : str, default "Ceramic"
        Label of the column every other column is compared against.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'score'`` with one column per column of
        ``data``, ordered from highest to lowest similarity.

    Raises
    ------
    KeyError
        If ``reference`` is not a column of ``data``.
    """
    if reference not in data.columns:
        raise KeyError(
            "reference column {!r} not found in data".format(reference)
        )

    transformer = TfidfTransformer(use_idf=False)
    normalised = transformer.fit_transform(data.values.tolist())
    tf_idf = pd.DataFrame(normalised.toarray(), columns=data.columns)

    # Look the reference vector up once instead of on every iteration.
    reference_vector = tf_idf[reference]
    scores = {
        sample: [1 - cosine(reference_vector, tf_idf[sample])]
        for sample in tf_idf.columns
    }

    similarity = pd.DataFrame.from_dict(scores)
    similarity.index = ['score']
    # axis=1 orders the columns by the values found in the 'score' row.
    return similarity.sort_values(by='score', ascending=False, axis=1)

In [3]:
def tf_idf_score(df,
                 blank_column="Blank_pipe-APW.raw filtered Peak height",
                 meta_columns=("row ID", "row m/z", "row retention time")):
    """Prepare a peak table and score its samples with ``cosine_similarity``.

    Steps:

    1. Keep only the rows whose ``blank_column`` value is exactly 0.
    2. Drop the ``meta_columns``.
    3. Shorten every remaining column label to the text before its first
       ``'_'``; when several columns share a prefix the last one wins.
    4. Drop the column holding the blank itself (the prefix of
       ``blank_column``, ``'Blank'`` by default).
    5. Pass the result to ``cosine_similarity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``blank_column``, every label in ``meta_columns``
        and one column per sample.
    blank_column : str
        Column used for the row filter in step 1.
    meta_columns : sequence of str
        Columns removed before scoring.

    Returns
    -------
    pandas.DataFrame
        Whatever ``cosine_similarity`` returns for the prepared table.

    Raises
    ------
    KeyError
        If ``blank_column`` or any of ``meta_columns`` is missing from ``df``.
    """
    # Note: this keeps the rows where the blank reads 0, it does not remove them.
    blank_free = df[df[blank_column] == 0]
    intensities = blank_free.drop(list(meta_columns), axis=1)

    # A dict gives the same "first position, last value" behaviour for repeated
    # prefixes as assigning the columns one by one, and lets the frame be built
    # in a single step.
    by_sample = {
        column.split('_')[0]: intensities[column]
        for column in intensities.columns
    }
    final_df = pd.DataFrame(by_sample, columns=list(by_sample))
    final_df = final_df.drop(blank_column.split('_')[0], axis=1)
    return cosine_similarity(final_df)
    

## Results of CP1

In [4]:
# Load the CP1 table and show one row per sample, highest score first.
cp1 = pd.read_csv("Blind_Pipes-APW-CP1.csv")
tf_idf_score(cp1).T

Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.034838
Nglauca,0.034759
Nquadrivalvis,0.023162
Nobtusifolia,0.019656
Nattenuata,0.015899
Linflata,0.011595
Auvaursi,0.011287
Rglabra,0.011249
Csericea,0.010792


## Results of CP2

In [5]:
# Load the CP2 table and show one row per sample, highest score first.
cp2 = pd.read_csv("Blind_Pipes-APW-CP2.csv")
tf_idf_score(cp2).T

Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.058457
Nglauca,0.030858
Nquadrivalvis,0.025944
Nattenuata,0.019707
Nobtusifolia,0.016698
AmericanSpirit,0.015705
Csericea,0.015379
Linflata,0.010811
Nrustica,0.009876


## Results of CP3

In [6]:
# The CP3 table is bound to its own name so it matches the file it holds.
cp3 = pd.read_csv("Blind_Pipes-APW-CP3.csv")
tf_idf_score(cp3).transpose()

Unnamed: 0,score
Ceramic,1.0
Nglauca,0.03917
Ntabacum,0.034777
AmericanSpirit,0.029711
Nquadrivalvis,0.019263
Nattenuata,0.015068
Auvaursi,0.012582
Csericea,0.012557
Linflata,0.012138
Nobtusifolia,0.010708


## Results of CP4

In [7]:
# The CP4 table is bound to its own name so it matches the file it holds.
cp4 = pd.read_csv("Blind_Pipes-APW-CP4.csv")
tf_idf_score(cp4).transpose()

Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.037002
Nglauca,0.034919
Nquadrivalvis,0.029661
Nattenuata,0.025719
Nobtusifolia,0.020972
Csericea,0.015294
AmericanSpirit,0.014275
Nrustica,0.014082
Linflata,0.011996


## Results of CP5

In [8]:
# The CP5 table is bound to its own name so it matches the file it holds.
cp5 = pd.read_csv("Blind_Pipes-APW-CP5.csv")
tf_idf_score(cp5).transpose()

Unnamed: 0,score
Ceramic,1.0
Nglauca,0.0674
Ntabacum,0.041231
Nobtusifolia,0.040346
AmericanSpirit,0.035013
Nquadrivalvis,0.030859
Nrustica,0.023908
Nattenuata,0.018521
Csericea,0.017588
Rglabra,0.017544


## Results of CP6

In [9]:
# The CP6 table is bound to its own name so it matches the file it holds.
cp6 = pd.read_csv("Blind_Pipes-APW-CP6.csv")
tf_idf_score(cp6).transpose()

Unnamed: 0,score
Ceramic,1.0
Nglauca,0.056957
Ntabacum,0.040408
AmericanSpirit,0.023524
Linflata,0.022611
Rglabra,0.021404
Nquadrivalvis,0.021284
Gshallon,0.021157
Nobtusifolia,0.019407
Csericea,0.017982


## Results of CP7

In [10]:
# The CP7 table is bound to its own name so it matches the file it holds.
cp7 = pd.read_csv("Blind_Pipes-APW-CP7.csv")
tf_idf_score(cp7).transpose()

Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.032873
Nattenuata,0.021041
Nglauca,0.016383
AmericanSpirit,0.013368
Auvaursi,0.010518
Csericea,0.009732
Nquadrivalvis,0.008989
Nrustica,0.008911
Linflata,0.005926


## Results of CP8

In [11]:
# The CP8 table is bound to its own name so it matches the file it holds.
cp8 = pd.read_csv("Blind_Pipes-APW-CP8.csv")
tf_idf_score(cp8).transpose()

Unnamed: 0,score
Ceramic,1.0
Csericea,0.094102
Ntabacum,0.046877
Nglauca,0.036211
Auvaursi,0.026738
AmericanSpirit,0.024945
Nquadrivalvis,0.016443
Nattenuata,0.014026
Rglabra,0.013222
Gshallon,0.012689


## Results of CP9

In [12]:
# The CP9 table is bound to its own name so it matches the file it holds.
cp9 = pd.read_csv("Blind_Pipes-APW-CP9.csv")
tf_idf_score(cp9).transpose()

Unnamed: 0,score
Ceramic,1.0
Nattenuata,0.03224
Ntabacum,0.018441
Nglauca,0.016969
AmericanSpirit,0.01599
Nquadrivalvis,0.014012
Auvaursi,0.009605
Nobtusifolia,0.009186
Nrustica,0.009181
Linflata,0.008124


## Results of CP10

In [13]:
# The CP10 table is bound to its own name so it matches the file it holds.
cp10 = pd.read_csv("Blind_Pipes-APW-CP10.csv")
tf_idf_score(cp10).transpose()

Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.030054
Nattenuata,0.022419
Nglauca,0.022118
AmericanSpirit,0.016941
Nobtusifolia,0.014963
Auvaursi,0.014517
Nquadrivalvis,0.011558
Csericea,0.006945
Nrustica,0.006637


## Results of CP11

In [14]:
# The CP11 table is bound to its own name so it matches the file it holds.
cp11 = pd.read_csv("Blind_Pipes-APW-CP11.csv")
tf_idf_score(cp11).transpose()

Unnamed: 0,score
Ceramic,1.0
Nquadrivalvis,0.107178
Nattenuata,0.095134
Ntabacum,0.051717
Nglauca,0.050729
Nrustica,0.032249
Nobtusifolia,0.026754
Aludoviciana,0.017693
AmericanSpirit,0.009943
Auvaursi,0.008962


## Results of CP12

In [15]:
# The CP12 table is bound to its own name so it matches the file it holds.
cp12 = pd.read_csv("Blind_Pipes-APW-CP12.csv")
tf_idf_score(cp12).transpose()

Unnamed: 0,score
Ceramic,1.0
Nobtusifolia,0.111844
Nglauca,0.090764
Nattenuata,0.079312
Nrustica,0.068011
Ntabacum,0.059156
Nquadrivalvis,0.058784
Aludoviciana,0.054109
Linflata,0.025028
Vthapsus,0.025008


## Results of CP13

In [16]:
# The CP13 table is bound to its own name so it matches the file it holds.
cp13 = pd.read_csv("Blind_Pipes-APW-CP13.csv")
tf_idf_score(cp13).transpose()

Unnamed: 0,score
Ceramic,1.0
Ntabacum,0.088103
Nattenuata,0.065074
Nglauca,0.057685
Nobtusifolia,0.045247
Aludoviciana,0.037107
Nrustica,0.03342
Nquadrivalvis,0.032345
Linflata,0.020622
Vthapsus,0.019698


## Results of CP14

In [17]:
# The CP14 table is bound to its own name so it matches the file it holds.
cp14 = pd.read_csv("Blind_Pipes-APW-CP14.csv")
tf_idf_score(cp14).transpose()

Unnamed: 0,score
Ceramic,1.0
Nglauca,0.080963
Nattenuata,0.05177
Ntabacum,0.033147
Nobtusifolia,0.025509
Nquadrivalvis,0.021788
Nrustica,0.016886
Aludoviciana,0.012685
AmericanSpirit,0.009489
Linflata,0.008905


## Results of CP15

In [18]:
# The CP15 table is bound to its own name so it matches the file it holds.
cp15 = pd.read_csv("Blind_Pipes-APW-CP15.csv")
tf_idf_score(cp15).transpose()

Unnamed: 0,score
Ceramic,1.0
Nglauca,0.059489
Nobtusifolia,0.050925
Nattenuata,0.047037
Nrustica,0.042922
Ntabacum,0.041993
Aludoviciana,0.034869
Nquadrivalvis,0.027838
Linflata,0.022757
Vthapsus,0.017532


## Results of CP16

In [19]:
# The CP16 table is bound to its own name so it matches the file it holds.
cp16 = pd.read_csv("Blind_Pipes-APW-CP16.csv")
tf_idf_score(cp16).transpose()

Unnamed: 0,score
Ceramic,1.0
Nglauca,0.074779
Ntabacum,0.060409
Nattenuata,0.0571
Linflata,0.047818
Nobtusifolia,0.047015
Nrustica,0.040874
Aludoviciana,0.036889
Nquadrivalvis,0.028694
Vthapsus,0.026021


## Results of CP17

In [20]:
# The CP17 table is bound to its own name so it matches the file it holds.
cp17 = pd.read_csv("Blind_Pipes-APW-CP17.csv")
tf_idf_score(cp17).transpose()

Unnamed: 0,score
Ceramic,1.0
Auvaursi,0.073917
Csericea,0.064461
Gshallon,0.034688
Ntabacum,0.030824
Rglabra,0.023389
Nglauca,0.018475
Nattenuata,0.016751
AmericanSpirit,0.012886
Nrustica,0.011873


## Results of CP18

In [21]:
# The CP18 table is bound to its own name so it matches the file it holds.
cp18 = pd.read_csv("Blind_Pipes-APW-CP18.csv")
tf_idf_score(cp18).transpose()

Unnamed: 0,score
Ceramic,1.0
Csericea,0.072717
Auvaursi,0.059242
Ntabacum,0.044327
AmericanSpirit,0.035335
Nglauca,0.03424
Nattenuata,0.031331
Gshallon,0.025639
Rglabra,0.022886
Linflata,0.019009
