# Unimportant settings

In [3]:
# Notebook display setting: with ast_node_interactivity set to "all", the
# notebook shows the output of every expression in a cell instead of only
# the last one. It is only a quality-of-life (QoL) convenience.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import packages

This is where we import the **Python tools** that will help us tackle this problem.

In [4]:
import pandas as pd # to read and analyse data
from sklearn.feature_extraction.text import CountVectorizer # to help us count words in text
import matplotlib.pyplot as plt # to help us plot!
import numpy as np # to help us with some linear algebra stuff
from sklearn.metrics.pairwise import cosine_similarity # to compute cosine between two vectors!

# Import the data

In [5]:
# Locations of the two sample CSV files in the workshop's GitHub repository.
clinical_trials_data_path = (
    'https://raw.githubusercontent.com/vohcolab/TREC-Clinical-Workshop/main/data/sample_collection.csv'
)
patients_data_path = (
    'https://raw.githubusercontent.com/vohcolab/TREC-Clinical-Workshop/main/data/patients_sample.csv'
)

# Read both files the same way: the first CSV column becomes the frame's index.
clinical_trials_data, patients_data = (
    pd.read_csv(csv_url, index_col=0)
    for csv_url in (clinical_trials_data_path, patients_data_path)
)

# Extracting more meaning out of text (documents, actually!)

Let's count how many times each word appears in a patient description!

What we are doing here is representing each document as a **vector**!

<img src="https://memegenerator.net/img/instances/43555096.jpg">

Take the following picture:

<img src="http://3.bp.blogspot.com/_tOOi3R89e74/TUeyueig7ZI/AAAAAAAAAJQ/QHL-VLEWook/s1600/vector_space.png">

This allows us to mathematically compare documents!

<img src="https://2.bp.blogspot.com/-saTZSoc5RAA/WfghS_CMvJI/AAAAAAAAGBg/PcZvT0QNZCcPJq8fAv2v_cSwrnagdm9RgCK4BGAYYCw/s1600/cosine_similarity.PNG">

# Let's apply this!

Unnamed: 0,000,0000,0001,0006,001,0014,002,00547248,007,0099,...,ʻlaterʼ,ʻnowʼ,αvβ5,β1,κb,λs,μg,μl,τhe,ﬂow
20141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201411,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(3221, 17027)

Unnamed: 0,000,0000,0001,0006,001,0014,002,00547248,007,0099,...,ʻlaterʼ,ʻnowʼ,αvβ5,β1,κb,λs,μg,μl,τhe,ﬂow
20141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
201411,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,000,0000,0001,0006,001,0014,002,00547248,007,0099,...,ʻlaterʼ,ʻnowʼ,αvβ5,β1,κb,λs,μg,μl,τhe,ﬂow
NCT00000408,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT00000492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NCT00000501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Now let's start matching!

id,NCT00000408,NCT00000492,NCT00000501,NCT00001853,NCT00004727,NCT00005127,NCT00005485,NCT00005757,NCT00077948,NCT00108381,...,NCT01663402,NCT01766830,NCT01891084,NCT01921842,NCT02164513,NCT02459171,NCT02459327,NCT02498964,NCT02612896,NCT02633319
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20141,0.446483,0.434488,0.47216,0.455604,0.446577,0.255207,0.474056,0.385708,0.34002,0.140257,...,0.372836,0.488586,0.391392,0.331964,0.378725,0.401752,0.467547,0.445988,0.489418,0.439264
201410,0.296592,0.379519,0.40656,0.331801,0.358902,0.184568,0.396485,0.387298,0.210445,0.0,...,0.28229,0.440942,0.372678,0.291667,0.348596,0.304866,0.364308,0.393039,0.427447,0.366414
201411,0.34006,0.195748,0.262583,0.296839,0.218611,0.245285,0.276003,0.235907,0.229253,0.029488,...,0.202289,0.252547,0.238352,0.266485,0.228922,0.259202,0.289059,0.218456,0.31337,0.294375


(51, 3170)

<img src="https://memegenerator.net/img/instances/66482719.jpg">

> If the angle between a clinical trial vector and a patient vector is very small, what should the cosine be?

## How well does this system perform?

<img src="https://i.imgflip.com/5dtlpc.jpg">

In [4]:
# Location of the qrels sample file in the workshop's GitHub repository.
qrels_path = (
    'https://raw.githubusercontent.com/vohcolab/TREC-Clinical-Workshop/main/data/qrels_sample.csv'
)

# No index_col is passed here, so pandas assigns its default integer index.
qrels = pd.read_csv(qrels_path)

# Bare expression so the notebook renders the frame as the cell output.
qrels

Unnamed: 0,qid,docid,rel
0,20141,NCT00000408,0
1,20141,NCT00000492,1
2,20141,NCT00000501,0
3,20141,NCT00001853,0
4,20141,NCT00004727,0
...,...,...,...
3341,20159,NCT02459171,0
3342,20159,NCT02459327,0
3343,20159,NCT02498964,0
3344,20159,NCT02612896,0
