In [27]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
import re
from collections import Counter
from ipywidgets import interact, interactive, fixed, interact_manual, interactive_output
from IPython.display import display, display_html
from itertools import chain,cycle
from nltk.util import ngrams



In [28]:
# Load the corpus table. Each `token` cell arrives as text (presumably a
# stringified list such as "['a', 'b']" — TODO confirm against the CSV) and is
# converted to a real list by dropping the surrounding brackets, removing the
# single quotes, and splitting on ", ".
filename = 'all_text.csv'
all_text = pd.read_csv(filename)
all_text["token"] = all_text["token"].apply(
    lambda raw: raw.strip("[]").replace("'", "").split(", ")
)


In [29]:
def display_side_by_side(*args, titles=cycle([''])):
    """Render several DataFrames next to each other in a single HTML output.

    Args:
        *args: Objects exposing ``to_html()`` (DataFrames), shown in order.
        titles: Iterable of heading strings matched to ``args`` by position.
            If it is exhausted before ``args``, the remaining tables get
            '</br>' as a placeholder heading. Titles are inserted as raw HTML.

    Returns:
        None. The markup is emitted with ``display_html(..., raw=True)``.
    """
    pieces = []
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        # Patch only the opening tag, once. Replacing every 'table' substring
        # would also rewrite the closing tag and any cell text that happens to
        # contain "table" (e.g. the word "table" in a word-count chart).
        table_html = df.to_html().replace('<table', '<table style="display:inline"', 1)
        pieces.append('<th style="text-align:center"><td style="vertical-align:top">')
        pieces.append(f'<h2>{title}</h2>')
        pieces.append(table_html)
        pieces.append('</td></th>')
    display_html(''.join(pieces), raw=True)

In [30]:
def word_count(files = [0], top_x = 10):
    """Show the most frequent tokens of each selected document side by side.

    Args:
        files: Row labels of ``all_text`` identifying the documents to count.
        top_x: Number of most frequent tokens to keep per document.

    Returns:
        None. One table per document is rendered through
        ``display_side_by_side``, titled with that row's ``filename``.
    """
    tables = []
    headings = []
    for row_label in files:
        # Tally the document's tokens and put the tallies in a two-column frame.
        tally = Counter(all_text.token[row_label])
        frequencies = pd.DataFrame(tally.items(), columns=['word','count'])
        ranked = frequencies.sort_values(by='count',ascending=False)
        tables.append(ranked.head(top_x))
        headings.append(all_text.filename[row_label])

    display_side_by_side(*tables, titles=headings)

In [31]:
def listOfTuples(l1, l2):
    """Pair the items of ``l1`` and ``l2`` by position.

    Returns:
        A list of ``(item_from_l1, item_from_l2)`` tuples. Pairing stops at the
        end of the shorter input.
    """
    return [(first, second) for first, second in zip(l1, l2)]
# Pair every filename with its row label so the selector can show the name
# while handing the row label to the callbacks.
filename_index = listOfTuples(all_text["filename"], all_text.index.tolist())

# Multi-select list of documents; values 0 and 1 start out selected.
w = widgets.SelectMultiple(
    description='Document',
    options=filename_index,
    value=[0,1],
    disabled=False
)

In [32]:
# Slider that supplies `top_x` (starts at 10).
a = widgets.IntSlider(value=10)
# Lay the document selector and the slider out in one row.
ui = widgets.HBox(children=[w, a])
# Bind the two controls to word_count's `files` and `top_x` arguments.
out = widgets.interactive_output(
    word_count,
    {'files': w, 'top_x': a},
)


In [33]:
# Print the usage hint, then show the controls (`ui`) followed by the
# word-count output area (`out`).
print("Hold down the ctl button and click to select multiple documents from the list.")
display(ui, out)

Hold down the ctl button and click to select multiple documents from the list.


HBox(children=(SelectMultiple(description='Document', index=(0, 1), options=(('C1', 0), ('C2', 1), ('C3', 2), …

Output(outputs=({'output_type': 'display_data', 'data': {'text/html': '<th style="text-align:center"><td style…

In [34]:
# Fixed list of row labels (1, 2, 3).
# NOTE(review): looks like a leftover from manual testing — confirm it is still needed.
file_index = [1,2,3]

In [35]:
def n_gram_word_count(files = [0], top_x = 10, x_grams = 2):
    """Show the most frequent n-grams of each selected document side by side.

    Args:
        files: Row labels of ``all_text`` identifying the documents to analyse.
        top_x: Number of most frequent n-grams to keep per document.
        x_grams: Number of consecutive tokens per group (2 gives bigrams).

    Returns:
        None. One table per document is rendered through
        ``display_side_by_side``, titled with that row's ``filename``.
    """
    titles = []
    charts = []
    # Iterate the caller's selection. Looping over the module-level
    # `file_index` made the `files` argument (and any selector bound to it)
    # have no effect on the result.
    for ind in files:
        x_grams_groups = ngrams(all_text.token[ind], x_grams)
        c = Counter(x_grams_groups)
        gram_count = pd.DataFrame(list(c.items()), columns=['word','count'])
        wc_top = gram_count.sort_values(by='count',ascending=False).head(top_x)
        titles.append(all_text.filename[ind])
        charts.append(wc_top)

    display_side_by_side(*charts, titles=titles)

In [38]:
# Sliders for how many n-grams to list per document (1-25, starts at 3) and
# how many tokens make up one group (1-5, starts at 2).
top_slider = widgets.IntSlider(value=3, min=1, max=25, description='Top # of words:')
gram_slider = widgets.IntSlider(value=2, min=1, max=5, description='Group count:')
# Reuse the document selector `w` alongside the two sliders.
ui_grams = widgets.HBox(children=[w, top_slider, gram_slider])
# Bind the three controls to n_gram_word_count's arguments.
out_ngrams = widgets.interactive_output(
    n_gram_word_count,
    {'files': w, 'top_x': top_slider, 'x_grams': gram_slider},
)

In [39]:
# Show the n-gram controls followed by their output area.
display(ui_grams, out_ngrams)

HBox(children=(SelectMultiple(description='Document', index=(2, 3, 4), options=(('C1', 0), ('C2', 1), ('C3', 2…

Output(outputs=({'output_type': 'display_data', 'data': {'text/html': '<th style="text-align:center"><td style…

In [9]:
def find_word_with_context(document=all_text.filename, search="pain", char_bef=50, char_aft=50):
    """Find every match of ``search`` in one document, with surrounding text.

    Args:
        document: Filename of the document to search; compared against the
            ``filename`` column of ``all_text``. The default is that whole
            column.
        search: Pattern handed to ``re.finditer``, so regex metacharacters are
            interpreted as such. An empty string returns a prompt instead.
        char_bef: Number of characters of context to include before a match.
        char_aft: Number of characters of context to include after a match.

    Returns:
        A list of snippets (newlines removed), or a message string when
        ``search`` is empty or nothing was found.
    """
    if search == "":
        return "Enter search word above."
    doc = all_text.index[all_text.filename == document]
    words = all_text.text[doc].reset_index().text[0]
    res = []
    for m in re.finditer(search, words):
        # Clamp both bounds at 0. A negative bound is read by slicing as an
        # offset from the END of the text, so a match closer than `char_bef`
        # characters to the start produced an empty or unrelated snippet.
        start = max(m.start() - char_bef, 0)
        stop = max(m.end() + char_aft, 0)
        word_w_context = words[start:stop]
        if word_w_context != "":
            res.append(word_w_context.replace("\n", ""))
    if len(res) == 0:
        return "No results could be found."
    return res

In [10]:
# The keyword has to be the function's parameter name (`search`) for "pelvic"
# to become the initial search text; `search_word` matches no parameter of
# find_word_with_context.
interact(find_word_with_context, search="pelvic");

interactive(children=(Dropdown(description='document', options=('C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8'…