# Prepare Neurosynth data

This notebook does the following:
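1. Query and scrape the results of Neurosynth decoding for every state and frequency band, and save them to `streamlit_data/neurosynth.csv`
2. Create word clouds from the decoded terms and save them as PNG images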

In [57]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

In [None]:
# State indices 0-11 and the frequency bands iterated over in the cells below
states = list(range(12))
freq_bands = [
    'alpha',
    'beta',
    'deltatheta',
    'wideband',
]

### Scrape neurosynth results

In [58]:
# Functions for querying and scraping neurosynth results 
def show_100(driver):
    """Choose the "100" option in the results table's length dropdown.

    Looks up the element named ``decoding_results_table_length`` through
    ``driver`` and selects the option whose value is ``"100"``
    (presumably 100 rows per page - confirm against the page).
    """
    length_dropdown = driver.find_element(By.NAME, 'decoding_results_table_length')
    Select(length_dropdown).select_by_value("100")

def body_to_df(driver):
    """Parse the currently rendered decoding results table into a DataFrame.

    Reads ``driver.page_source``, locates the ``<table>`` whose id is
    ``decoding_results_table`` and, for every row with at least three
    ``<td>`` cells, stores the stripped text of the second cell as ``term``
    and the third cell converted to ``float`` as ``corr``.

    Parameters
    ----------
    driver
        Object exposing the page HTML through a ``page_source`` attribute.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``corr``, one row per parsed table row
        (empty when no row qualifies).

    Raises
    ------
    ValueError
        If the results table is not present in the page source, or if a
        ``corr`` cell cannot be converted to ``float``.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    table = soup.find('table', {'id': 'decoding_results_table'})
    if table is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(
            "Table 'decoding_results_table' was not found in the page source"
        )

    records = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Cells at index 1 and 2 are read below, so a row needs at least
        # three <td> cells; rows without them (e.g. no <td> at all) are skipped.
        if len(cells) < 3:
            continue
        records.append({
            'term': cells[1].text.strip(),
            'corr': float(cells[2].text.strip()),
        })

    return pd.DataFrame(records, columns=['term', 'corr'])

def next_page(driver):
    """Click the element with id ``decoding_results_table_next``."""
    driver.find_element(By.ID, "decoding_results_table_next").click()

def is_last_page(driver):
    """Return True when the results table is showing its final page.

    Reads the text of the ``<div>`` with id ``decoding_results_table_info``
    from ``driver.page_source``, expecting the form
    ``"Showing X to Y of Z entries"``, and reports whether the last shown
    entry ``Y`` has reached the total ``Z``. Parsing the numbers (rather
    than comparing against one fixed sentence) keeps the check valid for
    any number of entries and any page length.

    Parameters
    ----------
    driver
        Object exposing the page HTML through a ``page_source`` attribute.

    Returns
    -------
    bool
        ``True`` if the last shown entry equals (or exceeds) the total.

    Raises
    ------
    ValueError
        If the info element is missing or its text does not have the
        expected form.
    """
    import re  # local import: keeps this cell runnable without touching the import cell

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    info_div = soup.find('div', {'id': 'decoding_results_table_info'})
    if info_div is None:
        raise ValueError(
            "Element 'decoding_results_table_info' was not found in the page source"
        )
    info_text = info_div.text.strip()

    match = re.search(
        r'Showing\s+(\d[\d,]*)\s+to\s+(\d[\d,]*)\s+of\s+(\d[\d,]*)\s+entries',
        info_text,
    )
    if match is None:
        raise ValueError(f"Unexpected table info text: {info_text!r}")

    # Numbers carry thousands separators (e.g. "1,307"); strip them before parsing.
    _first_shown, last_shown, total = (
        int(group.replace(',', '')) for group in match.groups()
    )
    return last_shown >= total

In [60]:
# Scrape the decoding table for every state / frequency-band combination.
# Each combination's pages are collected in a list and concatenated once,
# so the final frame no longer depends on detecting the "first" iteration.
scraped_frames = []

for state in states:
    for freq in freq_bands:
        # Setup
        url = f'https://neurosynth.org/decode/?url=http://neurovault.org/media/images/MWFNMTNK/state{state}_{freq}.nii.gz'
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            show_100(driver)

            # Scrape the current page first and only then decide whether to
            # advance, so a result that fits on one page is not read twice.
            pages = []
            while True:
                pages.append(body_to_df(driver))
                if is_last_page(driver):
                    break
                next_page(driver)
        finally:
            # Always close the browser, even if scraping raised.
            driver.quit()

        df = pd.concat(pages, ignore_index=True)
        df['state'] = state
        df['freq'] = freq
        scraped_frames.append(df)

full_data = pd.concat(scraped_frames, ignore_index=True)

In [61]:
# Preview the combined scrape results for all states and frequency bands
full_data

Unnamed: 0,term,corr,state,freq
0,temporal,0.270,0,alpha
1,temporal gyrus,0.267,0,alpha
2,superior temporal,0.249,0,alpha
3,comprehension,0.235,0,alpha
4,listening,0.231,0,alpha
...,...,...,...,...
62731,anterior cingulate,-0.066,11,wideband
62732,prefrontal cortex,-0.066,11,wideband
62733,primary,-0.067,11,wideband
62734,cingulate,-0.069,11,wideband


In [62]:
# Write the combined results to CSV, leaving the DataFrame index out of the file
neurosynth_csv = 'streamlit_data/neurosynth.csv'
full_data.to_csv(neurosynth_csv, index=False)

### Create word clouds

In [66]:
from pathlib import Path

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Matplotlib colormap used for each frequency band's word cloud
freq_colors = {
    'alpha': 'Blues',
    'beta': 'Oranges',
    'deltatheta': 'Reds',
    'wideband': 'Purples',
}

# Preferred font. The path is machine-specific, so fall back to WordCloud's
# default font (font_path=None) when the file is not available.
helvetica_font = Path('C:/Users/tobia/AppData/Local/Microsoft/Windows/Fonts/Helvetica.ttf')
font_path = str(helvetica_font) if helvetica_font.is_file() else None

backgrounds = ['black', 'white']

# Make sure the per-background output folders exist before writing images.
for background in backgrounds:
    Path(f"streamlit_data/neurosynth/{background}").mkdir(parents=True, exist_ok=True)

for state in states:
    for freq in freq_bands:
        # The term -> correlation mapping does not depend on the background,
        # so it is built once per state / frequency band.
        selection = full_data[(full_data['freq'] == freq) & (full_data['state'] == state)]
        data_dict = selection.set_index('term')['corr'].to_dict()
        # NOTE(review): 'corr' can be negative and is passed on unchanged;
        # confirm that only the strongest positive terms are meant to be drawn.

        for background in backgrounds:
            wordcloud = WordCloud(
                width=900,
                height=1600,
                random_state=21,  # fixed seed so the layout is reproducible
                max_font_size=110,
                colormap=freq_colors[freq],
                prefer_horizontal=0.9,
                max_words=80,
                background_color=background,
                font_path=font_path,
                ).generate_from_frequencies(data_dict)

            wordcloud.to_file(f"streamlit_data/neurosynth/{background}/state{state}_{freq}.png")