# Exploring 1st Iteration Data

### Setup

In [1]:
# General Imports
import inspect
import os
from itertools import chain

import nltk

In [2]:
# Locate the directories holding the custom code & the data
#  This is a bit of a hack: rather than hard-coding locations, the paths are derived
#  from the current working directory, assuming that everything lives one level up
notebook_dir = os.getcwd()
base_path = os.path.dirname(notebook_dir)
dat_path = os.path.join(base_path, 'Data')

In [3]:
# Add custom code location to path
#import sys
#sys.path.append(base_path)

# Import custom code
from consc.data import load_folder

### Functions

These are some quick helper functions for working with the loaded data.

In [4]:
def get_all_words(dat, field):
    """Pool the contents of one attribute across a collection of documents.

    Parameters
    ----------
    dat : iterable of objects
        Documents to collect from; each one must have the attribute named by `field`.
    field : str
        Name of the attribute to read from every document.
        The attribute is expected to be an iterable (e.g. a list of tokenized words).

    Returns
    -------
    list
        All items from each document's `field`, concatenated in document order.
    """

    pooled = []
    for doc in dat:
        # Flatten as we go: add each document's items, not the container itself
        pooled.extend(getattr(doc, field))

    return pooled

### Loading

In [5]:
# Term whose documents are analyzed throughout this notebook
current_term = 'autism'

# Both document types are loaded for the same term, from the same data directory
load_args = (current_term, dat_path)
paper_dat = load_folder('Papers', *load_args)
press_dat = load_folder('PRs', *load_args)

#### Sanity Check Loaded Data

In [6]:
# Number of loaded documents: papers on the first line, press releases on the second
print(len(paper_dat), len(press_dat), sep='\n')

498
333


#### Check examples of data

In [7]:
# Paper: print each field of the first loaded paper as `name : value`
#  NOTE(review): `__dict__` is *called* here, so the document class presumably defines
#  it as a method; on a plain object it is a mapping attribute and the call would fail.
#  Confirm against the class definition in `consc`.
paper_fields = paper_dat[0].__dict__()
for name, value in paper_fields.items():
    print(name, '\t:\t', value)

Huntington Disease (HD) is an autosomal dominant disorder characterized by motor, cognitive and behavioral features caused by a CAG expansion in the HTT gene beyond 35 repeats. The juvenile form (JHD) may begin before the age of 20years and is associated with expanded alleles as long as 60 or more CAG repeats. In this study, induced pluripotent stem cells were generated from skin fibroblasts of a 8-year-old child carrying a large size mutation of 84 CAG repeats in the HTT gene. HD appeared at age 3 with mixed psychiatric (i.e. autistic spectrum disorder) and motor (i.e. dystonia) manifestations.


In [24]:
# Press Release: print each field of one example press release as `name : value`
#  NOTE(review): `__dict__` is *called* here, so the document class presumably defines
#  it as a method; on a plain object it is a mapping attribute and the call would fail.
#  Confirm against the class definition in `consc`.
press_fields = press_dat[25].__dict__()
for name, value in press_fields.items():
    print(name, '\t:\t', value)

url 	:	 http://www.eurekalert.org/pub_releases/2017-01/e-pim012417.php
title 	:	 Prenatal infection may alter brain development via epigenetic changes | EurekAlert! Science News
text 	:	 Experimental Biology 2017April 22 - 26, 2017Chicago, IL2018 AAAS Annual Meeting
February 15 - 19, 2018Austin, TXPhiladelphia, PA, Jan. 24, 2017 - Maternal infection during pregnancy increases the risk for psychiatric disorders in the child, but the path between the two is something of a mystery. In a study published in Biological Psychiatry, senior author Professor Urs Meyer of the University of Zurich-Vetsuisse in Zurich, Switzerland and colleagues use a mouse model to show that activation of the mother's immune system may cause long-term alterations in the programming of the offspring's genome, known as epigenetic modifications, which lead to behavioral abnormalities in adulthood."This study suggests that immunologic activation may be the connection between maternal infection to epigenetic changes th

### Pre-Processing

In [None]:
# Pool the words of every document into one flat list per data type
word_field = 'words'
all_paper_words = get_all_words(paper_dat, word_field)
all_press_words = get_all_words(press_dat, word_field)

### NLTK 

In [None]:
# Papers: build the word frequency distribution, then plot it
#  The integer passed to `plot` limits how many samples are drawn
n_plot = 25
paper_fd = nltk.FreqDist(all_paper_words)
paper_fd.plot(n_plot)

In [None]:
# PRs: build the word frequency distribution, then plot it
#  The integer passed to `plot` limits how many samples are drawn
n_plot = 25
press_fd = nltk.FreqDist(all_press_words)
press_fd.plot(n_plot)

### Inspecting Objects

In [12]:
# List the members of the first press release object as (name, value) pairs
inspect.getmembers(press_dat[0])