In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns

import math

from string import punctuation

from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import FreqDist

from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

# Read abstracts and project csvs into dataframes

In [2]:
abs_dir = './abstracts/'
proj_dir = './projects/'
project = pd.read_csv(proj_dir + 'RePORTER_PRJ_C_FY2016.csv', 
                   encoding = "ISO-8859-1")
abstract = pd.read_csv(abs_dir + 'RePORTER_PRJABS_C_FY2016.csv',
                      encoding = "ISO-8859-1")

# Clustering NIH data by common keywords

In [3]:
# join abstracts to projects dataframe
df = project.merge(abstract, on='APPLICATION_ID', how='left')

### Clean abstracts (stemming, lowercase, no punctuation, remove stopwords)

In [4]:
# get customized stopwords
to_append = []

def customized_stopwords(to_append):
    """
    to_append: list; what words do you want to exclude from your analysis, in addition to the standard 
    stopwords like 'the', 'and, 'of', and so on? See above to_append variable for examples.
    returns: list; stopwords including to_append list
    """
    stop = stopwords.words('english')
    stop = stop + to_append
    return stop

stop = customized_stopwords(to_append)

# get list of words that are lowercase, with punctuation removed and words stemmed.
def get_wordlist(abstract):
    """
    returns a list of lowercase words from abstract with punctuation and stopwords removed.
    """
    try:
        # make words lowercase
        words = abstract.lower()
        
        # take out all punctuation and split strings into a list of words
        words = (''.join(c for c in words if c not in punctuation)).split(' ')
        
        # remove stopwords
        words = [" ".join([w for w in word.split() if not w in stop]) for word in words]
        
        # stem words using Porter's Stemmer
        stemmed = []
        for word in words:
            try:
                word = PorterStemmer().stem(word)
            except IndexError:
                word = word
            if word != '' and word.isalpha():
                stemmed.append(word)
        words = stemmed

    except AttributeError:
        words = []
    return words

### Filter for neuro-related abstracts.
I defined a project to be neuroscience-related if the abstract mentioned "brain" or "neur*" at least once every 100 non-stopwords.

In [5]:
def neuro_count(row):
    try:
        return (row.ABSTRACT_TEXT.count(' brain') + row.ABSTRACT_TEXT.count('neur'))
    except AttributeError:
        return 0
    
def wordlist_count(row):
    return len(row.wordlist)

def neuro_only(df, word_density=0.01):
    """
    input: dataframe
    word_density: how many neuro related words for every 100 words that are not stopwords in an abstract? Stopwords: the, and, or, not, etc.
    returns: dataframe containing neuro-related projects as defined above, with a column containing cleaned abstract keywords for analysis.
    """
    df['abs_neuro_count'] = df.apply(neuro_count, axis=1)
    df['wordlist'] = df.ABSTRACT_TEXT.apply(get_wordlist)
    df['wordlist_ct'] = df.apply(wordlist_count, axis=1)
    df['rel_neuro_count'] = df.abs_neuro_count / df.wordlist_ct
    return df[df.rel_neuro_count >= word_density]

Implement clean abstracts and filter for neuro projects.

In [6]:
df_100 = df.ix[:100]
df_neuro = neuro_only(df_100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Get feature vectors into a dataframe

In [7]:
def unique_words(df):
    """
    input: df containing 'wordlist' column
    returns: a dictionary containing unique words as keys, where dict[key][0] is total count of word over df
    """
    all_words = list(df.wordlist)
    all_words_list = [item for sublist in all_words for item in sublist]

    count_dict = {}
    
    # Get total word count for each word in dataset
    for word in all_words_list:
        if word not in count_dict.keys():
            count_dict[word] = [1]
        else:
            count_dict[word][0] += 1
        
    # for each project, get word count of each unique word
    for j in count_dict:
        count_dict[j] = count_dict[j] + ([0] * len(all_words))
    
    for i in range(len(all_words)):
        for word in all_words[i]:
            count_dict[word][i+1] += 1
    
    for key in count_dict:
        assert (count_dict[key][0] == sum(count_dict[key][1:])), print('this key,', key, 'is weird.')

    return count_dict

In [8]:
def fv_dict_to_df(df_neuro):
    """
    converts word_count_dict feature vectors into a dataframe with appropriate indices
    """
    word_count_dict = unique_words(df_neuro)
    
    feature_vec = pd.DataFrame(word_count_dict)
    fvec_index = ['Total'] + df_neuro.index.copy().tolist()
    feature_vec.index = fvec_index
    return feature_vec

In [9]:
fv_neuro = fv_dict_to_df(df_neuro)

### Compute tf-idf transformation vector and get td-idf transformation of feature vectors

In [10]:
def get_idf(feature_vector):
    """
    input: dataframe containing feature vectors where df.columns = words, and df.index is NIH funded project, and data contains count of word occurrences
    returns: numpy array for idf part
    """
    fv_no_tot = feature_vector.iloc[1:]
    fv_no_tot = fv_no_tot.replace(0, np.nan)
    
    # get idf = log (number of documents/number of documents with term t in it)
    return  np.log(len(feature_vector.iloc[0]) / (1 + np.array(fv_no_tot.count(axis=0))))

def get_tfidf(feature_vector, df_neuro):
    """
    feature_vector/total_words * idf
    """
    fv_no_tot = feature_vector.iloc[1:]
    
    # get tf
    total_words = df_neuro.ix[list(fv_no_tot.index), :].wordlist_ct
    tf = fv_no_tot.div(total_words, axis=0)
    
    # get idf
    idf = get_idf(feature_vector)
    
    # return td-idf
    return tf * idf

In [11]:
fv_neuro_tfidf = get_tfidf(fv_neuro, df_neuro)

### PCA

### Cluster

### Time series?

### Other analyses (grant funding and number, etc.)

In [12]:
%whos

Variable               Type                    Data/Info
--------------------------------------------------------
FreqDist               type                    <class 'nltk.probability.FreqDist'>
PorterStemmer          type                    <class 'nltk.stem.porter.PorterStemmer'>
TfidfVectorizer        type                    <class 'sklearn.feature_e<...>on.text.TfidfVectorizer'>
abs_dir                str                     ./abstracts/
abstract               DataFrame                      APPLICATION_ID    <...>n[70512 rows x 2 columns]
customized_stopwords   function                <function customized_stopwords at 0x115c9c2f0>
df                     DataFrame                      APPLICATION_ID ACT<...>[71827 rows x 46 columns]
df_100                 DataFrame                    APPLICATION_ID ACTIV<...>\n[101 rows x 50 columns]
df_neuro               DataFrame                   APPLICATION_ID ACTIVI<...>n\n[18 rows x 50 columns]
fv_dict_to_df          function               