### Imports

In [None]:
import pandas as pd
import numpy as np

from process_text import text_to_words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

### 1. Read files

In [None]:
# Load the preprocessed notes; the file is semicolon-separated
notes_path = "data/processed/notes.csv"
notes = pd.read_csv(notes_path, sep=";")

### 2. Exploratory analysis
First, we convert the notes to a bag-of-words dataframe containing the 1000 most common terms, including bi-grams.

In [None]:
# First preprocess the texts by converting to words
notes['words'] = notes['text'].apply(lambda x : ' '.join(text_to_words(x)))

# We then vectorize texts by recording the presence (binary=True) of
# individual terms and pairs of terms, limited to 1000 features
count_vect = CountVectorizer(ngram_range=(1,2),
                             max_features=1000,
                             binary=True
                            )
term_counts = count_vect.fit_transform(notes['words'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older versions
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# Transform output into a dataframe that shares the index of `notes`, so the
# concatenation below aligns rows even if that index is not 0..n-1
terms = pd.DataFrame(term_counts.toarray(),
                     columns=list(feature_names),
                     index=notes.index)

# Put the outcome in front of the term indicator columns
terms = pd.concat([notes['outcome'], terms], axis=1)

Over many bootstrap resamples (sampling with replacement), we select the 10 strongest predictors of outcome according to $\chi^{2}$ in each resample, and pool the selected terms over all resamples. 

In [None]:
# Bootstrap the Chi^2 feature selection and pool the terms picked per resample
num_resamples = 100
select_n_best_terms = 10

best_terms = []

for resample_seed in range(num_resamples):

    # Draw a sample of the same size as the data, with replacement;
    # the loop counter doubles as the random seed for reproducibility
    terms_sample = terms.sample(frac=1, replace=True, random_state=resample_seed)
    sample_features = terms_sample.drop(['outcome'], axis=1)

    # Fit the chi2-based selector on this resample
    kbest_selector = SelectKBest(chi2, k=select_n_best_terms)
    kbest_selector.fit(sample_features, terms_sample['outcome'])

    # Pool the selected terms directly into one flat list, order is irrelevant
    best_terms.extend(sample_features.columns[kbest_selector.get_support()])

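Compute the fraction of resamples in which each term was selected, and the Spearman correlation with the outcome for the `select_n_best_terms` most frequently selected terms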

In [None]:
# Fraction of resamples in which each term was selected
selection_counts = pd.Series(best_terms).value_counts()
fractions = selection_counts / num_resamples

In [None]:
# Spearman correlation with the outcome for the select_n_best_terms terms
# that lead the selection-fraction ranking
most_selected_terms = fractions.index[:select_n_best_terms].tolist()
top_n_terms = terms[most_selected_terms + ['outcome']]

# Keep only the column of correlations with the outcome; reset_index moves
# the term names into a regular column
spearman_matrix = top_n_terms.corr(method="spearman")
correlations = spearman_matrix[['outcome']].reset_index()

We compute the $\chi^{2}$ value for each term in the dataset, store the result in a new dataframe and apply a Holm-Bonferroni correction

In [None]:
# Refit chi2 on entire dataset
# (drop(columns=...) instead of a positional axis, which pandas 2.0 removed)
term_features = terms.drop(columns=['outcome'])
kbest_selector = SelectKBest(chi2, k=select_n_best_terms)
kbest_selector.fit(term_features, terms['outcome'])

# Create dataframe with the chi2 score and p-value of every term
results = pd.DataFrame({'term' : term_features.columns,
                        'chisquared_score' : kbest_selector.scores_,
                        'chisquared_pvalue' : kbest_selector.pvalues_})

# Apply Holm-Bonferroni correction: rank terms from strongest to weakest
# and multiply the p-value at rank r by (m + 1 - r), with m the number of terms
results = results.sort_values(['chisquared_score'], ascending=False)
results['rank'] = np.arange(len(results)) + 1
results['hb_factor'] = (len(results) + 1 - results['rank'])

# Holm is a step-down procedure: once one hypothesis is not rejected, none of
# the weaker ones may be rejected either. The running maximum enforces this,
# and the clip keeps the adjusted values valid probabilities (<= 1).
results['p_corr'] = (results['chisquared_pvalue'] * results['hb_factor']).cummax().clip(upper=1)

# Determine significance
alpha = 0.01
results['significant'] = results['p_corr'] < alpha

Add the generalization ratio and the Spearman correlations to the dataframe

In [None]:
# Add the fraction of bootstrap resamples in which each term was selected;
# terms that were never selected get 0
results['generalization_ratio'] = results['term'].apply(lambda term : fractions.get(term, 0))

# Attach the Spearman correlations with a left join, so every term is kept
results = results.merge(correlations, how='left', left_on='term', right_on='index')

num_significant = results['significant'].sum()
print("A total number of {} variables are significant".format(num_significant))

In [None]:
# Show the 20 terms with the highest generalization ratio
results_by_ratio = results.sort_values(by='generalization_ratio', ascending=False)
results_by_ratio.head(20)