# Quantitative Text Analysis

After importing packages and text data, and defining pre-processing, this code does the following:

1. Analyze all answers from all datasheets together
2. Analyze answers from each individual datasheet
3. Analyze answers on a question-by-question basis

For each of these categories, we explore:
- Term frequency (which words occur most often)
- Collocations (which pairs and triplets of words co-occur most often)
- Lexical diversity (ratio of unique words to total words)

We also conduct topic modeling and calculate TF-IDF cosine similarity to compare across individual datasheets (section 2) and across questions (section 3).

## Load packages and define text processing

In [None]:
import warnings
##silence every warning category for the rest of the notebook
warnings.filterwarnings('ignore')
##NOTE(review): redundant with the blanket filter above, which already ignores DeprecationWarning too
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
##import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

!pip install gensim
import gensim
from gensim import corpora, models
from gensim.matutils import cossim
from gensim.models import CoherenceModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

!pip install pyLDAvis
import pyLDAvis

In [None]:
import warnings
warnings.filterwarnings('ignore')

##load nltk datasets
nltk.download('punkt')
##newer NLTK releases look up 'punkt_tab' (not 'punkt') for word_tokenize/sent_tokenize;
##fetch both so tokenization works regardless of the installed NLTK version
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('omw-1.4')

In [None]:
##load the Excel workbook of text data into a dataframe
datasheetsContent = pd.read_excel("datasheetscontent.xlsx")
####print(datasheetsContent)

In [None]:
##text preprocessing

##load resources
lemmatizer = WordNetLemmatizer()
##placeholder / URL-fragment tokens to discard
na_words = ["nan", "NA", "N/A", "NAN", "http", "HTTP", ".com", "c"]
##store them lowercase: process() lowercases the text before comparing tokens with this set,
##so an uppercase entry such as "NA" could never match the token "na"
na_words = {word.lower() for word in na_words}
stop_words = set(stopwords.words('english'))

##define processing function
def process(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    Steps: lowercase the text, remove URLs, tokenize, keep alphabetic
    tokens that are neither stopwords nor na words, then lemmatize.

    Args:
        text: the raw string to process.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    import re  ##local import: the notebook only imports `re` in a later cell

    ##make text lowercase
    text = text.lower()
    ##filter out urls BEFORE tokenizing: once the tokenizer splits a url and the
    ##alphabetic filter runs, no token can still start with 'http://', so a
    ##token-level url check never removes anything
    text = re.sub(r"(?:https?://|www\.)\S+", " ", text)
    ##tokenize text
    tokens = word_tokenize(text)
    ##keep alphabetic tokens only, dropping stopwords and na words in the same pass
    tokens = [
        word for word in tokens
        if word.isalpha() and word not in stop_words and word not in na_words
    ]
    ##lemmatize text
    return [lemmatizer.lemmatize(word) for word in tokens]


# 1. Analyze all answers from all datasheets 

In [None]:
##cast every answer from every datasheet to str and join them into one space-separated string
answerTexts = datasheetsContent['answer'].astype(str)
allAnswersString = " ".join(answerTexts)
####print(allAnswersString)

In [None]:
##process text data

##run the shared pre-processing function over the combined answers
lemmatizedWords = process(text=allAnswersString)
####print(lemmatizedWords)

####allAnswersString = " ".join(lemmatizedWords)
####print(allAnswersString)

### *Word Frequency*

In [None]:
##analyze word frequency

frequencyDistribution = FreqDist(lemmatizedWords)
##print 20 most common words
print(frequencyDistribution.most_common(20))
##visualize word frequencies
fig, ax = plt.subplots(figsize=(12,9), layout="constrained")
plt.title("Term Frequency", fontsize=28, pad = 30)
##show=False so FreqDist.plot does not render the figure itself; the labels,
##line width, tick styling and savefig below must apply to this same figure
frequencyDistribution.plot(20, cumulative=False, show=False)
plt.xlabel("Term", fontsize = 18)
plt.ylabel("Number of Occurrences", fontsize = 18)
##thicken the plotted frequency line
for line in ax.get_lines():
    line.set_linewidth(5)
##angle the term labels so they do not overlap
plt.setp(
    ax.get_xticklabels(),
    rotation=45,
    ha="right",
    rotation_mode="anchor"
)
ax.tick_params(axis='x', labelsize = 14)
ax.tick_params(axis='y', labelsize=14)

##save before show, while the figure is still open
plt.savefig('frequencyImage.png', dpi=300)
plt.show()



### *Collocations*

In [None]:
##find collocations

##bigrams (pairs of words that occur together often)
bigram_finder = BigramCollocationFinder.from_words(lemmatizedWords)
###only keep those that occur at least three times
bigram_finder.apply_freq_filter(3)
###rank by PMI (Pointwise Mutual Information) and keep the 10 best
bigram_collocations = bigram_finder.nbest(BigramAssocMeasures.pmi, 10)

print("Top bigram collocations with frequency:")
for bigram in bigram_collocations:
    ###look up the raw count of this bigram
    freq = bigram_finder.ngram_fd[bigram]
    print(f"{bigram}: {freq}")
    
##trigrams (triplets of words that occur together often)
trigram_finder = TrigramCollocationFinder.from_words(lemmatizedWords)
###only keep those that occur at least three times
trigram_finder.apply_freq_filter(3)
###rank by PMI and keep the 10 best
trigram_collocations = trigram_finder.nbest(TrigramAssocMeasures.pmi, 10)

print("\nTop trigram collocations with frequency:")
for trigram in trigram_collocations:
    ###look up the raw count of this trigram
    freq = trigram_finder.ngram_fd[trigram]
    print(f"{trigram}: {freq}")


### *Lexical Diversity*

In [None]:
##calculate lexical diversity of all answers across papers/questions
totalWords = len(lemmatizedWords)
uniqueWords = len(set(lemmatizedWords))
##ratio of unique words to total words; 0 when there are no words at all
##(avoids a ZeroDivisionError on an empty token list)
lexicalDiversity = uniqueWords / totalWords if totalWords > 0 else 0
print(f"Lexical diversity: {lexicalDiversity}")


### *Keyword search*

In [None]:
##primary keyword searching
import re

keywordList = ["reflex", "reflection", "marginalization", "power",
              "hierarchy", "oppression", "inequity", "historical",
              "social", "cultural", "socio-cultural", "domain",
              "discipline", "institution", "norms", "neutrality",
              "objectivity", "subjectivity", "bias", "positionality",
              "identity", "relationality","disclosure","accountability",
              "transparency"]

##case-insensitive substring search, so "power" also counts inside "powerful"
for word in keywordList:
    ##count once and derive Found / Not Found from it;
    ##re.escape makes any regex metacharacter in a keyword match literally
    count = len(re.findall(re.escape(word), allAnswersString, re.I))
    print(f"{word} Found" if count else f"{word} Not Found")
    print(count)


In [None]:
##explore keywords in context

def keywordsInContext(text, keyword, windowSize=7): ##windowSize defines how many surrounding words to print
    """Return every occurrence of keyword in text with its surrounding words.

    The text is split on whitespace. A word matches case-insensitively either
    as-is or after leading/trailing punctuation is removed, so "powerful,"
    and "Powerful." both match the keyword "powerful".

    Args:
        text: the string to search.
        keyword: the word to look for (matched as a whole word).
        windowSize: how many words to include on each side of a match.

    Returns:
        A list of context strings, one per match, in order of appearance.
        Words in the context keep their original case and punctuation.
    """
    import string  ##local import so the function does not rely on another cell

    target = keyword.lower()
    words = text.split()
    results = []
    for i, word in enumerate(words):
        lowered = word.lower()
        ##whitespace splitting leaves punctuation attached to words, so also compare the stripped form
        if lowered == target or lowered.strip(string.punctuation) == target:
            start = max(0, i - windowSize)
            end = min(len(words), i + windowSize + 1)
            results.append(" ".join(words[start:end]))
    return results

##look up individual keywords here whenever their surrounding context needs a closer read
keywordsInContext(allAnswersString, keyword="powerful")


# 2. Analyze answers from each individual datasheet

In [None]:
##group data by individual papers
##(one row per paper: its answers cast to str and joined with single spaces)
answersByPaper = datasheetsContent.groupby("paperacronym")["answer"]
joinedByPaper = answersByPaper.apply(lambda answers: " ".join(answers.astype(str)))
paperAnswersString = joinedByPaper.reset_index()

###print(paperAnswersString)

In [None]:
##process text data

##clean, tokenize and lemmatize each paper's combined answers
paperAnswersString["processedTokens"] = paperAnswersString["answer"].map(process)
####print(paperAnswersString)

##summing lists concatenates them, giving one token list per paper
tokensByPaper = paperAnswersString.groupby('paperacronym')['processedTokens']
groupedTokens = tokensByPaper.sum().reset_index()
###print(groupedTokens)


### *Word Frequency*

In [None]:
##analyze word frequency for each paper

##define function to print and plot the frequency distribution of the top num_words words
def plotWordFrequency(tokens, title, num_words=20):
    """Print and plot the most frequent tokens.

    Args:
        tokens: list of token strings to count.
        title: title drawn above the plot.
        num_words: how many of the most common tokens to print and plot.
    """
    frequencyDistribution = FreqDist(tokens)
    ##print as many words as are plotted (previously a fixed 20 regardless of num_words)
    print(frequencyDistribution.most_common(num_words))
    plt.figure(figsize=(12,6))
    ##show=False so the title below is applied before the figure is rendered
    frequencyDistribution.plot(num_words, cumulative=False, show=False)
    plt.title(title)
    plt.show()

##draw one frequency plot per paper
for paper, tokens in zip(groupedTokens['paperacronym'], groupedTokens['processedTokens']):
    plotWordFrequency(tokens, f'Word Frequency for {paper}')


### *Collocations*

In [None]:
##get collocations for each paper

##define function to get bigrams
def get_bigram(tokens, top_n):
    """Return the top_n bigrams ranked by PMI as (bigram, frequency) pairs.

    Bigrams that occur fewer than 2 times are discarded before ranking.
    """
    bigramFinder = BigramCollocationFinder.from_words(tokens)
    ##minimum frequency of 2
    bigramFinder.apply_freq_filter(2)
    ranked = bigramFinder.nbest(BigramAssocMeasures.pmi, top_n)
    counts = bigramFinder.ngram_fd
    return [(pair, counts[pair]) for pair in ranked]

##define function to get trigrams
def get_trigram(tokens, top_n):
    """Return the top_n trigrams ranked by PMI as (trigram, frequency) pairs.

    Trigrams that occur fewer than 2 times are discarded before ranking.
    """
    trigramFinder = TrigramCollocationFinder.from_words(tokens)
    ##minimum frequency of 2
    trigramFinder.apply_freq_filter(2)
    ranked = trigramFinder.nbest(TrigramAssocMeasures.pmi, top_n)
    counts = trigramFinder.ngram_fd
    return [(triple, counts[triple]) for triple in ranked]


##run both collocation functions on every paper and print the top 10 results (top_n) of each
for paper, tokens in zip(groupedTokens['paperacronym'], groupedTokens['processedTokens']):
    print(f"\n Paper: {paper} ")

    for heading, extractor in (("Top bigrams:", get_bigram), ("Top trigrams:", get_trigram)):
        ranked = extractor(tokens, top_n=10)
        print(heading)
        for collocation, freq in ranked:
            print(f"{collocation}: {freq}")

### *Lexical Diversity*

In [None]:
##calculate lexical diversity of answers for each paper

##define function to calculate lexical diversity
def lexicalDiversity(tokens):
    """Return the ratio of distinct tokens to total tokens, or 0 when tokens is empty."""
    if len(tokens) > 0:
        return len(set(tokens)) / len(tokens)
    return 0

##compute the lexical diversity of each paper's token list
groupedTokens['lexicalDiversity'] = [
    lexicalDiversity(tokens) for tokens in groupedTokens['processedTokens']
]

print(groupedTokens.loc[:, ['paperacronym', 'lexicalDiversity']])


### *TF-IDF and Cosine Similarity*

In [None]:
##TF-IDF (term frequency-inverse document frequency) weights word importance within each paper;
##cosine similarity between the TF-IDF vectors then measures how alike the papers are

##rebuild one space-separated string per paper from its tokens
groupedTokens['text'] = [' '.join(tokens) for tokens in groupedTokens['processedTokens']]
##generate tf-idf vectors
vectorizer = TfidfVectorizer()
tfidfMatrix = vectorizer.fit_transform(groupedTokens['text'])
##pairwise cosine similarity, labelled by paper on both axes
cosineSimilarity = cosine_similarity(tfidfMatrix)
paperLabels = groupedTokens['paperacronym']
cosineSimilarityDataframe = pd.DataFrame(cosineSimilarity, index=paperLabels, columns=paperLabels)
#print(cosineSimilarityDataframe)

##style dataframe printout for easier interpretation
def bold_if_above_threshold(value, threshold):
    """Return the CSS for bold text when value is strictly above threshold, else an empty string."""
    return 'font-weight: bold' if value > threshold else ''
##Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map;
##use Styler.map when this pandas version has it, otherwise fall back to applymap
cosineStyler = cosineSimilarityDataframe.style
styleEachCell = getattr(cosineStyler, "map", None) or cosineStyler.applymap
styledCosineSimilarityDataframe = styleEachCell(lambda x: bold_if_above_threshold(x, 0.2))
print("Cosine Similarity Across Papers with values > 0.2 bolded:" )
styledCosineSimilarityDataframe


### *Topic Modeling*

In [None]:
##create dictionary for LDA
texts = groupedTokens['processedTokens'].tolist() ##makes each paper's text content into a list of words
dictionary = corpora.Dictionary(texts) ##maps unique IDs to each unique word in the dataset
corpus = [dictionary.doc2bow(text) for text in texts] ##bag of words format; every document (in this case, papers) is represented as a list of each word ID and count of its occurrences in that document


##train LDA model
numTopics = 10  ##choose a number of topics to elicit
lda = models.LdaModel(corpus, num_topics=numTopics, id2word=dictionary, random_state=42)

##evaluate LDA model
##gensim's log_perplexity returns the per-word likelihood bound, NOT the perplexity itself
##(bound: higher / closer to 0 = better); perplexity = 2^(-bound)
##lower perplexity = our number of topics captures the actual distribution of words in the documents better
log_perplexity_bound = lda.log_perplexity(corpus)
perplexity_score = np.exp2(-log_perplexity_bound)
print(f'Per-word likelihood bound: {log_perplexity_bound}')
print(f'Perplexity: {perplexity_score}')
##calculate coherence (higher coherence = higher word similarity across the topic = better)
coherence_model_lda = CoherenceModel(
    model=lda,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence: {coherence_score}')


##view topics (global topics across all papers); topics are numbered from 1 in the printout
print("\nLDA Topics:")
for idx, topic in lda.print_topics(-1):
    print(f"Topic {idx+1}: {topic}")


In [None]:
import warnings
warnings.filterwarnings('ignore')

##fancy interactive topic modeling viz
####THROWS A LOT OF WARNINGS BECAUSE THE PACKAGE SOURCE CODE USES AN OLD DATETIME VERSION

import pyLDAvis.gensim_models
import pyLDAvis
##switch pyLDAvis to notebook output before preparing and displaying the visualization
pyLDAvis.enable_notebook()
##prepare the visualization data from the per-paper LDA model, corpus and dictionary;
##`vis` is read again by the saliency export cell further down
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis)


In [None]:
##export saliency bar graph to png (recreating in matplotlib)

##get data from pyldavis/gensim
topicInfo = vis.topic_info.copy()
##.copy() so the new column is added to an independent dataframe, not to a filtered slice of topicInfo
salientTerms = topicInfo[topicInfo["Category"] == "Default"].copy()
##calculate saliency
##NOTE(review): presumably "loglift" on the "Default" rows is a saliency weight/rank rather than a lift value — confirm against pyLDAvis's topic_info layout
salientTerms["saliency"] = salientTerms["Total"] * salientTerms["loglift"]
##organize salience values from highest to lowest and keep the top 30 (***this will make our graph look a little different from the PyLDAvis output because that bar plot sorts most salient terms by frequency)
salientTerms = salientTerms.sort_values(
    by="saliency", ascending=False
).head(30)

##make plot
fig, ax = plt.subplots(figsize=(12, 9), layout="constrained")

ax.barh(
    salientTerms["Term"],
    salientTerms["saliency"]
)
##flip the y axis so the most salient term is at the top
ax.invert_yaxis()
ax.set_title("Most Salient Terms", fontsize=28, pad=30)
ax.set_xlabel("Saliency", fontsize=18)
ax.set_ylabel("Term", fontsize=18)
ax.tick_params(axis="x", labelsize=14)
ax.tick_params(axis="y", labelsize=14)

##save image as png
fig.savefig("saliency.png", dpi=300)

plt.show()




In [None]:
##plot proportion of topics in each paper

paperTopics = [lda.get_document_topics(bow, minimum_probability=0) for bow in corpus]

##make dataframe of the probability of each topic occurring in each document
##get_document_topics returns (topic_id, probability) pairs and can leave out topics with a
##vanishingly small probability, so place each probability in the column of its own topic id
##instead of relying on its position in the list; omitted topics stay at 0
topicMatrix = np.zeros((len(paperTopics), numTopics))
for doc_index, doc in enumerate(paperTopics):
    for topic_id, topic_prob in doc:
        topicMatrix[doc_index, topic_id] = topic_prob
topicDistributions = pd.DataFrame(
    topicMatrix,
    columns=[f'Topic {i+1}' for i in range(numTopics)]
)

##identifying each document by paperacronym
topicDistributions['paperacronym'] = groupedTokens['paperacronym'].values
##set paperacronym as index
topicDistributions.set_index('paperacronym', inplace=True)

##make stacked barplot
topicDistributions.plot(kind='bar', stacked=True, figsize=(12,6), colormap='tab10')
plt.ylabel("Proportion of Topic")
plt.xlabel("Paper")
plt.title("Topics by Paper")
##place the legend outside the axes so it does not cover the bars
plt.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# 3. Analyze answers on a question-by-question basis

In [None]:
##group data by questions
##(one row per question: its answers cast to str and joined with single spaces)
answersByQuestion = datasheetsContent.groupby("questionum")["answer"]
joinedByQuestion = answersByQuestion.apply(lambda answers: " ".join(answers.astype(str)))
questionAnswersString = joinedByQuestion.reset_index()

###print(questionAnswersString)

In [None]:
##process text data

##clean, tokenize and lemmatize each question's combined answers
questionAnswersString["processedTokens"] = questionAnswersString["answer"].map(process)
###print(questionAnswersString)

##summing lists concatenates them, giving one token list per question
tokensByQuestion = questionAnswersString.groupby('questionum')['processedTokens']
groupedQuestionTokens = tokensByQuestion.sum().reset_index()
###print(groupedQuestionTokens)


### *Word Frequency*

In [None]:
##analyze word frequency for each question

##define function to print and plot the frequency distribution of the top num_words words
def plotWordFrequency(tokens, title, num_words=20):
    """Print and plot the most frequent tokens.

    Args:
        tokens: list of token strings to count.
        title: title drawn above the plot.
        num_words: how many of the most common tokens to print and plot.
    """
    frequencyDistribution = FreqDist(tokens)
    ##print as many words as are plotted (previously a fixed 20 regardless of num_words)
    print(frequencyDistribution.most_common(num_words))
    plt.figure(figsize=(12,6))
    ##show=False so the title below is applied before the figure is rendered
    frequencyDistribution.plot(num_words, cumulative=False, show=False)
    plt.title(title)
    plt.show()

##draw one frequency plot per question
for question, tokens in zip(groupedQuestionTokens['questionum'], groupedQuestionTokens['processedTokens']):
    plotWordFrequency(tokens, f'Word Frequency for Question {question} Answers')
    

### *Collocations*

In [None]:
##get collocations for each question

##define function to get bigrams
def get_bigrams(tokens, top_n=10):
    """Return the top_n bigrams ranked by PMI as (bigram, frequency) pairs.

    Bigrams that occur fewer than 2 times are discarded before ranking.
    """
    finder = BigramCollocationFinder.from_words(tokens)
    ##minimum frequency of 2
    finder.apply_freq_filter(2)
    return [
        (bigram, finder.ngram_fd[bigram])
        for bigram in finder.nbest(BigramAssocMeasures.pmi, top_n)
    ]

##define function to get trigrams
def get_trigrams(tokens, top_n=10):
    """Return the top_n trigrams ranked by PMI as (trigram, frequency) pairs.

    Trigrams that occur fewer than 2 times are discarded before ranking.
    """
    finder = TrigramCollocationFinder.from_words(tokens)
    ##minimum frequency of 2
    finder.apply_freq_filter(2)
    return [
        (trigram, finder.ngram_fd[trigram])
        for trigram in finder.nbest(TrigramAssocMeasures.pmi, top_n)
    ]


##run both collocation functions on every question and print the top 10 results of each
for question, tokens in zip(groupedQuestionTokens['questionum'], groupedQuestionTokens['processedTokens']):
    print(f"\n Question {question}:")

    for heading, extractor in (("Top bigrams:", get_bigrams), ("Top trigrams:", get_trigrams)):
        ranked = extractor(tokens, top_n=10)
        print(heading)
        for collocation, freq in ranked:
            print(f"{collocation}: {freq}")

### *Lexical Diversity*

In [None]:
##calculate lexical diversity of answers for each question

##define function to calculate lexical diversity
def lexicalDiversity(tokens):
    """Return unique-token count divided by total-token count; 0 for an empty token list."""
    tokenCount = len(tokens)
    if tokenCount <= 0:
        return 0
    distinctCount = len(set(tokens))
    return distinctCount / tokenCount

##compute the lexical diversity of each question's token list
groupedQuestionTokens['lexicalDiversity'] = [
    lexicalDiversity(tokens) for tokens in groupedQuestionTokens['processedTokens']
]

print(groupedQuestionTokens.loc[:, ['questionum', 'lexicalDiversity']])


### *TF-IDF and Cosine Similarity*

In [None]:
##TF-IDF (term frequency-inverse document frequency) weights word importance for each question (ACROSS ALL PAPERS);
##cosine similarity between the TF-IDF vectors then measures how alike the questions are

##rebuild one space-separated string per question from its tokens
groupedQuestionTokens['text'] = [' '.join(tokens) for tokens in groupedQuestionTokens['processedTokens']]
##generate tf-idf vectors
vectorizer = TfidfVectorizer()
tfidfMatrix = vectorizer.fit_transform(groupedQuestionTokens['text'])
##pairwise cosine similarity, labelled by question on both axes
cosineSimilarityQuestions = cosine_similarity(tfidfMatrix)
questionLabels = groupedQuestionTokens['questionum']
cosineSimilarityQuestionsDataframe = pd.DataFrame(cosineSimilarityQuestions, index=questionLabels, columns=questionLabels)
##style dataframe printout for easier interpretation
def bold_if_above_threshold(value, threshold):
    """Return the bold font-weight CSS when value exceeds threshold; otherwise return ''."""
    if not value > threshold:
        return ''
    return 'font-weight: bold'
##Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map;
##use Styler.map when this pandas version has it, otherwise fall back to applymap
questionStyler = cosineSimilarityQuestionsDataframe.style
styleEachQuestionCell = getattr(questionStyler, "map", None) or questionStyler.applymap
styledCosineSimilarityQuestionsDataframe = styleEachQuestionCell(lambda x: bold_if_above_threshold(x, 0.3))
print("Cosine Similarity Across Questions with values > 0.3 bolded:" )
styledCosineSimilarityQuestionsDataframe


### *Topic Modeling*

In [None]:
##create dictionary for LDA ACROSS EACH QUESTION THIS TIME
texts = groupedQuestionTokens['processedTokens'].tolist() ##makes each question's total answer text content into a list of words
dictionary = corpora.Dictionary(texts) ##maps unique IDs to each unique word in the dataset
corpus = [dictionary.doc2bow(text) for text in texts] ##bag of words format; every document (in this case, questions) is represented as a list of each word ID and count of its occurrences in that document


##train LDA model
numTopics = 7  ##choose a number of topics to elicit
lda = models.LdaModel(corpus, num_topics=numTopics, id2word=dictionary, random_state=42)

##evaluate LDA model
##gensim's log_perplexity returns the per-word likelihood bound, NOT the perplexity itself
##(bound: higher / closer to 0 = better); perplexity = 2^(-bound)
##lower perplexity = our number of topics captures the actual distribution of words in the documents better
log_perplexity_bound = lda.log_perplexity(corpus)
perplexity_score = np.exp2(-log_perplexity_bound)
print(f'Per-word likelihood bound: {log_perplexity_bound}')
print(f'Perplexity: {perplexity_score}')
##calculate coherence (higher coherence = higher word similarity across the topic = better)
coherence_model_lda = CoherenceModel(
    model=lda,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence: {coherence_score}')

##view topics (global topics across all questions); topics are numbered from 1 in the printout
print("\nLDA Topics:")
for idx, topic in lda.print_topics(-1):
    print(f"Topic {idx+1}: {topic}")

In [None]:
import warnings
warnings.filterwarnings('ignore')

##fancy interactive topic modeling viz
####THROWS A LOT OF WARNINGS BECAUSE THE PACKAGE SOURCE CODE USES AN OLD DATETIME VERSION

import pyLDAvis.gensim_models
import pyLDAvis
##switch pyLDAvis to notebook output before preparing and displaying the visualization
pyLDAvis.enable_notebook()
##prepare the visualization data from the per-question LDA model, corpus and dictionary
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
##plot proportion of topics across each question

questionTopics = [lda.get_document_topics(bow, minimum_probability=0) for bow in corpus]

##make dataframe of the probability of each topic occurring in each document
##get_document_topics returns (topic_id, probability) pairs and can leave out topics with a
##vanishingly small probability, so place each probability in the column of its own topic id
##instead of relying on its position in the list; omitted topics stay at 0
questionTopicMatrix = np.zeros((len(questionTopics), numTopics))
for doc_index, doc in enumerate(questionTopics):
    for topic_id, topic_prob in doc:
        questionTopicMatrix[doc_index, topic_id] = topic_prob
questionTopicDistributions = pd.DataFrame(
    questionTopicMatrix,
    columns=[f'Topic {i+1}' for i in range(numTopics)]
)

##identifying each document by questionum
questionTopicDistributions['questionum'] = groupedQuestionTokens['questionum'].values
##set questionum as index
questionTopicDistributions.set_index('questionum', inplace=True)

##make stacked barplot
questionTopicDistributions.plot(kind='bar', stacked=True, figsize=(12,6), colormap='tab10')
plt.ylabel("Proportion of Topic")
plt.xlabel("Question")
plt.title("Topics by Question")
##place the legend outside the axes so it does not cover the bars
plt.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()