In [None]:
%%capture

!rm -rf ObjectEllicitationNLP
!git clone https://github.com/yurisugano/ObjectEllicitationNLP.git
!pip install python-docx
!pip install nltk
!pip install "git+https://github.com/samwaterbury/rpunct.git"
!pip install bertopic
!pip install gensim
!pip install collections

# Analysis

The notebook is divided into **word token** analysis and **sentence token** analysis.

First, load packages and data. To remove inconsistencies with punctuation, all punctuation is removed and all words are turned to lower case.

In [6]:
# Import necessary packages
import re
from docx import Document
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rpunct import RestorePuncts
from collections import defaultdict


# Load raw data
raw_data = Document('/content/ObjectEllicitationNLP/Transcripts.docx')

# Define functions to handle formatting
def remove_unwanted_chars(paragraph_text):
    """
    Input: docx paragraph object

    Remove all curly braces, square brackets, punctuation and turn all sentences to lower case.

    Output: docx paragraph object formatted
    """
    paragraph_text = re.sub(r'[{}\[\]]', '', paragraph_text)  # remove curly braces and square brackets
    paragraph_text = re.sub(r'[^\w\s]', '', paragraph_text)  # remove punctuation
    return paragraph_text.lower()


# Create data object
data = defaultdict(str)

# Process paragraphs and return statements
for paragraph in raw_data.paragraphs:
    # Convert paragraph to string
    text = str(paragraph.text)

    # Split the paragraph text into subject and statement
    split_text = text.split(": ", 1)
    # Verify that subject is a 3 digit number before ":", otherwise ignore the ":"
    if len(split_text) == 2 and re.match(r'\{\d{3}\}', split_text[0]):
        subject, statement = split_text
        subject = subject[1:-1]

        # Concatenate statements for the same subject
        data[subject] += remove_unwanted_chars(statement)

First, some summaries:

In [7]:
print(f"Number of subjects: {len(data.keys())}")


for subject, statements in data.items():
    print(f"{subject}: {len(statements)} words.")
    # Add the sentences to a dictionary entry

    data[subject] = {'statements': statements}


# Delete the interviewer
del data['000']

Number of subjects: 33
000: 55747 words.
104: 9640 words.
105: 15819 words.
106: 7681 words.
107: 12764 words.
108: 9558 words.
109: 12522 words.
110: 11473 words.
111: 14912 words.
112: 20789 words.
113: 6861 words.
114: 4754 words.
115: 8979 words.
116: 4892 words.
117: 7354 words.
118: 7364 words.
119: 6964 words.
120: 7701 words.
221: 320 words.
121: 4930 words.
122: 8776 words.
123: 4812 words.
124: 6849 words.
126: 9737 words.
127: 7428 words.
128: 23008 words.
130: 12011 words.
131: 15055 words.
132: 19491 words.
133: 8828 words.
134: 8678 words.
135: 15104 words.
136: 9251 words.


## Analysis with word tokens

Input: a string with all the statements by the subject.


### Pre-processing pipeline

1. Remove punctuation
2. Case-folding
3. Tokenization
4. Lemmatization
5. Stop-word removal
6. Disfluency removal

### Analysis:
1. Most frequent words per corpus
2. Most frequent words per subject
3. LDA vs BERTopic for topic modeling


In [12]:
import nltk
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def track_made_up_words(text):
    # List of common English words
    english_words = set(w.lower() for w in nltk.corpus.words.words())

    # Find made-up words
    made_up_words = [word for word in text if word not in english_words]
    return made_up_words

def process_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # lemmatize words
    tokens = [token for token in tokens if token not in stop_words]  # remove stop words
    tokens = [token for token in tokens if token not in ['uh', 'um']]  # remove disfluencies
    return tokens

# Process each subject's statements
for subject, subject_data in data.items():
    subject_data['tokens'] = process_text(subject_data['statements'])
    subject_data['made_up_words'] = track_made_up_words(subject_data['tokens'])

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [1]:
data['107']['made_up_words'][:100]

NameError: ignored

In [14]:
from collections import Counter
from gensim import corpora, models
from bertopic import BERTopic
import numpy as np

# Define the number of most frequent words to return
n = 10

# Extract all tokens from each subject and create a single object 'all_tokens'
all_tokens = [token for subject in data.values() for token in subject['tokens']]

# Get the top n most frequent words for the entire document
counter = Counter(all_tokens)
print(counter.most_common(n))

# Get the top n most frequent words for each subject
for subject, info in data.items():
    counter = Counter(info['tokens'])
    print(f"Subject: {subject}, Top {n} words: {counter.most_common(n)}")

[('like', 3429), ('one', 1022), ('kind', 579), ('feel', 478), ('dont', 404), ('think', 386), ('yeah', 375), ('know', 365), ('ha', 323), ('little', 310)]
Subject: 104, Top 10 words: [('like', 102), ('feel', 52), ('one', 35), ('kind', 24), ('smooth', 16), ('soft', 16), ('little', 14), ('rough', 12), ('also', 11), ('really', 11)]
Subject: 105, Top 10 words: [('like', 343), ('kind', 46), ('yeah', 29), ('feel', 27), ('one', 25), ('know', 22), ('ha', 22), ('dont', 21), ('thing', 18), ('maybe', 18)]
Subject: 106, Top 10 words: [('like', 121), ('guess', 25), ('think', 23), ('yeah', 22), ('one', 21), ('feel', 16), ('dont', 14), ('know', 11), ('218', 10), ('two', 9)]
Subject: 107, Top 10 words: [('like', 115), ('one', 57), ('kind', 45), ('feel', 26), ('would', 20), ('dont', 18), ('thing', 17), ('im', 14), ('side', 14), ('ha', 13)]
Subject: 108, Top 10 words: [('like', 77), ('one', 65), ('ha', 26), ('yeah', 18), ('side', 18), ('kind', 17), ('silver', 16), ('thats', 14), ('little', 12), ('gold', 1


# Perform LDA
dictionary = corpora.Dictionary([info['tokens'] for info in data.values()])
corpus = [dictionary.doc2bow(info['tokens']) for info in data.values()]
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=2)
lda_topics = lda_model.print_topics(num_words=5)
print(lda_topics)

# Perform BERTopic
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(all_tokens)
topic_freq = topic_model.get_topic_freq()
print(topic_freq)


## Analysis with sentence tokens

Input: a string with all the stat

In [15]:
%%capture

# Step 7: Restore punctuation and capitalize words, then tokenize sentences
rpunct = RestorePuncts()

In [16]:
for subject, subject_data in data.items():
    punctuated_text = rpunct.punctuate(subject_data['statements'])
    subject_data['sentence-tokens'] = sent_tokenize(punctuated_text)

In [17]:
print(data['105']['sentence-tokens'][:10])

['Ok is my bag and stuff Ok. throws 203 at the wall and it falls off.', 'Ok so its like sticks to like hard surfaces and like I said not always the best.', 'but when I was a kid and I would play with them and I was alwaysi would always like to do this rolls ball on table so it sticks and unsticks.', 'It makes popping sound and like make popcorn noises.', 'So its also different colors red, blue and green.', 'My hair is in it now its ok but this is cool too.', 'Its also a sphere but in like a subtle way because the middle is like a hard little circle and then this forms a circle.', 'But its like all these little things Ok ooh this 202 reminds me of like a pencil eraser almost just in the way it looks and like the way it feels.', 'Im not super familiar with a bouncy ball like this, its kind of like softer than a normal bouncy ball.', 'like I feel like if this hit me it like wouldnt hurt as much.']


In [21]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN


from sentence_transformers import SentenceTransformer

# Extract all tokens from each subject and create a single object 'all_tokens'
sentence_tokens = [token for subject in data.values() for token in subject['sentence-tokens']]

# Pre-calculate embeddings
embedding_allMini = SentenceTransformer("all-MiniLM-L6-v2")
embeddings_allMini = embedding_allMini.encode(sentence_tokens, show_progress_bar=True)

embedding_roberta = SentenceTransformer("roberta-base-nli-mean-tokens")
embeddings_roberta = embedding_roberta.encode(sentence_tokens, show_progress_bar=True)

# Define models
umap_model = UMAP(n_neighbors=8, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, cluster_selection_epsilon=0.05, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Improve topic representation
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

topic_model = BERTopic(
  # Pipeline models
  embedding_model=embedding_allMini,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,

  # Hyperparameters
  top_n_words=5,
  verbose=True
)


Batches:   0%|          | 0/168 [00:00<?, ?it/s]

Batches:   0%|          | 0/168 [00:00<?, ?it/s]

In [23]:
topics, probs = topic_model.fit_transform(sentence_tokens, embeddings_allMini)
# topics_roberta, probs_roberta = topic_model.fit_transform(sentence_tokens, embeddings_roberta)


2023-07-18 16:45:16,279 - BERTopic - Reduced dimensionality


TypeError: ignored

In [None]:
topics_

In [None]:
import nltk
import re
from docx import Document

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')



raw_data = Document('/content/ObjectEllicitationNLP/Transcripts.docx')

def add_curly_braces(paragraph_text):
    """Add curly braces to three-digit numbers not surrounded by square brackets or curly braces."""
    numbers = re.findall(r'(?<![\[{])\b(\d{3})\b(?![\]}])', paragraph_text)
    for number in numbers:
        transformed_number = '{' + number + '}'
        paragraph_text = re.sub(r'\b' + number + r'\b', transformed_number, paragraph_text)
    return paragraph_text

def handle_square_brackets(paragraph_text):
    """Handle numbers inside square brackets with optional spaces and dashes."""
    matches = re.findall(r'\[([\d\s,-]+)\]', paragraph_text)
    for match in matches:
        numbers = []
        for num_range in re.split(r',\s*|\s+', match):
            num_range = num_range.strip()
            if '-' in num_range:
                start, end = num_range.split('-')
                numbers.extend(range(int(start), int(end) + 1))
            else:
                numbers.append(int(num_range))

        transformed = '[' + ']['.join(map(str, numbers)) + ']'
        paragraph_text = paragraph_text.replace('[' + match + ']', transformed)
    return paragraph_text

for each_paragraph in raw_data.paragraphs:
    each_paragraph.text = add_curly_braces(each_paragraph.text)
    each_paragraph.text = handle_square_brackets(each_paragraph.text)

def extract_info(paragraph_text):
    """Extract speaker and sentences from a paragraph text."""
    speaker_match = re.search(r'\{(\d{3})\}', paragraph_text)
    sentence_match = re.search(r': (.*)', paragraph_text)

    if speaker_match and sentence_match:
        speaker = speaker_match.group(1)
        sentence = sentence_match.group(1)
        return speaker, sentence
    else:
        return None, None


def update_data(data, speaker, sentence):
    """Update data dictionary with extracted speaker, sentence."""
    if speaker:
        if speaker in data:
            data[speaker]['statements'] = ''.join(data[speaker]['statements']) + sentence

        else:
            data[speaker] = {'statements': [statement]}

data = {}

for each_paragraph in raw_data.paragraphs:
    speaker, statement = extract_info(each_paragraph.text)
    update_data(data, speaker, statement)

del data['000']

In [None]:
def handle_disfluencies(text):
    # List of common disfluencies
    disfluencies = ['uh', 'um', 'like', 'you know', 'so', 'actually', 'basically', 'seriously', 'literally']

    # Tokenize the text
    words = word_tokenize(text)

    # Remove disfluencies
    words = [word for word in words if word not in disfluencies]

    return ' '.join(words)


def remove_capitalization_and_punctuation(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

def add_capitalization_and_punctuation(text):
    tokenizer = T5Tokenizer.from_pretrained('SJ-Ray/Re-Punctuate')
    model = TFT5ForConditionalGeneration.from_pretrained('SJ-Ray/Re-Punctuate')

    inputs = tokenizer.encode("punctuate: " + text, return_tensors="tf")
    result = model.generate(inputs)
    decoded_output = tokenizer.decode(result[0], skip_special_tokens=True)

    return(decoded_output)


In [None]:
import nltk
nltk.download('words')

for subject, subject_data in data.items():
    text = subject_data['statements']
    made_up_words = track_made_up_words(text, subject)
    text = handle_disfluencies(text)
    text = remove_capitalization_and_punctuation(text)
    text = add_capitalization_and_punctuation(text)

    subject_data['statements'] = text
    subject_data['made-up-words'] = made_up_words

data


# LDA



In [None]:
import string
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np
from transformers import T5Tokenizer, TFT5ForConditionalGeneration


In [None]:
from nltk.tokenize import sent_tokenize

sent_tokenize("this is a sentence and this might be another one however I dont know why not maybe i still want to go to the mall yesterday i don't think i tried")


In [None]:
!python --version

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer

def get_bert_input(data):
    input = []
    for subject, subject_data in data.items():
        statement = subject_data["statements"]
        statement_id = []

        for sentence in statement:
            statement_id.append(f"{subject} {sentence}")

        sentence_tokens = [sentence for statement in statement for sentence in sent_tokenize(statement)]
        input.extend(sentence_tokens)
    return(input)

docs = get_bert_input(data)


In [None]:

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)