<a href="https://colab.research.google.com/github/yurisugano/Affective-Empathy-in-Rats-BehavioralAnalysis/blob/main/2023_ObjectEllicitationAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture

!rm -rf ObjectEllicitationNLP
!git clone https://github.com/yurisugano/ObjectEllicitationNLP.git
!pip install python-docx
!pip install nltk
!pip install "git+https://github.com/samwaterbury/rpunct.git"
!pip install bertopic
!pip install gensim
!pip install collections

# Analysis

The notebook is divided into **word token** analysis and **sentence token** analysis.

First, load the packages and data. To remove inconsistencies in punctuation and capitalization, all punctuation is removed and all words are converted to lower case.

In [None]:
# Import necessary packages
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rpunct import RestorePuncts
from docx import Document

# Fetch the NLTK resources used by the notebook, in a fixed order
for nltk_resource in ('words', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers: the English stop-word set and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load raw data
raw_data = Document('/content/ObjectEllicitationNLP/Transcripts.docx')

# Speaker label at the very start of a paragraph, e.g. "{104}".
# Compiled once here instead of on every loop iteration.
subject_pattern = re.compile(r'\{\d{3}\}')

# Prepare the data object
data = {}

# Process each paragraph
for paragraph in raw_data.paragraphs:

    # Convert the paragraph to a string
    text = str(paragraph.text)

    # Skip paragraphs that do not begin with a speaker label
    if not subject_pattern.match(text):
        continue

    # Remove brackets/braces, then every remaining character that is not a
    # word character, whitespace or a colon (the colon is kept because it
    # separates the speaker label from the statement)
    text = re.sub(r'[\[\]{}]', '', text)
    text = re.sub(r'[^\w\s:]', '', text)

    # Split the paragraph into subject and statement
    split_text = text.split(': ', 1)
    if len(split_text) != 2:
        continue
    subject, sentence = split_text
    sentence = sentence.lower()

    if subject not in data:
        data[subject] = {'statements': sentence}
    else:
        # Join paragraphs with a space; plain concatenation fuses the last
        # word of one paragraph with the first word of the next.
        data[subject]['statements'] += ' ' + sentence

# Now, data is a dictionary of the form {subject: {'statements': str}}


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


First, some summaries:

In [None]:
# Delete the interviewer (speaker '000') before counting, so that the number
# reported covers participants only. pop() with a default keeps this cell
# re-runnable: a plain `del` raises KeyError the second time it is executed.
data.pop('000', None)

print(f"Number of subjects: {len(data.keys())}")

Number of subjects: 33


## Analysis with word tokens

Input: a string with all the statements by the subject.


### Pre-processing pipeline

1. Remove punctuation
2. Case-folding
3. Tokenization
4. Lemmatization
5. Stop-word removal
6. Disfluency removal

### Analysis:
1. Most frequent words per corpus
2. Most frequent words per subject
3. LDA vs BERTopic for topic modeling


In [None]:
import nltk
# Make sure the NLTK resources are present (a no-op when already downloaded)
for nltk_resource in ('words', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Rebuild the shared stop-word set and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Preview the first 100 made-up words recorded for subject 107.
# The preprocessing loop stores this entry under the hyphenated key
# 'made-up-words'; .get() keeps the cell from raising when that loop has not
# been run yet. NOTE(review): assumes the stored value is sliceable - confirm.
data['107'].get('made-up-words', [])[:100]

KeyError: ignored

In [None]:
from collections import Counter
from gensim import corpora, models
from bertopic import BERTopic
import numpy as np

# How many of the most frequent words to report
n = 10

# Pool every subject's tokens into one flat list, 'all_tokens'
all_tokens = []
for subject_record in data.values():
    all_tokens.extend(subject_record['tokens'])

# Corpus-wide ranking of the n most frequent words
counter = Counter(all_tokens)
print(counter.most_common(n))

# Per-subject ranking of the n most frequent words
for subject, info in data.items():
    counter = Counter(info['tokens'])
    top_words = counter.most_common(n)
    print(f"Subject: {subject}, Top {n} words: {top_words}")

[('like', 3429), ('one', 1022), ('kind', 579), ('feel', 478), ('dont', 404), ('think', 386), ('yeah', 375), ('know', 365), ('ha', 323), ('little', 310)]
Subject: 104, Top 10 words: [('like', 102), ('feel', 52), ('one', 35), ('kind', 24), ('smooth', 16), ('soft', 16), ('little', 14), ('rough', 12), ('also', 11), ('really', 11)]
Subject: 105, Top 10 words: [('like', 343), ('kind', 46), ('yeah', 29), ('feel', 27), ('one', 25), ('know', 22), ('ha', 22), ('dont', 21), ('thing', 18), ('maybe', 18)]
Subject: 106, Top 10 words: [('like', 121), ('guess', 25), ('think', 23), ('yeah', 22), ('one', 21), ('feel', 16), ('dont', 14), ('know', 11), ('218', 10), ('two', 9)]
Subject: 107, Top 10 words: [('like', 115), ('one', 57), ('kind', 45), ('feel', 26), ('would', 20), ('dont', 18), ('thing', 17), ('im', 14), ('side', 14), ('ha', 13)]
Subject: 108, Top 10 words: [('like', 77), ('one', 65), ('ha', 26), ('yeah', 18), ('side', 18), ('kind', 17), ('silver', 16), ('thats', 14), ('little', 12), ('gold', 1

In [None]:

# Perform LDA
# One token list per subject; built once and reused for both the dictionary
# and the bag-of-words corpus.
token_docs = [subject_data['tokens'] for subject_data in data.values()]
dictionary = corpora.Dictionary(token_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in token_docs]

# LDA training is stochastic; fixing random_state makes the topics
# reproducible across "Restart & Run All".
lda_model = models.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    random_state=42,
)
lda_topics = lda_model.print_topics(num_words=5)
print(lda_topics)


NameError: ignored

In [None]:

# Perform BERTopic
# Build a BERTopic model without passing any custom sub-models: English
# language setting, topic probabilities requested, progress logging enabled.
# The model is only constructed here; fitting happens in a later cell.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1062 [00:00<?, ?it/s]

2023-07-19 19:16:55,366 - BERTopic - Transformed documents to Embeddings
2023-07-19 19:20:52,860 - BERTopic - Reduced dimensionality


KeyboardInterrupt: ignored

In [None]:

# Fit the BERTopic model and print the topic frequency table.
# NOTE(review): `all_tokens` is the flat list of word tokens built in the
# word-frequency cell, so every "document" handed to BERTopic here is a single
# word - confirm this is intended rather than one document per sentence.
topics, probs = topic_model.fit_transform(all_tokens)
topic_freq = topic_model.get_topic_freq()
print(topic_freq)


Batches:   0%|          | 0/1062 [00:00<?, ?it/s]

2023-07-19 20:04:55,964 - BERTopic - Transformed documents to Embeddings
2023-07-19 20:11:25,590 - BERTopic - Reduced dimensionality


## Analysis with sentence tokens

Input: a string with all the statements by the subject.

In [None]:
%%capture
from rpunct import RestorePuncts

In [None]:

# Instantiate the rpunct punctuation-restoration model. This cell only
# creates the model object; punctuation is restored and sentences are
# tokenized in a later cell that calls rpunct.punctuate().
rpunct = RestorePuncts()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

In [None]:
import nltk
nltk.download('punkt')

# For every subject: restore punctuation on the raw statement string, split
# the result into sentences and store them under 'sentence-tokens'
for subject, subject_data in data.items():
    subject_data['sentence-tokens'] = sent_tokenize(
        rpunct.punctuate(subject_data['statements'])
    )


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Flatten every subject's sentence list into one corpus-wide list
sentence_tokens = []
for subject_record in data.values():
    sentence_tokens.extend(subject_record['sentence-tokens'])


In [None]:
import json

# Write the per-subject dictionary to data.json in the working directory
with open('data.json', 'w') as f:
    json.dump(data, f)

# Write the flat list of sentence tokens to sent_tokens.json
with open('sent_tokens.json', 'w') as f:
    json.dump(sentence_tokens, f)


NameError: ignored

In [None]:
import json

# NOTE(review): this cell repeats the preceding JSON export cell verbatim.
# Write the per-subject dictionary to data.json in the working directory
with open('data.json', 'w') as f:
    json.dump(data, f)

# Write the flat list of sentence tokens to sent_tokens.json
with open('sent_tokens.json', 'w') as f:
    json.dump(sentence_tokens, f)

NameError: ignored

In [None]:
from bertopic import BERTopic
from umap import UMAP
!pip install --upgrade git+https://github.com/scikit-learn-contrib/hdbscan.git
from hdbscan import HDBSCAN



defaultdict(<class 'str'>, {'104': {'statements': 'sounds good i didnt even see that bag this is set 1 this one 204 feels rougher but pretty uniform it all kind of feels the same it doesnt roll very well and it kind of thumpsdo you want me to talk louderum and it kind of thumps when you throw it it doesnt bounce or anything then this one 201 is pokey spiky ooh and it lights up when you bounce it it also doesnt bounce that much but still it feels really artificial like kind of slimy almost um like i definitely wouldnt put that in my mouth this one 203 has little suction cups all the way around whichoh i like the sound it makeslittle popand it does roll well but its not satisfying at all yeah this one 202 is my favorite its its smooth and soft and it bounces well and its heavy like its solid and its a little squishy if i like put some effort into it and yeah so this one i think rolls well then the suction cup 203 does too but again just not satisfying i like this one 201 this feels like 

from sentence_transformers import SentenceTransformer

# Convert to a plain dict (the recorded output shows `data` as a defaultdict)
data = dict(data)

# Rebuild the corpus-wide sentence list from the per-subject lists.
# The list must actually be refilled here: the embedding and fit_transform
# cells below read `sentence_tokens`, and an empty list gives them nothing.
sentence_tokens = []

for subject, subject_data in data.items():
    sentence_tokens.extend(subject_data['sentence-tokens'])

# Report sizes instead of printing every sentence of every subject
print(f"Collected {len(sentence_tokens)} sentence tokens from {len(data)} subjects")



In [None]:

# Pre-calculate embeddings
# The MiniLM variant below is left disabled; only the RoBERTa model is used.
# embedding_allMini = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings_allMini = embedding_allMini.encode(sentence_tokens, show_progress_bar=True)

# Encode every sentence token once so the embeddings can be passed to
# BERTopic's fit_transform in a later cell.
embedding_roberta = SentenceTransformer("roberta-base-nli-mean-tokens")
embeddings_roberta = embedding_roberta.encode(sentence_tokens, show_progress_bar=True)


In [None]:

# Dimensionality-reduction and clustering stages handed to BERTopic
umap_model = UMAP(
    n_neighbors=8,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    cluster_selection_epsilon=0.05,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Vectorizer for the topic representation: English stop words removed,
# terms must occur in at least two documents, unigrams and bigrams
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

# Assemble the BERTopic pipeline from the pieces above
topic_model = BERTopic(
    embedding_model=embedding_roberta,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    verbose=True,
)


In [None]:
# Fit the topic model on the sentence tokens, reusing the pre-computed
# RoBERTa embeddings instead of letting BERTopic embed the sentences again.
topics, probs = topic_model.fit_transform(sentence_tokens, embeddings_roberta)
# topics_roberta, probs_roberta = topic_model.fit_transform(sentence_tokens, embeddings_roberta)

NameError: ignored

In [None]:
# Peek at the topic assigned to the first few sentences. The bare name
# `topics_` is not defined anywhere in the notebook; the fitted assignments
# are exposed as the `topics_` attribute of the model.
topic_model.topics_[:10]

In [None]:
import nltk
import re
from docx import Document

# Fetch the NLTK resources needed from here on, in a fixed order
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Re-open the transcript document from the cloned repository
raw_data = Document('/content/ObjectEllicitationNLP/Transcripts.docx')

def add_curly_braces(paragraph_text):
    """Wrap bare three-digit numbers in curly braces.

    A number is "bare" when it is a standalone run of exactly three digits
    that is not immediately preceded by ``[`` or ``{`` and not immediately
    followed by ``]`` or ``}``.

    Args:
        paragraph_text: The paragraph text to transform.

    Returns:
        The text with every bare three-digit number ``ddd`` replaced by
        ``{ddd}``. Numbers already adjacent to a bracket or brace are left
        untouched.
    """
    # A single substitution pass wraps each bare occurrence exactly once.
    # Looping over the found numbers and substituting each one globally would
    # wrap a repeated number once per occurrence ("{{204}}") and would also
    # wrap occurrences that sit inside brackets.
    return re.sub(r'(?<![\[{])\b(\d{3})\b(?![\]}])', r'{\1}', paragraph_text)

def handle_square_brackets(paragraph_text):
    """Expand bracketed number lists and ranges into one bracket per number.

    ``[201, 203]`` becomes ``[201][203]`` and ``[201-203]`` becomes
    ``[201][202][203]``. Only bracket groups made up solely of digits,
    whitespace, commas and dashes are considered.

    Args:
        paragraph_text: The paragraph text to transform.

    Returns:
        The text with every well-formed bracket group expanded. Groups that
        cannot be parsed (for example ``[-]``) are returned unchanged instead
        of raising.
    """
    def expand(match):
        # Tolerate padding inside the brackets and spaces around range dashes
        content = re.sub(r'\s*-\s*', '-', match.group(1).strip())
        numbers = []
        try:
            for token in re.split(r'[,\s]+', content):
                if not token:
                    # Produced by doubled or trailing separators
                    continue
                if '-' in token:
                    start, end = token.split('-')
                    numbers.extend(range(int(start), int(end) + 1))
                else:
                    numbers.append(int(token))
        except ValueError:
            # Malformed group (missing bound, several dashes): leave as is
            return match.group(0)
        if not numbers:
            return match.group(0)
        return '[' + ']['.join(map(str, numbers)) + ']'

    return re.sub(r'\[([\d\s,-]+)\]', expand, paragraph_text)

# Rewrite the text of every paragraph in place: first wrap bare three-digit
# numbers in curly braces, then expand bracketed number lists/ranges.
# NOTE(review): the curly-brace step runs first. A middle number in a group
# such as "[201, 202, 203]" is not adjacent to a bracket, so it appears to get
# wrapped in braces, after which the bracket pattern (digits, whitespace,
# commas and dashes only) would no longer match the group - confirm whether
# the two steps should run in the opposite order.
for each_paragraph in raw_data.paragraphs:
    each_paragraph.text = add_curly_braces(each_paragraph.text)
    each_paragraph.text = handle_square_brackets(each_paragraph.text)

def extract_info(paragraph_text):
    """Extract the speaker id and the statement from one paragraph.

    A paragraph is expected to look like ``{104}: some statement``.

    Args:
        paragraph_text: The text of a single paragraph.

    Returns:
        A ``(speaker, sentence)`` tuple of strings, where ``speaker`` is the
        three-digit id without braces and ``sentence`` is everything after the
        first ``": "`` up to the end of the line. ``(None, None)`` when the
        paragraph does not start with a speaker label or contains no ``": "``.
    """
    # Anchor the speaker label to the start of the paragraph (leading
    # whitespace allowed). An unanchored search would treat any braced
    # three-digit number that merely appears inside a statement as a speaker.
    speaker_match = re.match(r'\s*\{(\d{3})\}', paragraph_text)
    sentence_match = re.search(r': (.*)', paragraph_text)

    if not (speaker_match and sentence_match):
        return None, None
    return speaker_match.group(1), sentence_match.group(1)


def update_data(data, speaker, sentence):
    """Append a statement to a speaker's entry in ``data`` (mutated in place).

    Args:
        data: Dictionary of the form ``{speaker: {'statements': str}}``.
        speaker: Speaker id, or a falsy value to skip the paragraph.
        sentence: The statement text to record for ``speaker``.

    Returns:
        None. ``data[speaker]['statements']`` is always a single string;
        successive statements are separated by one space.
    """
    if not speaker:
        return

    if speaker not in data:
        # Store the `sentence` argument itself, as a string, so the value has
        # the same type whether the speaker has one paragraph or many.
        data[speaker] = {'statements': sentence}
        return

    existing = data[speaker]['statements']
    if not isinstance(existing, str):
        # Tolerate entries that were stored as a list of strings
        existing = ' '.join(existing)
    # The space keeps the last word of the previous paragraph from fusing
    # with the first word of this one.
    data[speaker]['statements'] = existing + ' ' + sentence

# Start from an empty dictionary so that re-running this cell rebuilds the
# per-speaker data from scratch.
data = {}

# Extract (speaker, statement) from each paragraph and accumulate it
for each_paragraph in raw_data.paragraphs:
    speaker, statement = extract_info(each_paragraph.text)
    update_data(data, speaker, statement)

# Remove speaker '000' (an earlier cell labels this entry as the interviewer).
# Raises KeyError if no paragraph was attributed to '000'.
del data['000']

In [None]:
def handle_disfluencies(text):
    """Remove common spoken disfluencies from ``text``.

    The text is tokenized with ``word_tokenize``, disfluent tokens are
    dropped, and the remaining tokens are joined with single spaces.

    Args:
        text: The statement text to clean.

    Returns:
        The cleaned text. Matching ignores case; the tokens that are kept
        retain their original casing.
    """
    # Single-token disfluencies
    single_word = {'uh', 'um', 'like', 'so', 'actually', 'basically',
                   'seriously', 'literally'}
    # Multi-token disfluencies have to be matched as token sequences: a
    # two-word string can never be equal to one individual token.
    multi_word = [('you', 'know')]

    words = word_tokenize(text)
    # Compare in lower case: this function runs before case-folding, so
    # capitalised disfluencies ("Um", "Like") would otherwise survive.
    lowered = [word.lower() for word in words]

    kept = []
    index = 0
    while index < len(words):
        # Length of the multi-word disfluency starting here, or 0 if none
        phrase_length = next(
            (len(phrase) for phrase in multi_word
             if tuple(lowered[index:index + len(phrase)]) == phrase),
            0,
        )
        if phrase_length:
            index += phrase_length
            continue
        if lowered[index] not in single_word:
            kept.append(words[index])
        index += 1

    return ' '.join(kept)


def remove_capitalization_and_punctuation(text):
    """Lower-case ``text`` and strip ASCII punctuation.

    Args:
        text: The text to normalise.

    Returns:
        The lower-cased text with every character in ``string.punctuation``
        removed. Other characters, including non-ASCII punctuation such as
        an em dash, are kept.
    """
    # Imported locally so the function does not depend on a later import
    # cell having been executed first.
    import string

    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_table)

# Holds the loaded (tokenizer, model) pair so it is created only once
_REPUNCTUATE_CACHE = {}


def _load_repunctuate():
    """Return the (tokenizer, model) pair for 'SJ-Ray/Re-Punctuate', loading it on first use."""
    if 'pair' not in _REPUNCTUATE_CACHE:
        # Imported locally so the function does not depend on a later import
        # cell having been executed first.
        from transformers import T5Tokenizer, TFT5ForConditionalGeneration

        _REPUNCTUATE_CACHE['pair'] = (
            T5Tokenizer.from_pretrained('SJ-Ray/Re-Punctuate'),
            TFT5ForConditionalGeneration.from_pretrained('SJ-Ray/Re-Punctuate'),
        )
    return _REPUNCTUATE_CACHE['pair']


def add_capitalization_and_punctuation(text):
    """Restore capitalization and punctuation with the 'SJ-Ray/Re-Punctuate' model.

    Args:
        text: Unpunctuated text.

    Returns:
        The decoded model output for the prompt ``"punctuate: " + text``,
        with special tokens removed.
    """
    # Reuse one tokenizer/model pair across calls rather than reloading the
    # pretrained weights for every subject.
    tokenizer, model = _load_repunctuate()

    inputs = tokenizer.encode("punctuate: " + text, return_tensors="tf")
    # NOTE(review): generate() is called with its default length settings;
    # confirm that long statements are not cut short in the output.
    result = model.generate(inputs)
    return tokenizer.decode(result[0], skip_special_tokens=True)


In [None]:
import nltk
nltk.download('words')

# Run the clean-up pipeline on each subject's statements.
# NOTE(review): `track_made_up_words` is not defined in the visible part of
# the notebook - confirm where it comes from.
for subject, subject_data in data.items():
    text = subject_data['statements']
    # Made-up words are collected from the original text, before any clean-up
    made_up_words = track_made_up_words(text, subject)
    text = handle_disfluencies(text)
    text = remove_capitalization_and_punctuation(text)
    text = add_capitalization_and_punctuation(text)

    # Overwrites the stored statements with the processed text, so running
    # this cell again processes already-processed text.
    subject_data['statements'] = text
    subject_data['made-up-words'] = made_up_words

# Display the whole dictionary as the cell output
data


# LDA



In [None]:
import string
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np
from transformers import T5Tokenizer, TFT5ForConditionalGeneration


In [None]:
from nltk.tokenize import sent_tokenize

# Scratch check: run sent_tokenize on an unpunctuated run-on string to see
# how it is split into sentences.
sent_tokenize("this is a sentence and this might be another one however I dont know why not maybe i still want to go to the mall yesterday i don't think i tried")


In [None]:
!python --version

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer

def get_bert_input(data):
    """Collect every sentence of every subject into one flat list.

    Args:
        data: Dictionary of the form ``{subject: {'statements': ...}}`` where
            ``'statements'`` is either a single string or a list of strings.

    Returns:
        A list of sentence strings, in subject order, produced by running
        ``sent_tokenize`` over each statement.
    """
    docs = []
    for subject_data in data.values():
        statements = subject_data["statements"]
        # A bare string must be wrapped in a list: iterating it directly
        # would feed sent_tokenize one character at a time.
        if isinstance(statements, str):
            statements = [statements]

        for statement in statements:
            docs.extend(sent_tokenize(statement))
    return docs

docs = get_bert_input(data)


In [None]:

# Pre-calculate embeddings
# Load the "all-MiniLM-L6-v2" sentence-transformer and encode every document
# returned by get_bert_input, showing a progress bar while encoding.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)