In [None]:
import os
import re
import numpy as np
from pprint import pprint
from collections import defaultdict

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy for lemmatization
import spacy

# NLTK
import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word corpus read
# by stopwords.words() below, and the 'punkt' tokenizer data (presumably what
# word_tokenize loads at call time).
# NOTE(review): newer NLTK releases may look up 'punkt_tab' rather than 'punkt'
# for word_tokenize — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Define stop words
# Kept as a set; preprocess_transcript lower-cases each token before testing
# membership, so capitalised stop words ("The") are filtered as well.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Function to generate bigrams
def generate_bigram(data, min_count=5, threshold=100):
    """Train a gensim bigram model on ``data`` and apply it to every document.

    Args:
        data: Iterable of tokenised documents (each a list of str). It is
            materialised into a list because it is consumed twice: once to
            train the model and once to transform the documents. A one-shot
            generator would otherwise be exhausted by training and produce
            an empty result.
        min_count: Forwarded to ``gensim.models.Phrases``. Defaults to 5,
            the value that was previously hard-coded.
        threshold: Forwarded to ``gensim.models.Phrases``. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        A list with one entry per input document: the result of indexing the
        frozen phrase model with that document.
    """
    data = list(data)
    bigram = gensim.models.Phrases(data, min_count=min_count, threshold=threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in data]

In [None]:
# Function to preprocess transcript
def preprocess_transcript(transcript):
    """Tokenise a tab-separated transcript and tally the words of each speaker.

    Every non-blank line is split on tabs into at most five fields. The
    fourth field is used as the speaker name (stripped and upper-cased) and
    the fifth as the dialogue text. Lines with fewer than four fields are
    ignored. A line with exactly four fields is recorded as a contribution
    with no words (previously the speaker name itself was tokenised as if it
    were the dialogue).

    Args:
        transcript: The whole transcript as a single string, one contribution
            per line.

    Returns:
        A tuple ``(preprocessed_text, speaker_words, contributions)``:

        * ``preprocessed_text`` - flat list of the word keys recorded for each
          speaker, speaker after speaker (a word used by two speakers appears
          twice).
        * ``speaker_words`` - ``defaultdict`` mapping speaker name to a
          ``defaultdict(int)`` of word -> count.
        * ``contributions`` - list of ``(speaker, tokens)`` tuples in
          transcript order, where ``tokens`` are the stop-word-filtered
          tokens of that line.
    """
    data = transcript.split('\n')

    # Initialize spacy
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    # Initialize dictionary to hold words by speaker
    speaker_words = defaultdict(lambda: defaultdict(int))
    contributions = []

    def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
        """Count the surface forms of tokens whose POS tag is allowed.

        Despite its name this helper has always counted ``token.text`` (the
        surface form), never the lemma: the former ``if lemma != text``
        branches both incremented the surface form. That behaviour is kept;
        only the redundant branch was removed.
        """
        surface_counts = defaultdict(int)
        for sent in texts:
            doc = nlp(" ".join(sent))
            for token in doc:
                if token.pos_ in allowed_postags:
                    surface_counts[token.text] += 1
        return surface_counts

    # Process each line to group words by speaker
    for line in data:
        if not line.strip():
            continue

        parts = line.split('\t', 4)
        if len(parts) <= 3:
            continue

        name = parts[3].strip().upper()
        # Only a fifth field is dialogue. With four fields parts[-1] would be
        # the speaker name, which must not be counted as spoken text.
        dialogue = parts[4].strip() if len(parts) > 4 else ''

        # Join words that were split as "same- sex" by marking the hyphen
        # before tokenising and restoring it afterwards.
        # NOTE(review): a token that already contains the literal "HY-" also
        # has it rewritten to "-" by the restore step below.
        dialogue = re.sub(r'(?<=\w)-\s(?=\w)', 'HY-', dialogue)
        dialogue_words = [word for word in word_tokenize(dialogue) if word.lower() not in stop_words]
        # Restore hyphens in words
        dialogue_words = [word.replace('HY-', '-') for word in dialogue_words]

        # Nothing to model or tag when every token was filtered out.
        if dialogue_words:
            # Generate bigrams
            # NOTE(review): the phrase model is trained on this single
            # utterance only — confirm that per-line training is intended.
            dialogue_bigrams = generate_bigram([dialogue_words])[0]

            # Count content words (by surface form) found by spaCy
            token_counts = lemmatization([dialogue_bigrams])

            # Map each spaCy token back to the first original token that
            # contains it (spaCy may split e.g. hyphenated words), falling
            # back to the spaCy token itself, and add its full count.
            # Previously the count was discarded and 1 was always added.
            for token_text, count in token_counts.items():
                matched = next(
                    (word for word in dialogue_words if token_text in word),
                    token_text,
                )
                speaker_words[name][matched] += count

        contributions.append((name, dialogue_words))

    # Flatten texts for overall preprocessed text
    preprocessed_text = [word for words_dict in speaker_words.values() for word in words_dict.keys()]

    return preprocessed_text, speaker_words, contributions

In [None]:
# Function to calculate participation measures
def calculate_participation_measures(contributions):
    """Compute per-speaker participation statistics for a conversation.

    Each speaker is represented by a 0/1 indicator sequence of length ``n``
    (one slot per contribution, 1 where that speaker made the contribution).

    Args:
        contributions: Ordered sequence of ``(speaker_name, words)`` tuples.
            Only the speaker name is used here.

    Returns:
        A dict with these keys, each mapping to a dict keyed by speaker
        (or by ordered speaker pair for ``'cross_correlation'``):

        * ``'num_contributions'`` - number of contributions by the speaker.
        * ``'mean_participation'`` - ``num_contributions / n``.
        * ``'var_participation'`` - sample variance (``n - 1`` denominator)
          of the indicator sequence; ``nan`` when ``n < 2``, where the sample
          variance is undefined (this used to raise ``ZeroDivisionError``).
        * ``'group_relative_mean'`` - mean participation minus ``1 / k``,
          with ``k`` the number of distinct speakers.
        * ``'cross_correlation'`` - for every ordered pair ``(a, b)`` with
          ``a != b``, a list indexed by lag ``tau`` in ``range(n)`` of
          ``(sum_t a[t] * b[t - tau] - n * mean_a * mean_b)
          / ((n - 1) * sigma_a * sigma_b)``.
          At ``tau == 0`` this is the Pearson correlation of the two
          indicator sequences.

        All dicts are empty when ``contributions`` is empty.
    """
    n = len(contributions)
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # result ordering no longer depends on set iteration order.
    participants = list(dict.fromkeys(name for name, _ in contributions))
    k = len(participants)

    participation = {name: [0] * n for name in participants}
    for t, (name, _) in enumerate(contributions):
        participation[name][t] = 1

    # Number of contributions
    num_contributions = {name: sum(seq) for name, seq in participation.items()}

    # Sample mean participation
    mean_participation = {name: count / n for name, count in num_contributions.items()}

    # Sample variance in participation
    def sample_variance(name):
        if n < 2:
            return float('nan')
        mu = mean_participation[name]
        return sum((p - mu) ** 2 for p in participation[name]) / (n - 1)

    var_participation = {name: sample_variance(name) for name in participants}

    # Group-relative mean participation
    group_relative_mean = {
        name: mean_participation[name] - 1 / k
        for name in participants
    }

    # Standard deviations are lag-independent, so compute them once.
    sigma = {name: np.sqrt(var) for name, var in var_participation.items()}

    # Cross-correlation
    def cross_correlation(a, b, tau):
        seq_a = participation[a]
        seq_b = participation[b]
        denominator = (n - 1) * sigma[a] * sigma[b]
        if not denominator > 0:
            # Undefined when either sequence is constant or n < 2.
            return float('nan')

        lagged_products = sum(seq_a[t] * seq_b[t - tau] for t in range(tau, n))
        # The mean-product correction is subtracted ONCE from the total.
        # Subtracting it inside the sum (once per term) scaled it by the
        # number of terms and pushed results far outside [-1, 1].
        correction = n * mean_participation[a] * mean_participation[b]
        return (lagged_products - correction) / denominator

    cross_corr = {
        (a, b): [cross_correlation(a, b, tau) for tau in range(n)]
        for a in participants for b in participants if a != b
    }

    return {
        'num_contributions': num_contributions,
        'mean_participation': mean_participation,
        'var_participation': var_participation,
        'group_relative_mean': group_relative_mean,
        'cross_correlation': cross_corr
    }

In [None]:
# Function to parse text from .txt file
def parse_text(file_path, encoding=None):
    """Read a text file and return its whole contents as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding handed to ``open()``. The default ``None``
            keeps the previous behaviour (the platform's default encoding).
            Pass an explicit value such as ``"utf-8"`` or ``"latin-1"`` when
            the transcript is not stored in the platform default.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in the
            selected encoding.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Main function
def main():
    """Prompt for a transcript path, preprocess it and print all results.

    Prints, in order: the flattened preprocessed word list, the per-speaker
    word counts (converted to plain dicts for display), and the participation
    measures computed from the ordered contributions.
    """
    # Ask for the file and load it in one step
    transcript = parse_text(input("Enter the path to the .txt file: "))

    # Tokenise and group words by speaker
    vocabulary, words_by_speaker, turns = preprocess_transcript(transcript)

    print("Preprocessed text:")
    pprint(vocabulary)

    print("\nSpeaker Words:")
    # Nested defaultdicts are converted to plain dicts before printing
    plain_counts = {}
    for speaker, counts in words_by_speaker.items():
        plain_counts[speaker] = dict(counts)
    pprint(plain_counts)

    # Participation statistics over the ordered contributions
    measures = calculate_participation_measures(turns)
    print("\nParticipation Measures:")
    pprint(measures)

In [None]:
# Run the whole pipeline. main() prompts on stdin for the transcript path,
# so this cell blocks until a path is entered.
main()

Enter the path to the .txt file: 100_movie_lines.txt
Preprocessed text:
['hope',
 'Let',
 'go',
 'gon',
 'need',
 'learn',
 'lie',
 "'m",
 'kidding',
 'know',
 'sometimes',
 'become',
 'persona',
 'quit',
 'fear',
 'wearing',
 'pastels',
 'good',
 'stuff',
 'endless',
 'blonde',
 'babble',
 'boring',
 'listen',
 'crap',
 'says',
 'lighter',
 'look',
 'extra',
 'Tons',
 'knows',
 "'ve",
 'ever',
 'heard',
 'say',
 'dip',
 'dating',
 'guy',
 'smokes',
 'found',
 'picture',
 'drawers',
 'pretty',
 'sure',
 'harboring',
 'same-sex',
 'tendencies',
 'really',
 'wan',
 'sister',
 'goes',
 'days',
 'never',
 'every',
 'use',
 'blowdryer',
 'attachment',
 'sweet',
 'counted',
 'help',
 'cause',
 'thug',
 'obviously',
 'failing',
 'going',
 'date',
 'little',
 'Find',
 'plan',
 'progressing',
 'Forget',
 'want',
 'though',
 'useful',
 'things',
 'stores',
 'much',
 'champagne',
 'cost',
 'Stuff',
 'life',
 'point',
 'head',
 "C'esc",
 'tete',
 'find',
 'boyfriend',
 'mystery',
 'used',
 'popula