In [1]:
# import necessary modules
from convokit import Corpus, TextCleaner
import numpy as np, pandas as pd, os

In [3]:
# Change the path, csv name and study code as appropriate
path = os.getcwd()[:-10]  # path to the github repo: the working directory minus its last 10 characters
master_csv = 'master.csv'
STUDY_CODE = 'AD'

# read the master spreadsheet from <path>data/raw/
df = pd.read_csv(path + 'data/raw/' + master_csv)

# give the id columns short names, then keep only columns 1-5 of the raw sheet
# (column 0 and everything from column 6 onwards are dropped)
df = (
    df.rename(columns={
        "Participant ID": "Person1",
        "Partner ID": "Person2",
        "Conversation type ": "ConvoType",
        "Group ID": "GroupID",
    })
    .drop(columns=df.columns[6:])
    .drop(columns=df.columns[0])
)

In [4]:
# Standardize the master dataframe in one chain so the cell can be re-run safely:
#   - drop all rows without a Speaker1 determination
#   - Speaker1 becomes int64, ConvoType and GroupID are lower-cased
#   - an empty "Text" column is (re)created; it is filled in later from the transcript files
#   - the index is renumbered 0..len(df)-1; drop=True discards the old index instead of
#     inserting it as an extra "index" column (which also made a second run of this cell fail)
df = (
    df.dropna(subset=["Speaker1"])
    .assign(
        Speaker1=lambda d: d["Speaker1"].astype("int64"),
        ConvoType=lambda d: d["ConvoType"].str.lower(),
        GroupID=lambda d: d["GroupID"].str.lower(),
        Text=None,
    )
    .reset_index(drop=True)
)

In [5]:
def read_file(file_path: str, encoding: str = "utf-8") -> str:
    """
    Reads a txt file in, replaces characters to standardize, and returns text.
        in : 
            str file path (.txt)
            str text encoding used to decode the file (default "utf-8")
        out: str text of the file with "Speaker 1"/"Speaker 2" shortened to "S1"/"S2"
             and any byte-order mark (\\ufeff) removed
    """

    # the context manager closes the file even if reading/decoding fails;
    # the explicit encoding keeps the result independent of the platform default
    with open(file_path, "r", encoding=encoding) as text_file:
        text = text_file.read()
    # replace things
    for old, new in (("Speaker 1", "S1"), ("Speaker 2", "S2"), ("\ufeff", "")):
        text = text.replace(old, new)
    return text

def add_text(dataframe, convotype, folderpath):
    """
    Takes in the master dataframe, and a name of conversation and folder to read data from. 
    Adds the text of files to the dataframe in appropriate places (inplace). 
    As the result, the dataframe will have the "Text" column filled out for that conversation type. 
    Uses the notebook-level `path` and `STUDY_CODE` variables to build the file names.
        in : 
            pd.DataFrame
            str name of the type of conversations (rows whose ConvoType starts with it are processed)
            str name of the folderpath where the conversations of this type are (e.g. STUDYCODE/Foldername)
        out: nothing
        raises: FileNotFoundError if a transcript exists under neither participant order
    """

    transcripts_dir = f'{path}data/raw/transcripts/{folderpath}'
    # for each row of the dataframe
    for index, row in dataframe.iterrows():
        # skip rows that belong to a different conversation type
        if not row.ConvoType.startswith(convotype):
            continue
        file_suffix = f'{row.ConvoType}_{row.GroupID}.txt'
        # the file name may list the two participants in either order: try Person1_Person2
        # first and fall back to Person2_Person1 only when that file does not exist
        # (any other error, e.g. a decoding problem, is raised as-is instead of being hidden)
        try:
            text = read_file(f'{transcripts_dir}/{STUDY_CODE}_{row.Person1}_{row.Person2}_{file_suffix}')
        except FileNotFoundError:
            text = read_file(f'{transcripts_dir}/{STUDY_CODE}_{row.Person2}_{row.Person1}_{file_suffix}')
        # add the text to the "Text" column in the dataframe
        dataframe.loc[index, "Text"] = text

def preproc(text: str) -> list:
    """
    Preprocess text from txt files by separating it into utterances (speaker lines) and standardizing "no one's" lines.
    A chunk counts as a speaker line when characters 10-11 are "S1" or "S2"; any other chunk is
    appended to the preceding speaker line, separated by " [both] ". Several such chunks in a row
    are all appended to the same speaker line. Chunks that come before the first speaker line
    have no line to attach to and are left out.
    in : str some text
    out: list of strings each of which is a line spoken by a person
    """

    # split into chunks on triple new lines when the text has them, otherwise on double new lines
    chunks = text.split("\n\n\n")
    if len(chunks) <= 1:
        chunks = text.split("\n\n")

    utterances = []
    for chunk in chunks:
        if chunk[10:12] in ("S1", "S2"):
            # a regular speaker line starts a new utterance
            utterances.append(chunk)
        elif utterances:
            # nobody's line: attach it to the most recent speaker line that was kept
            utterances[-1] = utterances[-1] + " [both] " + chunk
        # else: nothing precedes this chunk, so there is no speaker line to attach it to
    return utterances

In [6]:
# (conversation type, transcript folder) pairs -- change the names as appropriate
for convo_prefix, folder_name in [
    ("intro", "Introduction"),
    ("nego1", "Negotiation1"),
    ("nego2", "Negotiation2"),
    ("nego3", "Negotiation3"),
]:
    add_text(df, convo_prefix, folder_name)

In [7]:
# this cell creates an utterances_df dataframe that will then be converted into a ConvokitCorpus
# column order of every utterance record collected below
colsnewdf = ['speaker', 'conversation_id', 'text', 'timestamp', 'start']
# lists for debugging: number of lines and conversation id for each row of df (same order as df)
length, conversation_ids = [], []
# set parameter n = minimum number of lines a conversation has to have to be included in the corpus
# (just keep it 0)
n = 0

# utterance records are collected in a plain list and turned into a dataframe once at the end
# (instead of concatenating a new dataframe for every conversation)
utterance_rows = []
for i in range(len(df)):
    # preprocess text from the i'th row of the master dataframe to get a list of lines
    listoflines = preproc(df.loc[i, "Text"])
    length.append(len(listoflines))
    # conversation id = ConvoType and GroupID joined by "_"
    conversation_id = df.loc[i, "ConvoType"] + "_" + df.loc[i, "GroupID"]
    conversation_ids.append(conversation_id)
    # conversations with n lines or fewer are not added to the corpus
    if len(listoflines) <= n:
        continue

    # determine the speaker ids: S1 is the row's Speaker1, S2 is the other participant of the pair
    speakerA = df.loc[i, "Speaker1"]
    speakerB = df.loc[i, "Person2"] if speakerA == df.loc[i, "Person1"] else df.loc[i, "Person1"]

    for j, line in enumerate(listoflines):
        # characters 10-11 of a line hold the speaker tag
        speaker_tag = line[10:12]
        if speaker_tag == "S1":
            speaker = speakerA
        elif speaker_tag == "S2":
            speaker = speakerB
        else:
            # lines without a speaker tag are not turned into utterances
            continue
        # 'text' = characters in the line after the speaker tag
        # 'timestamp' = first 9 characters of the line
        # 'start' = True only for the first line, marking the beginning of the conversation
        utterance_rows.append([speaker, conversation_id, line[14:], line[:9], j == 0])

# dtype=object keeps every value exactly as collected (no dtype inference that could upcast speaker ids)
temp_df = pd.DataFrame(utterance_rows, columns=colsnewdf, dtype=object)

# add an (utterance) id column to the temp dataframe
# utterance ids are just the index of the utterance plus one turned into a string (str)
temp_df['id'] = [str(i+1) for i in temp_df.index]
# add a 'reply_to' column to the temp dataframe
# it's just the id of the previous line in our case (or None if that's the first line of a conversation)
reply_to = [str(i) for i in temp_df.index]
temp_df['reply_to'] = np.where(temp_df["start"].astype(bool), None, reply_to)
# turn the temp dataframe into the final utterances_df by dropping the "start" column
# (keyword form: passing the axis positionally is deprecated in pandas)
utterances_df = temp_df.drop(columns=["start"])

  utterances_df = temp_df.drop(["start"],1)


In [8]:
# build the convokit corpus from the utterances_df dataframe
corpus = Corpus.from_pandas(utterances_df)
# re-point every utterance at the speaker object held by the corpus (looked up by speaker id),
# so each utterance has a speaker object associated with it instead of just a speaker id
for utterance in corpus.iter_utterances():
    utterance.speaker = corpus.get_speaker(utterance.speaker.id)

5539it [00:00, 25401.85it/s]


In [9]:
# print summary statistics for the corpus
# (the recorded output below lists the number of speakers, utterances and conversations)
corpus.print_summary_stats()

Number of Speakers: 73
Number of Utterances: 5539
Number of Conversations: 103


In [11]:
# SAFETY CHECKS

# conversations with exactly this many lines are reported in the CONVOS section below
num_lines = 0

# Speakers
# if everything is fine you should see the same number of speakers in the csv as in the corpus
print("SPEAKERS")
original_speakers = set(df.Person1.astype(str)) | set(df.Person2.astype(str))
new_speakers = {speaker.id for speaker in corpus.iter_speakers()}
print(f"The number of speakers in the csv is {len(original_speakers)}.\nThe number of speakers in the corpus is {len(new_speakers)}.")
print(f"Speakers {new_speakers - original_speakers} are in the coprus but not the csv.")
print(f"Speakers {original_speakers - new_speakers} are in the csv but not the corpus.")

# Convos
# if everything is fine you should see the same number of conversations in the csv as in the corpus
print("\nCONVOS")
# positions (rows of df) whose preprocessed transcript has exactly num_lines lines
n_liners = [row_pos for row_pos, line_count in enumerate(length) if line_count == num_lines]
print(f"There are {len(df)} conversations in the csv.\nThere are {len(corpus.conversations)} in the corpus.")
print(f"Conversations with ids {n_liners} have {num_lines} lines.")
# same check on the corpus side: print the id of every conversation with num_lines utterances
for conversation in corpus.iter_conversations():
    if len(conversation._utterance_ids) == num_lines:
        print(conversation.id)
print(f"The conversations with {num_lines} lines are:")
# last expression of the cell: shows the matching rows of df, or prints "None" when there are none
df.iloc[n_liners] if n_liners else print("None")

SPEAKERS
The number of speakers in the csv is 73.
The number of speakers in the corpus is 73.
Speakers set() are in the coprus but not the csv.
Speakers set() are in the csv but not the corpus.

CONVOS
There are 103 conversations in the csv.
There are 103 in the corpus.
Conversations with ids [] have 0 lines.
The conversations with 0 lines are:
None


In [20]:
# Might want to clean the text: 
#   uncomment the code below to apply the built-in convokit cleaner and fix contractions ("can't" -> "can not") 

#cleaner = TextCleaner()
#corpus = cleaner.transform(corpus) 
#import contractions
#for utt in corpus.iter_utterances():
#    tokens = utt.text.split()
#    text = [contractions.fix(word) for word in tokens]
#    utt.text = " ".join(text)

In [12]:
# save the corpus to disk under the name "corpus", using <path>data/processed/ as the base path
corpus.dump(name="corpus", base_path=path+'data/processed/')