# Pre-processing .CHA files for use in CE analysis

In [1]:
import os
import re
import sys

import pandas as pd
from tqdm import tqdm

In [2]:
# Root data directory (relative path, resolved against the kernel's working directory).
data_path = '../data'
# Directory that is searched recursively for input .cha transcripts.
chas_path = os.path.join(data_path, 'chas')
# Destination CSV written by the last cell of the notebook.
outputs_path = os.path.join(data_path, 'server_ready', 'corpus.csv')

In [3]:
def grab_all_files(PATH, file_type='.cha'):
    """Recursively collect the paths of all files under ``PATH`` with a given suffix.

    Parameters
    ----------
    PATH : str
        Directory to walk; sub-directories are included.
    file_type : str, default '.cha'
        Suffix a file name must end with to be kept (case-sensitive).

    Returns
    -------
    list of str
        ``os.path.join(root, name)`` for every matching file, in ``os.walk``
        order. Empty when nothing matches or ``PATH`` does not exist.
    """
    # A single flat comprehension replaces the former list-of-lists + sum(..., []),
    # which re-copied the accumulated list for every directory visited.
    # Names starting with '._' are skipped (presumably macOS sidecar files that
    # share the real file's extension but are not transcripts).
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(PATH)
        for name in names
        if name.endswith(file_type) and not name.startswith('._')
    ]

# Processing all CHA files

Note: the package used here was developed by Prof. Garber. Cite via:

Garber, L. (2019). CHA file python parser. Zenodo. https://doi.org/10.5281/zenodo.3364020

In [4]:
from shared.CHAFile import *

In [5]:
# Paths of every .cha transcript found (recursively) under `chas_path`.
all_files = grab_all_files(chas_path)

In [6]:
# Build one record per transcript line, tagged with the file and corpus it came from.
# Expected layout: <...>/<corpus>/<file>.cha, or <...>/<corpus>/<eng_n|eng_s>/<file>.cha.

# Characters that mark overlapping speech in the transcripts; compiled once.
overlap_markers = re.compile(r"(⌋|⌊|⌉|⌈)")

records = []
for f in all_files:
    chacha = ChaFile(f)

    # Split on the platform separator and strip only the file's own extension.
    # (The previous `replace('.cha', '')` removed '.cha' anywhere in the path,
    # and splitting on '/' breaks on Windows-style paths.)
    path_parts = os.path.normpath(f).split(os.sep)
    conversation_id = os.path.splitext(path_parts[-1])[0]
    parent_dir = path_parts[-2]

    # For the 'eng_n' / 'eng_s' sub-folders the corpus name is the grandparent
    # folder joined with the sub-folder name (e.g. 'callfriend-eng_s').
    if parent_dir in ('eng_n', 'eng_s'):
        corpus_name = path_parts[-3] + '-' + parent_dir
    else:
        corpus_name = parent_dir

    for line in chacha.getLines():
        record = line.copy()
        record['conversation_id'] = conversation_id
        record['overlapping_text'] = bool(overlap_markers.search(record['text']))
        record['corpus'] = corpus_name
        records.append(record)

# One row per transcript line.
data = pd.DataFrame(records)



In [7]:
# Preview the first rows of the parsed per-utterance table.
data.head()

Unnamed: 0,document_line_no,utterance_no,speaker,text,bullet,recipient,conversation_id,overlapping_text,corpus,com,mor,gra,exp
0,17,1,M1,you don't want to be recorded ↗,"[1104, 2592]",ADULT,6269,False,callfriend-eng_s,,,,
1,18,2,M2,oh I don't care →,"[2752, 3472]",ADULT,6269,False,callfriend-eng_s,,,,
2,19,3,M1,"∙hhh (0.7) all right, so ah this is for the Un...","[3952, 7200]",ADULT,6269,False,callfriend-eng_s,,,,
3,21,4,M2,(0.4) oh ⌈I I still don't mind⌉→,"[7424, 9040]",ADULT,6269,True,callfriend-eng_s,,,,
4,22,5,M1,"⌊oh, and we've got⌋ thirty minutes to talk →","[7564, 10160]",ADULT,6269,True,callfriend-eng_s,,,,


### Correcting utterances / removing CLAN-specific formatting

In [8]:
def corrected_text(text, contraction_replacement_nonce='CCOONNTTRRAACCTTIIOONN'):
    """Strip transcription mark-up from an utterance, keeping only its words.

    Steps, in order:
      1. remove ``((...))`` spans together with their content;
      2. shield word-internal apostrophes (contractions) behind a placeholder;
      3. delete the overlap markers ``⌋ ⌊ ⌉ ⌈`` and all colons;
      4. remove parenthesised numbers such as ``(0.7)`` or ``(12.5)``
         (presumably timed pauses);
      5. replace every remaining non-word, non-space character with a space;
      6. collapse whitespace, restore the apostrophes and trim.

    Square-bracketed spans keep their content; only the brackets are removed
    (in step 5).

    Parameters
    ----------
    text : object
        Raw utterance. Non-string values are converted with ``str``;
        ``None`` and NaN are treated as an empty utterance.
    contraction_replacement_nonce : str
        Placeholder substituted for apostrophes while punctuation is stripped.
        It must consist of word characters only and must not occur in the text.

    Returns
    -------
    str
        The cleaned utterance (possibly empty).
    """
    # Missing values would otherwise be stringified into the words "None" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    res = re.sub(r'\(\(.*?\)\)', ' ', str(text))

    # Shield apostrophes that sit between two word characters, so the
    # punctuation strip below does not split "don't" into "don t". A function
    # replacement keeps the placeholder literal (no backslash interpretation).
    res = re.sub(r"(?<=\w)'(?=\w)", lambda _: contraction_replacement_nonce, res)

    res = re.sub(r"(⌋|⌊|⌉|⌈)", '', res)
    res = res.replace(':', '')

    # Any number of digits on either side of the dot: the former single-digit
    # pattern let e.g. "(12.5)" leak "12 5" into the cleaned text.
    res = re.sub(r'\(\d*\.\d*\)', ' ', res)

    # Remove all other special characters (this also removes '[' and ']').
    res = re.sub(r'[^\w\s]', ' ', res)

    res = re.sub(r'\s+', ' ', res)
    return res.replace(contraction_replacement_nonce, "'").strip()

In [9]:
# Snapshot the untouched transcript text only once, so that re-running this cell
# re-cleans from the original text instead of from already-cleaned text.
if 'raw_text' not in data.columns:
    data['raw_text'] = data['text'].values
data['text'] = [corrected_text(text) for text in tqdm(data['raw_text'].values)]

100%|██████████| 79664/79664 [00:00<00:00, 129129.72it/s]


In [10]:
# Spot-check: raw transcript text next to its cleaned version.
data[['corpus', 'raw_text', 'text']].head(n=6)

Unnamed: 0,corpus,raw_text,text
0,callfriend-eng_s,you don't want to be recorded ↗,you don't want to be recorded
1,callfriend-eng_s,oh I don't care →,oh I don't care
2,callfriend-eng_s,"∙hhh (0.7) all right, so ah this is for the Un...",hhh all right so ah this is for the University...
3,callfriend-eng_s,(0.4) oh ⌈I I still don't mind⌉→,oh I I still don't mind
4,callfriend-eng_s,"⌊oh, and we've got⌋ thirty minutes to talk →",oh and we've got thirty minutes to talk
5,callfriend-eng_s,(0.6) ☺we've thirty minutes to talk☺oh ok→,we've thirty minutes to talk oh ok


## Create juxtaposed corpus: (x, y) pairs of an utterance and a later utterance from the same conversation

In [11]:
# Largest `next_utterance_delta_no` kept when pairing utterances: each utterance is
# paired with at most max_turns_apart + 1 later utterances by other speakers, and
# likewise by the same speaker (deltas 0..max_turns_apart).
max_turns_apart = 10

In [12]:
import warnings

# NOTE(review): this silences *every* warning for the rest of the kernel session.
# Kept for backward compatibility; narrow it to a specific category if possible.
warnings.filterwarnings("ignore")


def _pair_with_following(conversation, max_delta):
    """Pair each utterance of one conversation with utterances that follow it.

    For every row, up to ``max_delta + 1`` of the nearest later rows spoken by a
    different speaker are selected, and up to ``max_delta + 1`` of the nearest
    later rows spoken by the same speaker.

    Parameters
    ----------
    conversation : pandas.DataFrame
        Rows of a single conversation, in transcript order. Must have the
        columns 'speaker', 'text' and 'utterance_no'.
    max_delta : int
        Largest ``next_utterance_delta_no`` to emit.

    Returns
    -------
    list of dict
        One dict per pair: all columns of the earlier row plus 'next_speaker',
        'next_text', 'next_utterance_no' and 'next_utterance_delta_no' (0 for
        the nearest qualifying later row, 1 for the next one, ...). For each
        earlier row the other-speaker pairs come first, then the same-speaker
        pairs.
    """
    # Materialise the rows once; building a Series per pair was the main cost.
    rows = conversation.to_dict('records')
    pairs = []
    for pos, current in enumerate(rows):
        by_other, by_same = [], []
        for idx in range(pos + 1, len(rows)):
            later = rows[idx]
            bucket = by_same if later['speaker'] == current['speaker'] else by_other
            if len(bucket) <= max_delta:
                bucket.append(later)
            # Stop scanning as soon as both buckets are full.
            if len(by_other) > max_delta and len(by_same) > max_delta:
                break

        for bucket in (by_other, by_same):
            for delta, later in enumerate(bucket):
                pair = dict(current)
                pair['next_speaker'] = later['speaker']
                pair['next_text'] = later['text']
                pair['next_utterance_no'] = later['utterance_no']
                pair['next_utterance_delta_no'] = delta
                pairs.append(pair)
    return pairs


corpus = []
# Group on corpus as well as conversation_id: the id is only the file's base name,
# so same-named files from different corpora would otherwise be merged into one
# "conversation". sort=False keeps conversations in their order of first appearance.
for _, sub in tqdm(data.groupby(['corpus', 'conversation_id'], sort=False)):
    corpus.extend(_pair_with_following(sub, max_turns_apart))

100%|██████████| 217/217 [01:23<00:00,  2.61it/s]


In [13]:
# NOTE: `data` is rebound here, from the per-utterance table to the table of
# utterance pairs collected in `corpus` (one row per pair).
data = pd.DataFrame(corpus)
data.head()

Unnamed: 0,document_line_no,utterance_no,speaker,text,bullet,recipient,conversation_id,overlapping_text,corpus,com,mor,gra,exp,raw_text,next_speaker,next_text,next_utterance_no,next_utterance_delta_no
0,17,1,M1,you don't want to be recorded,"[1104, 2592]",ADULT,6269,False,callfriend-eng_s,,,,,you don't want to be recorded ↗,M2,oh I don't care,2,0
1,17,1,M1,you don't want to be recorded,"[1104, 2592]",ADULT,6269,False,callfriend-eng_s,,,,,you don't want to be recorded ↗,M2,oh I I still don't mind,4,1
2,17,1,M1,you don't want to be recorded,"[1104, 2592]",ADULT,6269,False,callfriend-eng_s,,,,,you don't want to be recorded ↗,M2,we've thirty minutes to talk oh ok,6,2
3,17,1,M1,you don't want to be recorded,"[1104, 2592]",ADULT,6269,False,callfriend-eng_s,,,,,you don't want to be recorded ↗,M2,alright well,8,3
4,17,1,M1,you don't want to be recorded,"[1104, 2592]",ADULT,6269,False,callfriend-eng_s,,,,,you don't want to be recorded ↗,M2,I I know I'm I'm prattling with a pro here,10,4


In [14]:
# True when both utterances of a pair were produced by the same speaker.
data['self'] = data['speaker'].eq(data['next_speaker'])

# Order the pairs by corpus, conversation and source utterance, then by
# other-speaker / same-speaker and by the paired utterance; renumber rows from 0.
pair_order = ['corpus', 'conversation_id', 'utterance_no', 'self', 'next_utterance_no']
data = data.sort_values(by=pair_order).reset_index(drop=True)

In [15]:
# Sanity check: fraction of missing values in each key column.
data[['corpus', 'text', 'next_text']].isna().mean()

corpus       0.0
text         0.0
next_text    0.0
dtype: float64

In [16]:
# Preview the sorted pairs: source utterance next to the later utterance it is paired with.
data[['corpus', 'text', 'speaker', 'next_speaker', 'next_text', 'utterance_no', 'next_utterance_no']].head()

Unnamed: 0,corpus,text,speaker,next_speaker,next_text,utterance_no,next_utterance_no
0,callfriend-eng_n,hhh hhh hhh hhh,M2,M1,whatt they didn't say that in the thing,1,2
1,callfriend-eng_n,hhh hhh hhh hhh,M2,M1,publically distributed,1,5
2,callfriend-eng_n,hhh hhh hhh hhh,M2,M1,I didn't read that,1,7
3,callfriend-eng_n,hhh hhh hhh hhh,M2,M1,hhh well the xxx was,1,9
4,callfriend-eng_n,hhh hhh hhh hhh,M2,M1,no Janice was talking to one of her friends an...,1,11


In [17]:
# (number of utterance pairs, number of columns)
data.shape

(1664299, 19)

## Save outputs for server operations.

In [18]:
# Create the destination folder first: to_csv does not create missing
# directories, so a fresh checkout without it would fail only at this last step.
output_dir = os.path.dirname(outputs_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
data.to_csv(outputs_path, index=False, encoding='utf-8')