# Taking .docx files and making them remotely useful

In [None]:
import os
import sys

# Directory holding the input transcripts. Create it and add every transcript
# there before running this script.
# NOTE(review): the original note describes these as ".docx formatted"
# transcripts, but the files listed here are later loaded with pd.read_csv --
# confirm the expected input format.
TRANSCRIPTS_PATH = '../data/transcripts'

# Directory where the corrected .csv formatted data will reside.
CORRECTED_TRANSCRIPTS_PATH = '../data/data'
# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(CORRECTED_TRANSCRIPTS_PATH, exist_ok=True)

# Every entry of the transcripts directory except names starting with '._'
# (presumably macOS sidecar files -- verify). os.listdir returns entries in
# arbitrary order, so sort to keep the processing order reproducible.
files = sorted(
    os.path.join(TRANSCRIPTS_PATH, p)
    for p in os.listdir(TRANSCRIPTS_PATH)
    if not p.startswith('._')
)

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
def corrected_text(text):
    """Return a punctuation-free, whitespace-normalised copy of *text*.

    Steps, in order:
      1. ``text`` is converted with ``str()`` (so ``None`` becomes ``'None'``).
      2. Every ``((...))`` span is replaced by a space.
      3. Square brackets are replaced by spaces; the text between them is kept.
      4. Every character that is neither a word character nor whitespace is
         removed.
      5. Each run of whitespace is collapsed into a single space.

    Whitespace is collapsed last because steps 2-4 can themselves create runs
    of spaces (e.g. ``'a [b] c'`` or ``'well - okay'``). Leading and trailing
    whitespace is collapsed but not stripped.
    """
    res = re.sub(r'\(\(.*?\)\)', ' ', str(text))
    # Brackets become spaces (rather than being deleted with the rest of the
    # punctuation) so that words on either side of a bracket stay separated.
    res = res.replace('[', ' ').replace(']', ' ')
    res = re.sub(r'[^\w\s]', '', res)
    return re.sub(r'\s+', ' ', res)
    
def process_(doc):
    """Merge consecutive rows spoken by the same speaker into single turns.

    Parameters
    ----------
    doc : pandas.DataFrame
        Must provide the columns ``'Speaker'``, ``'Timestamp'`` and
        ``'Statement'``. ``'Statement'`` values must be strings.

    Returns
    -------
    pandas.DataFrame
        One row per speaker turn with the columns ``speaker``, ``timestamp``
        (taken from the first row of the turn), ``utterance`` (cleaned text,
        rows joined with a space), ``unedited`` (raw text, rows joined with a
        space) and ``overlapping_utterance`` (``True`` when any row of the
        turn contains a ``[...]`` span). An empty ``doc`` yields an empty
        frame that still carries these columns.
    """
    columns = ['speaker', 'timestamp', 'utterance', 'unedited', 'overlapping_utterance']
    lines = []
    current_line = None  # turn being accumulated; None until the first row

    rows = zip(doc['Speaker'], doc['Timestamp'], doc['Statement'])
    for speaker, timestamp, statement in tqdm(rows, total=len(doc)):
        overlapping = bool(re.search(r'\[.*?\]', statement))

        if current_line is None or speaker != current_line['speaker']:
            # Speaker changed (or this is the first row): close the previous
            # turn, if any, and open a new one.
            if current_line is not None:
                lines.append(current_line)
            current_line = {
                'speaker': speaker,
                'timestamp': timestamp,
                'utterance': corrected_text(statement),
                'unedited': statement,
                'overlapping_utterance': overlapping,
            }
        else:
            current_line['utterance'] += ' ' + corrected_text(statement)
            current_line['unedited'] += ' ' + statement
            # Keep this a real boolean flag; `+=` would turn it into a count.
            current_line['overlapping_utterance'] = (
                current_line['overlapping_utterance'] or overlapping
            )

    # The turn still being accumulated when the loop ends must be kept too.
    if current_line is not None:
        lines.append(current_line)

    return pd.DataFrame(lines, columns=columns)

def process(doc):
    """Convert every transcript row into a cleaned output row.

    Parameters
    ----------
    doc : pandas.DataFrame
        Must provide the columns ``'Speaker'``, ``'Timestamp'`` and
        ``'Statement'``. ``'Statement'`` values must be strings.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns ``speaker``,
        ``timestamp``, ``utterance`` (cleaned text), ``unedited`` (raw text)
        and ``overlapping_utterance`` (``True`` when the statement contains a
        ``[...]`` span). An empty ``doc`` yields an empty frame that still
        carries these columns.
    """
    columns = ['speaker', 'timestamp', 'utterance', 'unedited', 'overlapping_utterance']
    rows = zip(doc['Speaker'], doc['Timestamp'], doc['Statement'])
    lines = [
        {
            'speaker': speaker,
            'timestamp': timestamp,
            'utterance': corrected_text(statement),
            'unedited': statement,
            'overlapping_utterance': bool(re.search(r'\[.*?\]', statement)),
        }
        for speaker, timestamp, statement in tqdm(rows, total=len(doc))
    ]

    # Keep every row: unlike a placeholder-seeded accumulator, this list has
    # no dummy first entry, so slicing it with [1:] would discard the first
    # real transcript row.
    return pd.DataFrame(lines, columns=columns)

In [None]:
# NOTE: this silences every warning for the rest of the session (behaviour kept
# from the original cell).
import warnings
warnings.filterwarnings('ignore')

# For each transcript file: load it, clean it with process(), attach the file
# label and the "next row" columns, print a short summary and write the result
# to CORRECTED_TRANSCRIPTS_PATH under the same file name.
for file_number, file in enumerate(files):

    # basename handles whatever separator os.path.join used on this platform
    file_name = os.path.basename(file)
    print('{}: {}/{}'.format(file_name, file_number+1, len(files)))

    doc = pd.read_csv(file)
    # Missing statements become the string 'nan' so the regex-based cleaning
    # always receives a str.
    doc['Statement'] = doc['Statement'].astype(str)
    df = process(doc)
    df['file'] = file_name.replace('.csv', '').replace('transcript', '')

    # next_<column> holds the value of <column> from the following row and
    # None on the last row. Whole columns are assigned at once: the chained
    # form `df[col].loc[rows] = ...` writes to a temporary object and does not
    # update `df` under pandas Copy-on-Write.
    for column in ('speaker', 'utterance', 'unedited', 'overlapping_utterance'):
        following = df[column].to_list()[1:]
        padding = [None] * (len(df) - len(following))
        df['next_' + column] = pd.Series(
            following + padding, index=df.index, dtype=object
        )

    print(len(df))
    print(df['speaker'].unique(), df['next_speaker'].unique())
    print('=====][=====')

    df.to_csv(
        os.path.join(CORRECTED_TRANSCRIPTS_PATH, file_name),
        index=False,
        encoding='utf-8'
    )
    

In [None]:
# Preview the first 20 rows of `df`, i.e. the last transcript handled by the
# processing loop.
df.head(n=20)

In [None]:
# Fraction of rows in `df` whose timestamp is missing.
df['timestamp'].isna().mean()