# Taking .docx files and making them remotely useful

In [None]:
import os
import sys

# Directory holding the raw transcripts.
#   (Create it before running this script and place every .docx formatted
#    transcript there.)
TRANSCRIPTS_PATH = 'data/transcripts'

# Directory where the corrected .csv formatted data will be written.
CORRECTED_TRANSCRIPTS_PATH = 'data/data'
# makedirs(exist_ok=True) also creates missing parents and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(CORRECTED_TRANSCRIPTS_PATH, exist_ok=True)

# Only real Word documents are collected: '._*' (macOS resource forks) and
# '~$*' (Word lock files) are skipped, as is anything that is not a .docx,
# because the docx loader would fail on them.  Sorting makes the processing
# order reproducible, since os.listdir() returns entries in arbitrary order.
files = [
    os.path.join(TRANSCRIPTS_PATH, p)
    for p in sorted(os.listdir(TRANSCRIPTS_PATH))
    if p.lower().endswith('.docx') and not p.startswith(('._', '~$'))
]

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from docx import Document

In [None]:
def process_(doc):
    """Group blank-line-delimited paragraphs into speaker turns.

    A paragraph containing ``'R ('`` or ``'L ('`` is treated as a speaker
    line of the form ``<speaker>: <text>``.  Other non-empty paragraphs are
    appended (without a separator) to the text of the turn in progress.  An
    empty paragraph closes the turn; turns without a speaker are discarded.

    Args:
        doc: Object exposing ``paragraphs``, an iterable of items with a
            ``text`` attribute (presumably a python-docx ``Document``).

    Returns:
        list[dict]: One ``{'speaker': str, 'text': str}`` dict per turn, in
        document order.
    """
    document = []
    line = {'speaker': '', 'text': ''}

    for paragraph in doc.paragraphs:
        content = paragraph.text

        if content == '':
            # A blank paragraph ends the current turn.
            if line['speaker']:
                document.append(line)
            line = {'speaker': '', 'text': ''}
        elif ('R (' in content) or ('L (' in content):
            # Split on the FIRST colon only so that later colons (for
            # example timestamps such as 12:34) stay in the text.
            speaker, _, remainder = content.partition(':')
            line['speaker'] = speaker.strip()
            # Very short remainders (two characters or fewer) are ignored.
            if len(remainder) > 2:
                line['text'] += remainder.strip()
        else:
            line['text'] += content

    # Flush the last turn when the document does not end with a blank
    # paragraph; otherwise the final utterance would be silently lost.
    if line['speaker']:
        document.append(line)

    return document

def process(doc):
    """Split a transcript into consecutive speaker turns.

    A paragraph containing ``'R ('`` or ``'L ('`` is searched for a speaker
    tag matching ``[A-Z]+ \\([A-Z]\\)`` (after collapsing doubled parentheses
    and turning a trailing ``':'`` into ``')'``).  When a tag is found, the
    turn in progress is closed and a new one is started for that speaker.
    Text in front of the tag is credited to the turn being closed.  All
    other paragraphs are appended (without a separator) to the turn in
    progress.

    Args:
        doc: Object exposing ``paragraphs``, an iterable of items with a
            ``text`` attribute (the caller passes a python-docx
            ``Document``).

    Returns:
        list[dict]: ``{'speaker': str, 'text': str}`` dicts in document
        order.  If at least one speaker tag is found, the first entry is the
        text preceding it and has an empty ``'speaker'``.
    """
    speaker_tag = r'[A-Z]+ \([A-Z]\)'
    document = []
    line = {'speaker': '', 'text': ''}

    for paragraph in doc.paragraphs:
        raw = paragraph.text

        if ('R (' not in raw) and ('L (' not in raw):
            line['text'] += raw
            continue

        # Text before a blank line inside the paragraph still belongs to
        # the turn in progress; only the part after it may hold the tag.
        leading, candidate = '', raw
        if '\n\n' in raw:
            leading, candidate = raw.split('\n\n', 1)

        candidate = candidate.replace('((', '(').replace('))', ')')
        if candidate.endswith(':'):
            # Repairs tags typed as "R (A:" instead of "R (A)".
            candidate = candidate[:-1] + ')'

        match = re.search(speaker_tag, candidate)
        if match is None:
            # Looked like a speaker line but carries no valid tag: report
            # it and keep the paragraph as ordinary text instead of
            # crashing on an empty split separator.
            print(candidate)
            line['text'] += raw
            continue

        # Close the current turn, crediting it with everything that came
        # before the tag.  The leading speaker-less entry is pushed as well,
        # which keeps positions in the returned list stable.
        line['text'] += leading + candidate[:match.start()]
        document.append(line)

        # NOTE(review): text following the tag in the same paragraph is
        # discarded, exactly as before -- confirm this is intended.
        line = {'speaker': match.group(0), 'text': ''}

    # Flush the final turn; without this the last utterance of every
    # transcript would be dropped.
    if line['speaker']:
        document.append(line)

    return document

In [None]:
import warnings

# NOTE(review): this silences every warning for the rest of the session.
# It is kept to preserve the existing behaviour, but consider narrowing it.
warnings.filterwarnings('ignore')

# Compiled once and reused for every row of every transcript.
timestamp_re = re.compile(r'\d+:\d+')

# Convert each transcript into a CSV in which every row also carries the
# next utterance spoken by a different speaker.
for file_number, file in enumerate(files):

    # basename() handles whichever separator os.path.join() produced.
    file_name = os.path.basename(file)
    print('{}: {}/{}'.format(file_name, file_number+1, len(files)))

    doc = process(Document(file))
    # Naming the columns keeps the frame usable when no rows were parsed.
    df = pd.DataFrame(doc, columns=['speaker', 'text'])
    df['file'] = file
    # Position in the parsed transcript, recorded before any row is dropped.
    df['line_no'] = df.index

    # The first "digits:digits" token in the text is taken as the timestamp.
    matches = df['text'].apply(timestamp_re.findall)
    df['timestamp'] = [found[0] if found else None for found in matches]

    # Drop rows without a speaker.  The explicit copy makes the frame
    # independent of the unfiltered one, so later writes are unambiguous.
    has_speaker = df['speaker'].notna() & ~df['speaker'].isin(['', ' '])
    df = df.loc[has_speaker].copy()

    # Remove every timestamp token from the text itself.
    df['text'] = df['text'].apply(lambda x: timestamp_re.sub('', x))

    # Create the aligned document.
    for column in ('speaker2', 'text2', 'timestamp2'):
        df[column] = None

    for i in tqdm(df.index):
        # Later rows spoken by someone other than the speaker of row i.
        indexes = df.loc[
            (df['speaker'] != df.at[i, 'speaker'])
            & (df['line_no'] > i)
        ].index
        if len(indexes):
            j = indexes[0]
            # .at writes straight into the frame.  The chained form
            # df[col].loc[i] = ... may update a temporary copy and is a
            # no-op under pandas Copy-on-Write.
            df.at[i, 'speaker2'] = df.at[j, 'speaker']
            df.at[i, 'text2'] = df.at[j, 'text']
            df.at[i, 'timestamp2'] = df.at[j, 'timestamp']

    print(len(df))
    print(df['speaker'].unique(), df['speaker2'].unique())
    print('=====][=====')

    # Swap only the extension, not every '.docx' found inside the name.
    csv_name = os.path.splitext(file_name)[0] + '.csv'
    df.to_csv(
        os.path.join(CORRECTED_TRANSCRIPTS_PATH, csv_name),
        index=False,
        encoding='utf-8'
    )
    

In [None]:
# Preview the first 20 rows of the most recently processed transcript.
df.head(20)

In [None]:
# Fraction of rows in the last processed transcript that have no timestamp.
df['timestamp'].isnull().mean()