## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import glob

## Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [1]:
data_path = 'gdrive/My Drive/MIDS/NLP_w266/w266_Final_Project/Data'
project_path = 'gdrive/My Drive/MIDS/NLP_w266/w266_Final_Project'

## Data Processing

In [None]:
# Pre-process

alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr|\d)[.]"
suffixes = "(www|WWW|Inc|Ltd|Jr|Sr|Co)"
starters = "(WWW|www|Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|\d)"

def preprocess(text):
    text = " " + str(text) + "  "
    text = text.replace("\n"," ")
    # contraction 
    # specific
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)

    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)

    # Numerical
    text = re.sub(r"[0-9]+([.][0-9]+)", 'DECIMAL', text)
    text = re.sub(r"[0-9]+", 'INTEGER', text)

    # Currency
    text = re.sub(r'\$', 'USD ', text)
    text = re.sub(r'\%', ' PERCENTSIGN', text)

    # to make it easy to split sentence. 
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","PhD")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")

    return text



In [None]:
def split_into_sentences(text):    
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences if '<' not in s and '(' not in s]
    return sentences

In [None]:
## creating a text file with one sentence each and separator inbetween transcripts 
def sentence_text_file_construct (transcript, scope = 'PR'):

    sentence_list = []

    # select only relevant component type
    if scope == 'PR':
        df = transcript[transcript.transcriptcomponenttypeid.isin([2,5])]
    elif scope == 'QA':
        df = transcript[transcript.transcriptcomponenttypeid.isin([3,4,6,8])]
    else: 
        df = transcript[~transcript.transcriptcomponenttypeid.isin([1,7])]

    df = df[['transcriptid', 'transcriptcomponentid', 'componentorder','componenttext']]
    ordered_df = df.groupby('transcriptid').apply(pd.DataFrame.sort_values, 'componentorder')

    for i in df.transcriptid.unique():

        trans_text = ordered_df.loc[i].componenttext.apply(lambda x: split_into_sentences(preprocess(x)))
        for t in trans_text:
            for z in t:
                sentence_list += [z]
        sentence_list += ['\n']

    return sentence_list

In [None]:
# function to write to file 
def write_to_file (list_of_text, file):
    with open (file, 'w+') as f:
        for i in list_of_text:
            f.write("%s\n" % i)

In [None]:
# Pulling everything together
input_file_s =  project_path + 'finBERT/raw_input_file.txt'
for f in glob.glob(data_path +'*.csv'):
    transcripts = pd.read_csv(f)
    try:
        sentence_list = sentence_text_file_construct (transcripts, scope = 'ALL')
        write_to_file (sentence_list, input_file_s)
    except:
        print (f)

gdrive/My Drive/big_data/transcripts_unzipped/transcripts/transcript_key_dev_link.csv


In [None]:
# with open(input_file_s, "r") as f:
#     text = [sentence_text_file_constructnext(f) for x in range (500)]


In [None]:
# write_to_file (text, project_path + 'preprocessing_test.txt')

In [None]:
with open(input_file_s, "r") as f:
    text = [(next(f)) for x in range (500)]

In [None]:
input_file_s =  '/content/drive/My Drive/w266_project/finBERT/raw_input_file.txt'
with open(input_file_s, "r") as f:
    n = sum(1 for i in f)
with open(input_file_s, "r") as f:
    w_by_s = [len(next(f).split()) for x in range(n)]
    

In [None]:
words = sum(w_by_s)
avg_words = words/len(w_by_s)
max_words = max(w_by_s)
print ('The data contains ', n, ' of sentences.')
print ('The data contains ', words, ' of words.')
print ('Average sentence length', avg_words,'.')
print ('Max sentence length', max_words,'.')

The data contains  3611161  of sentences.
The data contains  69391787  of words.
Average sentence length 19.21592169388183 .
Max sentence length 369 .


In [None]:
# Seperate file with just prepared remarks 
input_file_s =  project_path + 'finBERT/raw_input_file_pr.txt'
for f in glob.glob(data_path +'*.csv'):
    transcripts = pd.read_csv(f)
    try:
        sentence_list = sentence_text_file_construct (transcripts, scope = 'PR')
        write_to_file (sentence_list, input_file_s)
    except:
        print (f)

gdrive/My Drive/big_data/transcripts_unzipped/transcripts/transcript_key_dev_link.csv


In [None]:
input_file_s ='/content/gdrive/My Drive/w266_project/finBERT/raw_input_file_pr.txt'
with open(input_file_s, "r") as f:
    n = sum(1 for i in f)
with open(input_file_s, "r") as f:
    w_by_s = [len(next(f).split()) for x in range(n)]
    

In [None]:

words = sum(w_by_s)
avg_words = words/len(w_by_s)
max_words = max(w_by_s)
print ('The data contains ', n, ' of sentences.')
print ('The data contains ', words, ' of words.')
print ('Average sentence length', avg_words,'.')
print ('Max sentence length', max_words,'.')

The data contains  1429618  of sentences.
The data contains  30026157  of words.
Average sentence length 21.002923158494088 .
Max sentence length 369 .
