# ETA Project
By Yihnew Eshetu

## Create TOKEN, LIB, and VOCAB

In [1]:
import re
import os
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import plotly_express as px
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch

from glob import glob
from pathlib import Path
from scipy.linalg import norm
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist
from nltk.stem.porter import PorterStemmer

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/yte9pc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yte9pc/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### OHCO Level

In [3]:
OHCO = ['file_id', 'company', 'title', 'speaker', 'sent_num', 'token_num']
SENTS = OHCO[:5]
SPEAKER = OHCO[:4]
TITLE = OHCO[:3]
COMPANY = OHCO[:2]
FILE = OHCO[:1]

### File Path

In [4]:
notebooks = os.getcwd()
transcript_path = notebooks + '/Transcript/'
transcript_path

'/sfs/qumulo/qhome/yte9pc/ETA Project/Transcript/'

In [5]:
def readTxt(file_list):
    lib = []
    doc = []
    count = 0
    for file_name in file_list:
        file = open(file_name, 'r', encoding='ISO-8859-1').read()
        if re.search('\sQUESTIONS\sAND\sANSWERS\s-', file, re.I) is not None:
            txt = re.sub('COMPANY\sDISCLAIMERS\sImportant(.+)', '', file, flags=re.I)
            txt = re.sub('\sQUESTIONS\sAND\sANSWERS\s-', '  QUESTIONS AND ANSWERS - ', txt, flags=re.I)
        elif re.search('\s\sQUESTIONS\sAND\sANSWERS\s', file, re.I) is not None:
            txt = re.sub('COMPANY\sDISCLAIMERS\sImportant(.+)', '', file, flags=re.I)
            txt = re.sub('\s\sQUESTIONS\sAND\sANSWERS\s', '  QUESTIONS AND ANSWERS - ', txt, flags=re.I)
        else:
            count += 1
            continue
                    
        txt = re.split('\s\sQUESTIONS\sAND\sANSWERS\s-\s', txt, re.I)
        
        title = txt[0].split('--')[0].split('\t"')[1].split('  ')
        report_title = title[0]
        date = re.search(r'[a-z]+\s(\d{2}|\d{1}),\s\d{4}', title[1], re.I).group()
        
        txt = re.sub('^------------------------------------------------------------------------------- ', '', txt[1])
        txt = re.sub('\s\s', ' -------------------------------------------------------------------------------- ', txt)
        txt = re.sub('\[([0-9]+)\]', '[1] -------------------------------------------------------------------------------- ', txt)
        txt = re.split(r' -------------------------------------------------------------------------------- ', txt)
        
        
        df = pd.DataFrame(txt, columns=['line_str'])
        
        company = file_name.split('/')[0]
        file_id = file_name.split('/')[1].split('_')[0]
        df['file_id'] = file_id
        df['company'] = company
        df['title'] = report_title
        
        speakers_lines = df.line_str.map(lambda x: str(x)[-4:].strip()).str.lower().str.match(pat = '[\[]+[0-9]+') 
        speakers = [i for i in df.loc[speakers_lines].line_str.map(lambda x: str(x).split(',')[0].split(' [')[0]).values]
        df.loc[speakers_lines, 'speaker'] = speakers
        df.speaker = df.speaker.ffill()
        df = df.loc[~df.speaker.isna()] 
        df = df.loc[~speakers_lines]
        
        # Speaker
        df['line_str'] = df['line_str'].str.strip()
        dfc = df.groupby(OHCO[:4]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string
    
        # Sentence
        dfc['line_str'] = dfc['line_str'].str.strip()
        dfc = dfc[~dfc['line_str'].str.match(r'^\s*$')]
        dfs = dfc['line_str'].str.split(r'[.?!;:"]+', expand=True).stack().to_frame().rename(columns={0:'sent_str'})
        dfs.index.names = OHCO[:5]
        
        # Token
        dfs['sent_str'] = dfs['sent_str'].str.strip()
        dft = dfs['sent_str'].str.split(r"[\s',-]+", expand=True).stack().to_frame().rename(columns={0:'token_str'})
        dft.index.names = OHCO[:6]
        
        lib.append((file_id, company, date, report_title, file_name))
        doc.append(dfc)
        

    library = pd.DataFrame(lib, columns=['file_id', 'company', 'date', 'title', 'file_name'])
    docs = pd.concat(doc)
    return docs, library

### Read in Files

In [6]:
os.chdir(transcript_path)
companies = ['Cisco', 'IBM', 'Intel', 'Verizon']
list_files = [file for company in companies for file in sorted(glob(company + '/*.txt'))]
doc, lib = readTxt(list_files)

In [7]:
doc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,line_str
file_id,company,title,speaker,Unnamed: 4_level_1
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brent Bracelin,"Thank you for taking the question. Chuck, I wa..."
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brian White,I'm wondering if you could walk us through wha...
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Chuck Robbins,"Ittai, this is Chuck. Thanks for the questions..."
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Ittai Kidron,"Thanks, and congrats on great execution. First..."
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,James Faucette,"Great, thank you very much. I just had a clari..."


In [8]:
os.chdir(notebooks)
dataset_path = 'dataset/processed-files/final'
dataset_folder = Path('dataset/processed-files/final')
dataset_folder.mkdir(parents=True, exist_ok=True)

### DOCUMENT

In [9]:
doc.to_csv(dataset_folder/'DOC.csv')

### LIBRARY

In [10]:
lib['year'] = lib.date.map(lambda x: str(x).strip()).str.extract(pat = '(\d{4})')
lib = lib[['file_id', 'file_name', 'company', 'title', 'date', 'year']].set_index('file_id')
lib.to_csv(dataset_folder/'LIB.csv')

### TOKEN

In [11]:
#### Function for Tokenizing doc

In [12]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False):
    
    # Paragraphs to Sentences
    df = doc_df.line_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    
    # Sentences to Tokens
    # Local function to pick tokenizer
    def word_tokenize(x):
        if ws:
            s = pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))
        else:
            s = pd.Series(nltk.pos_tag(nltk.word_tokenize(x)))
        return s
            
    df = df.sent_str\
        .apply(word_tokenize)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    
    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        df = df.drop('pos_tuple', 1)
    
    # Add index
    df.index.names = OHCO
    
    return df

In [13]:
TOKEN = tokenize(doc, ws=True)
# Remove white space
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace('[\W_]', '')

### VOCAB

In [14]:
# Create VOCAB from TOKEN table
VOCAB = TOKEN.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'
# Check if a term string is a number
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')

In [17]:
# Stem VOCAB term string
stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(stemmer.stem)
# Add term id to TOKEN table
TOKEN['term_id'] = TOKEN.term_str.map(VOCAB.reset_index().set_index('term_str').term_id)
# Add part of speech max to VOCAB table
VOCAB['pos_max'] = TOKEN.groupby(['term_id', 'pos']).count().iloc[:,0].unstack().idxmax(1)
# Add stop word
stops = set(stopwords.words("english"))    
VOCAB['is_stopword'] = VOCAB['term_str'].apply(lambda x: x in stops)

In [18]:
# Remove empty term string rows in TOKEN and VOCAB table
TOKEN = TOKEN[~TOKEN.term_str.isna()]
VOCAB = VOCAB[~VOCAB.term_str.isna()]

In [19]:
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,pos_tuple,pos,token_str,term_str,term_id
file_id,company,title,speaker,sent_num,token_num,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brent Bracelin,0,0,"(Thank, NNP)",NNP,Thank,thank,26308
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brent Bracelin,0,1,"(you, PRP)",PRP,you,you,29413
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brent Bracelin,0,2,"(for, IN)",IN,for,for,11078
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brent Bracelin,0,3,"(taking, VBG)",VBG,taking,taking,25912
575f8b498cfe5b23768b45e8,Cisco,Q3 2016 Cisco Systems Inc Earnings Call,Brent Bracelin,0,4,"(the, DT)",DT,the,the,26334


In [20]:
VOCAB.head()

Unnamed: 0_level_0,term_str,n,num,p_stem,pos_max,is_stopword
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,28456,0,,:,False
1,0.0,46,1,0.0,CD,False
2,0.0,9,1,0.0,VB,False
3,1.0,1,1,1.0,CD,False
4,4.0,2,1,4.0,NN,False


In [21]:
TOKEN.to_csv(dataset_folder/'TOKEN.csv')
VOCAB.to_csv(dataset_folder/'VOCAB.csv')