## DS5559 - Project
## Notebook 1 - Import Corpus
#### Name: Mengyao Zhang (mz6jv), Runhao Zhao (rz6dg)

## Synopsis
Use case: import the raw text, process it into token and vocabulary tables, and save the result in F3 form.

## Libraries

In [1]:
import sqlite3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import nltk
# Fetch the NLTK resources this notebook asks for, one download call per resource.
for nltk_resource in ('punkt',
                      'averaged_perceptron_tagger',
                      'stopwords',
                      'tagsets',
                      'wordnet'):
    nltk.download(nltk_resource)
%matplotlib inline

[nltk_data] Downloading package punkt to /home/mz6jv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mz6jv/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/mz6jv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /home/mz6jv/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mz6jv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Process

#### Define OHCO 

In [None]:
# Ordered hierarchy of content objects (OHCO) used as the token-table index.
# The corpus holds several books, so `book_num` is the outermost level.
OHCO = [
    'book_num',
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

#### Define functions

In [2]:
# function to convert text to tokens
def text_to_tokens(src_file,
                   body_start=0,
                   body_end=-1,
                   book_pat=r'^\s*ClassicBook.*$',
                   chap_pat=r'^\s*Chapter.*$',
                   para_pat=r'\n\n+',
                   sent_pat=r'([.;?!"“”]+)',
                   token_pat=r'([\W_]+)'):
    """Convert a plain-text corpus file into a token table and a vocabulary table.

    The text is cut into books, chapters, paragraphs, sentences and tokens; the
    resulting token table is indexed by the module-level ``OHCO`` list.

    Parameters
    ----------
    src_file : str
        Path of the UTF-8 text file to read.
    body_start : int
        Line number of the first line to keep. Values <= 1 start at the first
        line of the file.
    body_end : int
        The slice taken is ``lines[body_start - 1 : body_end + 1]``. The default
        ``-1`` keeps everything through the last line of the file.
    book_pat : str
        Regex matched (``str.match``) against each line to find book headings.
    chap_pat : str
        Regex matched against each line to find chapter headings.
    para_pat : str
        Regex used to split a chapter string into paragraphs.
    sent_pat, token_pat : str
        Not used: sentence and token splitting is done with NLTK. They are kept
        only so that existing calls keep working.

    Returns
    -------
    (tokens, vocab) : tuple of pandas.DataFrame
        ``tokens`` has the columns ``pos``, ``token_str``, ``punc``, ``num``,
        ``term_str`` and ``term_id`` (``-1`` for tokens without a term).
        ``vocab`` is indexed by ``term_id`` and has the columns ``term_str`` and
        ``n`` (the term's count).

    Raises
    ------
    ValueError
        If the selected line range is empty.

    Notes
    -----
    ``book_num`` starts at 1. ``chap_num`` restarts at 0 inside every book, and
    ``para_num``, ``sent_num`` and ``token_num`` are the 0-based positions
    produced by the splits. Blank paragraphs are removed after numbering, so
    ``para_num`` can have gaps.
    """
    # text to lines
    with open(src_file, 'r', encoding='utf-8') as src:
        lines = src.readlines()
    # The original slice was empty for the default arguments (lines[-1:0]);
    # clamp the start and turn an end index of 0 into "through the last line".
    first = max(body_start - 1, 0)
    last = body_end + 1
    lines = lines[first:] if last == 0 else lines[first:last]
    if not lines:
        raise ValueError('no lines selected from %r (body_start=%r, body_end=%r)'
                         % (src_file, body_start, body_end))
    df = pd.DataFrame({'line_str': lines})
    df.index.name = 'line_id'
    # Pad dashes with spaces; these are literal replacements.
    df['line_str'] = df.line_str.str.replace('—', ' — ', regex=False)
    df['line_str'] = df.line_str.str.replace('-', ' - ', regex=False)
    del lines

    # lines to books: a running count of heading lines numbers the books from 1;
    # lines before the first heading (or a text with no heading) go to book 1.
    book_mask = df.line_str.str.match(book_pat)
    df['book_num'] = book_mask.cumsum().clip(lower=1)

    # lines to chapters: same scheme, counted over the whole corpus for now.
    chap_mask = df.line_str.str.match(chap_pat)
    df['chap_num'] = chap_mask.cumsum().clip(lower=1)

    # books to chaps: one string per (book, chapter)
    books = df.groupby(['book_num', 'chap_num']).line_str\
        .apply(''.join)\
        .to_frame('chap_str')

    # Renumber the chapters so that they restart at 0 inside every book.
    chap_in_book = books.groupby(level='book_num').cumcount()
    chaps = books.set_index(
        pd.MultiIndex.from_arrays(
            [books.index.get_level_values('book_num'), chap_in_book.values],
            names=['book_num', 'chap_num']))
    del df, books

    # chapters to paragraphs
    # dropna() removes the padding cells of the expanded split, so the result
    # does not depend on whether stack() itself discards them.
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .dropna()\
        .to_frame('para_str')
    paras.index.names = OHCO[:3]  # ['book_num', 'chap_num', 'para_num']
    # Regex mode is requested explicitly so the patterns are not read as literals.
    paras['para_str'] = paras.para_str.str.strip()\
        .str.replace(r'\'', ' ', regex=True)\
        .str.replace(r'_', ' ', regex=True)\
        .str.replace(r'\n', ' ', regex=True)\
        .str.replace(r'\s+', ' ', regex=True)
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del chaps

    # paragraphs to sentences
    sents = paras.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .dropna()\
        .to_frame('sent_str')
    sents.index.names = OHCO[:4]  # [..., 'sent_num']
    del paras

    # sentences to tokens: each cell is a (token, part-of-speech tag) tuple
    tokens = sents.sent_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
        .stack()\
        .dropna()\
        .to_frame('pos_tuple')
    tokens.index.names = OHCO  # [..., 'token_num']
    tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
    tokens = tokens.drop(columns='pos_tuple')
    del sents

    # Tag punctuation (no word characters) and tokens that start with a digit
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'\d').astype('int')

    # Extract vocab: only tokens that are neither punctuation nor numbers get a
    # term_str; value_counts() skips the missing ones.
    is_word = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[is_word, 'term_str'] = tokens.token_str.str.lower()
    # rename_axis/reset_index(name=...) fix the column names explicitly instead
    # of relying on the names value_counts() happens to give its result.
    vocab = tokens.term_str.value_counts()\
        .rename_axis('term_str')\
        .reset_index(name='n')\
        .sort_values('term_str')\
        .reset_index(drop=True)
    vocab.index.name = 'term_id'

    # Add term_ids to tokens (-1 where the token has no term)
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab

def get_docs(tokens, div_names, doc_str = 'term_id', sep='', flatten=False, 
             index_only=False):
    """Collapse the token table into one document per group.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table; must contain the column named by ``doc_str``.
    div_names : list
        Index level or column names that define a document (passed to groupby).
    doc_str : str
        Column whose values make up each document.
    sep : str
        Separator placed between values when they are joined into a string.
    flatten : bool
        If True, return a DataFrame without the ``div_names`` columns.
    index_only : bool
        If True, each document is the list of values instead of a joined string.

    Returns
    -------
    pandas.Series, or pandas.DataFrame when ``flatten`` is True. The values keep
    the name ``doc_str``.
    """
    grouped = tokens.groupby(div_names)[doc_str]

    if not index_only:
        # Convert to text before joining so that non-string columns such as the
        # default 'term_id' work; missing values are left out of the document.
        docs = grouped.apply(lambda x: sep.join(map(str, x.dropna())))
    else:
        docs = grouped.apply(lambda x: x.tolist())

    if flatten:
        docs = docs.reset_index().drop(columns=div_names)
    
    return docs

def get_term_id(vocab, term_str):
    """Return the index label of the first vocab row whose term_str equals `term_str`.

    Raises IndexError when no row matches.
    """
    is_match = (vocab.term_str == term_str).values
    matching_ids = vocab.index[is_match]
    return matching_ids[0]

def get_term_str(vocab, term_id):
    """Return the term_str stored in the vocab row labelled `term_id`.

    Raises KeyError when the label is not in the vocab index.
    """
    return vocab.loc[term_id, 'term_str']

#### Read in source file and apply functions

In [3]:
# Path of the combined corpus text file, relative to the notebook's working directory.
src_file = "./combined_again.txt"

In [None]:
# Keyword arguments for text_to_tokens(): the source file and the line range to keep.
cfg = {
    'src_file': src_file,
    'body_start': 3,
    'body_end': 642122,
}

In [None]:
# Build the token table (K) and the vocabulary table (V) from the configured source.
K, V = text_to_tokens(**cfg)

#### Check corpus size

In [11]:
# Number of chapters: distinct (book, chapter) pairs in the token index.
K.groupby(level=['book_num', 'chap_num']).ngroups

1622

In [9]:
# Number of paragraphs: distinct (book, chapter, paragraph) triples in the token index.
K.groupby(level=['book_num', 'chap_num', 'para_num']).ngroups

109472

In [10]:
# Number of sentences: distinct (book, chapter, paragraph, sentence) keys in the token index.
K.groupby(level=['book_num', 'chap_num', 'para_num', 'sent_num']).ngroups

316358

In [131]:
# Inspect the last 20 rows of the token table.
K.tail(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str,punc,num,term_str,term_id
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
13,514,167,2,32,VBG,rising,0,0,rising,28055
13,514,167,2,33,NN,sigh,0,0,sigh,29861
13,514,167,2,34,IN,of,0,0,of,22650
13,514,167,2,35,VBG,repining,0,0,repining,27419
13,514,167,2,36,NN,mortality,0,0,mortality,21473
13,514,167,2,37,",",",",1,0,,-1
13,514,167,2,38,CC,and,0,0,and,1156
13,514,167,2,39,",",",",1,0,,-1
13,514,167,2,40,JJ,grateful,0,0,grateful,14697
13,514,167,2,41,IN,with,0,0,with,37125


In [149]:
# Vocabulary size: (number of terms, number of columns).
V.shape

(37712, 2)

### Further process vocab table

#### Add priors to vocab table

In [None]:
# Prior probability of each term: its count divided by the total count of all terms.
total_term_count = V['n'].sum()
V['p'] = V['n'] / total_term_count

#### Add stems to vocab table

In [None]:
# Porter stem of every vocabulary term.
stemmer = nltk.stem.porter.PorterStemmer()
V['port_stem'] = V['term_str'].map(stemmer.stem)

#### Add stopwords flag to vocab table

In [None]:
# Flag the terms found in NLTK's English stop-word list (1 = stop word, 0 = not).
stopwords = set(nltk.corpus.stopwords.words('english'))
V['stop'] = V['term_str'].isin(stopwords).astype('int')

## Save

In [None]:
# SQLite database file that receives the token and vocab tables.
db_file = "project.db"

In [None]:
# Write both tables to SQLite, replacing any existing 'token' / 'vocab' tables.
# `with db:` only commits (or rolls back) the transaction; it does not close the
# connection, so close it explicitly once the writes are done.
db = sqlite3.connect(db_file)
try:
    with db:
        K.to_sql('token', db, if_exists='replace', index=True)
        V.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()

In [None]:
# END