Yang Yu (yy5bm@virginia.edu) DS 5001 Spring 2023

# Purpose
Create the F2 tables (TOKENS and VOCAB) from the patent documents (DOCS).

# Setup

In [47]:
import pandas as pd
import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

Customize the input directory and project folder below to match your local setup.

In [119]:
# Location of the input data; edit these two values for your machine.
data_in = '/Users/yangyu/Desktop'
data_prefix = 'DS5001'

# NOTE: unpickling can execute arbitrary code, so only load a trusted file.
patents_path = f'{data_in}/{data_prefix}/patents.pickle'

# Keep only the rows whose year_y is 1997, then drop that column.
df = (
    pd.read_pickle(patents_path)
      .loc[lambda frame: frame['year_y'] == 1997]
      .drop(columns='year_y')
)

# Exactly one column must remain at this point; it holds the text and is
# renamed to para_str (a different column count raises a ValueError here).
df.columns = ['para_str']

# Clean data

## Clean

In [121]:
# Work on a copy so the cleaning steps below do not modify `df`.
df1 = df.copy()

In [122]:
# Clean para_str so that only lowercase ASCII words separated by single
# spaces remain.
#
# Every run of non-letter characters (digits, punctuation, newlines and any
# other whitespace) is collapsed into one space, so a separate newline pass is
# not needed. The string accessor is used for every step so the whole chain
# treats missing values the same way (they stay NaN).
df1['para_str'] = (
    df1['para_str']
    .str.replace(r'[^a-zA-Z]+', ' ', regex=True)
    .str.lower()
    .str.strip()
)

# Remove empty paragraphs. na=True makes missing (NaN) paragraphs count as
# empty, so they are dropped here instead of breaking the `~` negation with a
# TypeError.
df1 = df1[~df1['para_str'].str.match(r'^\s*$', na=True)]

## Parse

In [76]:
# Disabled: sentence-level split (kept for reference only).
# NOTE(review): `PARAS` is not defined in the visible cells, so this snippet
# cannot run as written; the cleaning step above also strips all punctuation,
# which sentence tokenization would rely on.
# SENTS = PARAS.para_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
#         .stack()\
#         .to_frame('sent_str')
# SENTS.index.names = ['doc_id','sent_num']

In [123]:
# Preview the cleaned paragraphs.
df1

Unnamed: 0_level_0,para_str
doc_id,Unnamed: 1_level_1
5597916,examples analytical data were recorded for the...
5613043,description of the preferred embodiment fig sh...
5613040,description of the preferred embodiment fig sh...
5615307,description of the preferred embodiment fig sh...
5692098,description of preferred embodiments with refe...
...,...
5648919,description of the preferred embodiments examp...
5645431,detailed description of preferred embodiments ...
5649061,detailed description of the preferred embodime...
5592774,detailed description of the preferred embodime...


## TOKENS

In [124]:
# Tokenizer switch:
#   True  -> split on whitespace only (nltk.WhitespaceTokenizer)
#   False -> use nltk.word_tokenize
# The two branches were previously swapped relative to the flag's name, so
# keep_whitespace=True ran word_tokenize. para_str has already been reduced to
# letters and single spaces, so whitespace splitting needs no further rules.
keep_whitespace = True

# Choose the tokenizer once so the tagging pipeline is written a single time.
if keep_whitespace:
    tokenize = nltk.WhitespaceTokenizer().tokenize
else:
    tokenize = nltk.word_tokenize

# Tag each paragraph's tokens, spread the (token, tag) tuples across columns,
# then stack them into one row per token. stack() drops the NaN padding that
# shorter paragraphs receive, leaving a (doc index, token position) index.
TOKENS = df1.para_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(tokenize(x))))\
        .stack()\
        .to_frame('pos_tuple')

In [125]:
# Preview the stacked (token, tag) tuples.
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5597916,0,"(examples, NNS)"
5597916,1,"(analytical, JJ)"
5597916,2,"(data, NNS)"
5597916,3,"(were, VBD)"
5597916,4,"(recorded, VBN)"
...,...,...
5619621,4717,"(permitted, VBN)"
5619621,4718,"(by, IN)"
5619621,4719,"(the, DT)"
5619621,4720,"(prior, JJ)"


In [126]:
# Name the two index levels: the document id and the token's position in it.
TOKENS.index = TOKENS.index.set_names(['doc_id', 'token_num'])

# Unpack each (token, tag) tuple into separate columns.
TOKENS['pos'] = TOKENS['pos_tuple'].str.get(1)
TOKENS['token_str'] = TOKENS['pos_tuple'].str.get(0)

# term_str is the lowercased token; it is the key used to build VOCAB.
TOKENS['term_str'] = TOKENS['token_str'].str.lower()

In [127]:
# Keep only the rows that carry a POS tag.
# NOTE(review): nltk.pos_tag tags every token it is given (punctuation is
# tagged, not left missing), so this presumably removes nothing - confirm.
TOKENS = TOKENS.dropna(subset=['pos'])

In [128]:
# Preview TOKENS with the derived pos, token_str and term_str columns.
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,pos_tuple,pos,token_str,term_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5597916,0,"(examples, NNS)",NNS,examples,examples
5597916,1,"(analytical, JJ)",JJ,analytical,analytical
5597916,2,"(data, NNS)",NNS,data,data
5597916,3,"(were, VBD)",VBD,were,were
5597916,4,"(recorded, VBN)",VBN,recorded,recorded
...,...,...,...,...,...
5619621,4717,"(permitted, VBN)",VBN,permitted,permitted
5619621,4718,"(by, IN)",IN,by,by
5619621,4719,"(the, DT)",DT,the,the
5619621,4720,"(prior, JJ)",JJ,prior,prior


In [None]:
# List the distinct part-of-speech tags present in TOKENS.
TOKENS.pos.unique()

## VOCAB

In [129]:
# One row per distinct term, with its corpus-level frequency statistics.
VOCAB = TOKENS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()    # relative frequency of the term
VOCAB['i'] = -np.log2(VOCAB.p)          # -log2(p)
VOCAB['n_chars'] = VOCAB.index.str.len()

# Term-by-POS count table, built once and reused for both POS features.
# Cells for (term, tag) pairs that never occur are NaN.
TPM = TOKENS[['term_str','pos']].value_counts().unstack()
VOCAB['max_pos'] = TPM.idxmax(axis=1)   # most frequent tag (NaN cells are skipped)
VOCAB['n_pos'] = TPM.count(axis=1)      # number of distinct tags seen for the term

# stopwords: flag terms that appear in NLTK's English stop-word list.
# VOCAB is rebuilt at the top of this cell, so no 'stop' column can exist yet
# and the join needs no guard.
sw = pd.DataFrame({'stop': 1}, index=nltk.corpus.stopwords.words('english'))
sw.index.name = 'term_str'
VOCAB = VOCAB.join(sw)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

# stems: Porter stem of each term. PorterStemmer comes from the import cell at
# the top of the notebook; stemming the index directly avoids a row-wise
# DataFrame.apply.
stemmer = PorterStemmer()
VOCAB['p_stem'] = [stemmer.stem(term) for term in VOCAB.index]

In [143]:
# Copy TOKENS so the bag-of-words step works on its own frame.
CORPUS = TOKENS.copy()

In [145]:
# Grouping key(s) that define one bag; here each document (doc_id) is a bag.
bag = ['doc_id']

In [152]:
def create_bow(CORPUS, bag, item_type='term_str'):
    """Count how often each item occurs in each bag.

    Parameters
    ----------
    CORPUS : DataFrame
        Token table containing the `item_type` column; the names in `bag`
        must be resolvable by `groupby` (columns or index levels).
    bag : list of str
        Grouping keys that define one bag (for example ``['doc_id']``).
    item_type : str
        Column whose values are counted within each bag.

    Returns
    -------
    DataFrame
        Indexed by the `bag` keys plus `item_type`, with a single column
        ``n`` holding the number of non-null occurrences.
    """
    group_keys = bag + [item_type]
    item_counts = CORPUS.groupby(group_keys)[item_type].count()
    return item_counts.to_frame('n')


def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute TF-IDF weights and IDF values from a bag-of-words table.

    Parameters
    ----------
    BOW : DataFrame
        Count table with a column ``n`` and a two-level index; the last index
        level is unstacked into columns (items) and the first becomes the
        rows (bags).
    tf_method : str
        Term-frequency scheme: 'sum' (count / bag total), 'max' (count / bag
        maximum), 'log' (log2(1 + count)), 'raw' (count) or 'bool' (0/1).
    df_method : str
        IDF scheme: 'standard' (log2(N/DF)), 'textbook' (log2(N/(DF+1))),
        'sklearn' (log2(N/DF) + 1) or 'sklearn_smooth'
        (log2((N+1)/(DF+1)) + 1), where N is the number of bags and DF the
        number of bags containing the item.
    item_type : str
        Accepted for signature compatibility; not used by the computation.

    Returns
    -------
    (TFIDF, IDF)
        TFIDF is the bag-by-item weight matrix; IDF is a Series per item.

    Raises
    ------
    ValueError
        If `tf_method` or `df_method` is not one of the names listed above.
    """
    # Bag-by-item count matrix; items absent from a bag get a count of 0.
    counts = BOW.n.unstack(fill_value=0)

    # Term frequency: how prominent each item is inside each bag. A higher TF
    # means the item matters more within that bag.
    tf_rules = {
        'sum': lambda m: (m.T / m.T.sum()).T,
        'max': lambda m: (m.T / m.T.max()).T,
        'log': lambda m: (np.log2(1 + m.T)).T,
        'raw': lambda m: m,
        'bool': lambda m: m.astype('bool').astype('int'),
    }
    if tf_method not in tf_rules:
        raise ValueError(f"TF method {tf_method} not found.")
    TF = tf_rules[tf_method](counts)

    # Document frequency: the number of bags in which each item appears.
    doc_freq = counts.astype('bool').sum()
    n_bags = len(counts)

    # Inverse document frequency: the larger the IDF, the more an item is
    # specific to the few bags it appears in.
    idf_rules = {
        'standard': lambda: np.log2(n_bags / doc_freq),
        'textbook': lambda: np.log2(n_bags / (doc_freq + 1)),
        'sklearn': lambda: np.log2(n_bags / doc_freq) + 1,
        'sklearn_smooth': lambda: np.log2((n_bags + 1) / (doc_freq + 1)) + 1,
    }
    if df_method not in idf_rules:
        raise ValueError(f"DF method {df_method} not found.")
    IDF = idf_rules[df_method]()

    return TF * IDF, IDF

In [156]:
# Per-document term counts, then TF-IDF with get_tfidf's default settings.
BOW = create_bow(CORPUS, bag, item_type='term_str')
TFIDF, IDF = get_tfidf(BOW)

# Document-term count matrix; terms absent from a document count as 0.
DTCM = BOW.n.unstack(fill_value=0).astype('int')

# compute DF: the number of documents in which each term occurs
DF = DTCM.ne(0).sum()

# compute DFIDF: document frequency weighted by inverse document frequency
VOCAB['df'] = DF
VOCAB['idf'] = IDF
VOCAB['dfidf'] = VOCAB['df'] * VOCAB['idf']

# Save

Customize the output directory below before saving.

In [None]:
# Output folder, derived from the setup-cell settings so the location only has
# to be customized in one place. With the current data_in / data_prefix this
# is still '/Users/yangyu/Desktop/DS5001/output'.
path = f'{data_in}/{data_prefix}/output'

# Persist the cleaned paragraphs.
df1.to_pickle(f'{path}/df1.pickle')

In [158]:
# Output folder, derived from the setup-cell settings so the location only has
# to be customized in one place. With the current data_in / data_prefix this
# is still '/Users/yangyu/Desktop/DS5001/output'.
path = f'{data_in}/{data_prefix}/output'

# Persist the token table and the vocabulary table.
TOKENS.to_pickle(f'{path}/TOKENS.pickle')
VOCAB.to_pickle(f'{path}/VOCAB.pickle')