In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import gensim
from gensim import models
from gensim import corpora
import ast

import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
%matplotlib inline

import unicodedata
from nltk.tokenize import word_tokenize
from string import punctuation
remove_terms = punctuation
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import spacy
nlp = spacy.load('en')

from sklearn.feature_extraction.text import TfidfVectorizer

### Import the necessary packages: nltk for word and sentence tokenization, spaCy for lemmatization, gensim for the phrase-detection model, and scikit-learn for the TF-IDF model.

In [None]:
df1 = pd.read_csv('df1.csv')
df1.head() # has all the necessary columns, but keyword and keyword_norm are not clean

In [None]:
df_check = pd.read_csv('df_key_0620.csv')

In [None]:
df_check.head(10)

In [None]:
len(df_check)

In [None]:
# Flatten the keyword lists of the first N_CHECK_ROWS rows into one list.
# Each 'key_0620' cell holds the string repr of a Python list, hence ast.literal_eval.
N_CHECK_ROWS = 500
keywords = []
for raw_keys in df_check['key_0620'].iloc[:N_CHECK_ROWS]:
    # extend() grows the list in place; `keywords = keywords + a` copied the whole
    # list on every iteration. Slicing also tolerates frames shorter than N_CHECK_ROWS.
    keywords.extend(ast.literal_eval(raw_keys))
len(keywords)

In [None]:
from collections import Counter
# Tally how often each keyword occurs, then list the (keyword, count) pairs
# from the most to the least frequent.
dict_final = Counter(keywords)
sorted_x = dict_final.most_common()
sorted_x

In [None]:
df_key_v1 = pd.read_csv('df_key_v1.csv')
#This version only has 3 columns, bibcode and cleaned 'keyword' and 'keyword_norm' (nan was replaced with placeholders). 

In [None]:
# Initial stop words: the nltk English stop-word collection plus the Thomson Reuters
# list of words common in scientific publications (whitespace-separated in reuters_wos.txt).
stop_words = nltk.corpus.stopwords.words('english')
with open('reuters_wos.txt') as f:  # the context manager closes the file handle
    reu_stop = f.read().split()
stop_words.extend(x for x in reu_stop if x not in stop_words)
len(stop_words)

In [None]:
# Customized stop words for the pre-processing step (one term per line in pre2_stop.txt)
places = []
with open('pre2_stop.txt', 'r') as filehandle:
    for line in filehandle:
        # Strip only the line terminator. Slicing off the last character (line[:-1])
        # would truncate the final term when the file does not end with a newline.
        currentPlace = line.rstrip('\n')
        places.append(currentPlace)

In [None]:
stop_words.extend(x for x in places if x not in stop_words)
len(stop_words)

In [None]:
# Below are preprocessing functions.

def remove_accented_chars(text):
    """Return `text` with accented characters reduced to plain ASCII.

    The text is NFKD-decomposed, so accents become separate combining marks,
    and every code point that cannot be encoded as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')


def lemmatize_text(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Tokens whose lemma is the '-PRON-' placeholder keep their original text.
    Returns the resulting tokens joined by single spaces.
    """
    parsed = nlp(text)
    return ' '.join(tok.text if tok.lemma_ == '-PRON-' else tok.lemma_ for tok in parsed)


def remove_stopwords(text):
    """Remove every token of `text` that is listed in the module-level `stop_words`.

    The text is split with nltk's `word_tokenize`, each token is stripped of
    surrounding whitespace, and the surviving tokens are re-joined with spaces.
    """
    # Built on every call so stop words appended later in the notebook are honoured,
    # while each membership test stays O(1) instead of scanning the list.
    blocked = set(stop_words)
    tokens = (token.strip() for token in word_tokenize(text))
    return ' '.join(token for token in tokens if token not in blocked)


def normalize_corpus(corpus, lemm = True, stopword_removal = True):
    """Normalize every document in `corpus` and return the results as a new list.

    Steps per document: strip accents, lower-case, turn runs of newlines into a
    space, collapse repeated spaces, trim, delete every character that is not an
    ASCII letter, digit or whitespace, then optionally lemmatize and remove stop words.

    Parameters
    ----------
    corpus : iterable of str
        The documents to normalize.
    lemm : bool
        If true, run `lemmatize_text` on each document.
    stopword_removal : bool
        If true, run `remove_stopwords` on each document.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    normalized_corpus = []
    for doc in corpus:
        doc = remove_accented_chars(doc)
        doc = doc.lower()
        # Inside a character class '|' is a literal, so the previous pattern
        # '[\r|\n|\r\n]+' also replaced pipe characters; only newlines are meant here.
        doc = re.sub(r'[\r\n]+', ' ', doc) # remove extra newlines
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        # The flags must be passed by keyword: the 4th positional argument of re.sub is
        # `count`, so re.I|re.A (== 258) silently capped the number of removals at 258.
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A) # remove special characters
        if lemm:
            doc = lemmatize_text(doc)
        if stopword_removal:
            doc = remove_stopwords(doc)
        normalized_corpus.append(doc)
    return normalized_corpus

In [None]:
df_key_v1.loc[0,'keyword'] # is a string!!

In [None]:
# Collect the raw 'keyword' and 'keyword_norm' cells of every row as plain Python lists
keyword_list = [df_key_v1.loc[row, 'keyword'] for row in range(len(df_key_v1))]
keyword_norm_list = [df_key_v1.loc[row, 'keyword_norm'] for row in range(len(df_key_v1))]

In [None]:
# Normalize every keyword and keyword_norm entry (lemmatized, stop words kept).
# Each cell holds the string repr of a list, so it is parsed with ast.literal_eval first.
# Lemmatizing makes e.g. 'galaxies evolution' the same as 'galaxy evolution'.
norm_keyword_all = [
    normalize_corpus(ast.literal_eval(keyword_list[row]), lemm = True, stopword_removal = False)
    for row in range(len(df_key_v1))
]
norm_keyword_norm_all = [
    normalize_corpus(ast.literal_eval(keyword_norm_list[row]), lemm = True, stopword_removal = False)
    for row in range(len(df_key_v1))
]

#### Save the output of the above cell as well; it takes too long to recompute.

In [None]:
# Per paper: the union of its normalized keyword and keyword_norm entries, duplicates removed
key_set_1 = [
    list(set(norm_keyword_all[row] + norm_keyword_norm_all[row]))
    for row in range(len(norm_keyword_all))
]

In [None]:
df_key_v1['key_set_1'] = key_set_1

In [None]:
df_key_v1.columns
# key_set_1 is combined cleaned-up keyword and keyword-norm

In [None]:
from pathlib import Path

# version control: ['bibcode', 'keyword', 'keyword_norm', 'key_set_1'] with key_set_1 being
# the combined cleaned-up keyword and keyword-norm.
# The Downloads folder is resolved from the current user's home directory instead of
# hard-coding one user's absolute path.
df_key_v1.to_csv(Path.home() / 'Downloads' / 'df_key_v1.csv', index=False)

In [None]:
# Count the total and the unique phrases in the normalized keyword / keyword_norm lists,
# to judge how much material they offer for training the bigram model
key_norm = [phrase for row in range(len(df_key_v1)) for phrase in norm_keyword_all[row]]
key_norm_norm = [phrase for row in range(len(df_key_v1)) for phrase in norm_keyword_norm_all[row]]
print(len(key_norm), len(np.unique(key_norm)), len(key_norm_norm), len(np.unique(key_norm_norm)))

### Moving on to titles!

In [None]:
# Each 'title' cell is the string repr of a list; keep its first element as the title text
title_list = [ast.literal_eval(df1.loc[row, 'title'])[0] for row in range(len(df1['title']))]
len(title_list)

In [None]:
# Pre-process the titles (lemmatization + stop-word removal), then split each one into words
norm_title_all = normalize_corpus(title_list, lemm = True, stopword_removal = True)
title2word = [doc.split() for doc in norm_title_all]

In [None]:
df_key_v1['title2word'] = title2word
df_key_v1.columns
#'title2word' is cleaned up titles in the format of list of words

Start with the TF-IDF model, because it should give us a better idea of which high-frequency words are present.
Use subsets of the data to speed up the iteration.

In [None]:
abs_list = df1['abstract']
np.unique(abs_list.isna(), return_counts = True)

In [None]:
remove_terms = punctuation #+ '0123456789'#
remove_terms
# for now, leave the numbers in the texts, because they might be object names, eg, ngc 4343. 
# But in the end, remove them because numbers alone are not keywords

In [None]:
def tokenize_para (para):
    '''Parse paragraph data (abstracts, full texts) while keeping the sentence structure.

    Keeping sentences separate during tokenization is useful e.g. for bi-gram training.

    Parameters
    ----------
    para : str
        The raw paragraph.

    Returns
    -------
    dict
        'sent': list of normalized sentences, each one a string;
        'word': the same sentences, each one as a list of words.
    '''
    sentences = sent_tokenizer.tokenize(para) # string -> list of sentences
    sent_word = [word_tokenize(sentence) for sentence in sentences] # list of lists of words
    # `remove_terms` is a string, so `in` is a substring test: single punctuation marks
    # (and any token that happens to be a substring of it) are dropped here.
    sent_filt1 = [' '.join(word for word in words if word not in remove_terms)
                  for words in sent_word] # list of full sentences (each sentence is a string)
    norm_sent_filt1 = normalize_corpus(sent_filt1) # same as above, but after normalization
    # NOTE(review): a filter keeping only sentences longer than 3 words used to be computed
    # here but its result was never used; it has been dropped. Confirm it was not meant to apply.
    texts = [doc.split() for doc in norm_sent_filt1]
    para_styles = {'sent': norm_sent_filt1,   # a list of full sentences (each sentence is a string)
                  'word': texts}  # a list of list of words
    return para_styles

In [None]:
# Sentence-tokenize the first 1000 abstracts for the tfidf model (used to find more stop_words)
abs_sents = [];
for i in range(1000):
    try:
        abs_sents.append(tokenize_para(abs_list[i])['sent'])
    except Exception:
        # Best effort: an abstract that cannot be tokenized (presumably a missing/NaN value)
        # gets a placeholder so positions still line up with the rows.
        # `except Exception` keeps Ctrl-C / kernel interrupts working, unlike a bare except.
        abs_sents.append(['nan'])
len(abs_sents)

In [None]:
# Flatten the sentences of the 1000 abstracts into one list and
# record how many sentences each abstract contributed
abs_all_sent1 = [sentence for abstract in abs_sents for sentence in abstract]
abs_n_sent = [len(abstract) for abstract in abs_sents]
print(len(abs_all_sent1), len(abs_n_sent))

In [None]:
bodylist = df1['body'].tolist()

In [None]:
# Parse out the introduction: the lower-cased text that follows the first occurrence of
# 'introduction' (up to the next occurrence, if any), cut at the first 'observations'.
# Bodies that are not strings (e.g. NaN) or contain no 'introduction' get the string
# placeholder 'nan', so every entry is a string and can be concatenated / normalized later.
intro_list = []
for body in bodylist:
    # Reset per row: previously a failed parse silently reused split2 from an earlier row.
    intro = 'nan'
    if isinstance(body, str):
        split1 = body.lower().split('introduction')
        if len(split1) > 1:
            split2 = split1[1].split('observations', 1)
            intro = split2[0]
    intro_list.append(intro)
len(intro_list)

In [None]:
# Parse out the conclusion: the lower-cased text that follows the first occurrence of
# 'conclusions', cut at the first end marker found, tried in this order:
# 'thank', 'acknowledge', 'references'. Without any marker the whole remainder is kept.
# Bodies that are not strings (e.g. NaN) or contain no 'conclusions' get the string 'nan'.
#
# The earlier version tested `split[1]` (an undefined name); the resulting NameError was
# swallowed by a bare except, so every row received a stale value left over from another cell.
conc = []
for body in bodylist:
    conclusion = 'nan'
    if isinstance(body, str):
        sections = body.lower().split('conclusions')
        if len(sections) > 1:
            conclusion = sections[1]
            for marker in ('thank', 'acknowledge', 'references'):
                if marker in conclusion:
                    conclusion = conclusion.split(marker)[0]
                    break
    conc.append(conclusion)
len(conc)

In [None]:
# Concatenate, paper by paper, the parsed introduction and conclusion
intro_conc = [intro_list[row] + conc[row] for row in range(len(bodylist))]
len(intro_conc)

In [None]:
norm_body_all = normalize_corpus(intro_conc, lemm = True, stopword_removal = True)
#body2word = [[text for text in doc.split()] for doc in norm_body_all]
len(norm_body_all)

In [None]:
type(norm_body_all[16407])

In [None]:
len(norm_body_all)

In [None]:
df_key_v1['norm_body'] = norm_body_all

In [None]:
df_key_v1.columns
# 'norm_body_all' is cleaned-up introduction+ conclusion from maintext, as one string

In [None]:
norm_body_5 = [];
for i in range(5):
    norm_body_5.append(norm_body_all[i])
norm_body_5

In [None]:
# TFIDF model to get the important words of each document
tvec3 = TfidfVectorizer(min_df=0.005, max_df=0.2)

X3 = tvec3.fit_transform(norm_body_5)

# The vocabulary is the same for every row, so fetch it once instead of once per document.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(tvec3, 'get_feature_names_out'):
    terms3 = tvec3.get_feature_names_out()
else:
    terms3 = tvec3.get_feature_names()

# One dict per document: the 40 highest-weighted terms, in DataFrame.to_dict() layout
# ({'term': {row: term}, 'frequency': {row: tf-idf weight}}).
abs_dict_list = [];
for i in range(X3.shape[0]):
    weights3_0 = np.squeeze(X3[i].toarray())
    df3 = pd.DataFrame({'term': terms3, 'frequency': weights3_0})
    tfidf_dict = df3.sort_values(by='frequency', ascending=False).head(40).to_dict()
    abs_dict_list.append(tfidf_dict)

In [None]:
df4plot = pd.DataFrame.from_dict(abs_dict_list[0])
df4p1 = df4plot.sort_values(by='frequency', ascending=False).head(25)
plt.xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
plt.barh(df4p1.term, df4p1.frequency, align='center', color='#3F5D7D')

### - Iterate through preprocessing and tfidf to construct two vocabularies of stop-words
### - Iterate the above for more text (publication 1000-2000, 2000-3000, etc)
### - Partition the high-frequency words into the two vocabularies of stop-words

In [None]:
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
sentences = Text8Corpus(datapath('testcorpus.txt'))

In [None]:
phrases = Phrases(sentences, min_count=1, threshold=1)
print(phrases[title2word[0]])
# random file 'sentences' do not help!

In [None]:
total_key_norm = df_key_v1['key_set_1']

In [None]:
len(total_key_norm)

In [None]:
import ast
# Turn every key_set_1 entry into a Python list.
# Entries are string reprs when df_key_v1 was re-loaded from CSV, but already lists when
# the column was assigned earlier in this session; ast.literal_eval rejects non-strings,
# so lists are passed through unchanged.
key_norm = [
    entry if isinstance(entry, list) else ast.literal_eval(entry)
    for entry in total_key_norm
]

In [None]:
len(key_norm)

In [None]:
# Every distinct keyword across all papers, in first-seen order.
# dict.fromkeys de-duplicates in a single pass; testing `x not in key_norm_all` against
# the growing list for every keyword was quadratic.
key_norm_all = list(dict.fromkeys(key for paper_keys in key_norm for key in paper_keys))
len(key_norm_all)

In [None]:
word4bigram = [word_tokenize(key) for key in key_norm_all]

In [None]:
# tokenize and clean all sentences, to prepare for building bigram model on entire list of maintext
## run on google colab
body_sents_words = [];
for i in range(len(norm_body_all)):
    try:
        body_sents_words.append(tokenize_para(norm_body_all[i])['word'])
    except Exception:
        # Best effort: a text that cannot be tokenized gets a one-sentence placeholder.
        # `except Exception` keeps Ctrl-C / kernel interrupts working, unlike a bare except.
        body_sents_words.append([['nan']])
len(body_sents_words)

In [None]:
body_sents_words[0]

In [None]:
# combine all list of list of words for full texts
# (single-pass flatten; `a = a + b` in a loop re-copied the whole list every iteration)
all_full_texts = [sentence for doc_sentences in body_sents_words for sentence in doc_sentences]
len(all_full_texts)

In [None]:
all_full_texts = [];    
for i in range(1000):
    all_full_texts = all_full_texts + body_sents_words[i]
len(all_full_texts)

In [None]:
# tokenize and clean all sentences, to prepare for building bigram model on entire list of abstracts
## takes too long to run on local computer
## run on google colab
sents = [];
for i in range(len(abs_list)):
    try:
        sents.append(tokenize_para(abs_list[i])['word'])
    except Exception:
        # Best effort: an abstract that cannot be tokenized (presumably a missing/NaN value)
        # gets a one-sentence placeholder so positions still line up with the rows.
        # `except Exception` keeps Ctrl-C / kernel interrupts working, unlike a bare except.
        sents.append([['nan']])
len(sents)

In [None]:
# combine all list of list of words for abstracts
# (single-pass flatten; `a = a + b` in a loop re-copied the whole list every iteration)
all_abstracts = [sentence for abstract_sentences in sents for sentence in abstract_sentences]
len(all_abstracts)

In [None]:
df_key_v1['abs_words'] = sents
#'abs_words' is tokenized, cleaned abstract , a list of list of words

In [None]:
sents = [];
for i in range(1000,2000):
    try:
        sents.append(tokenize_para(abs_list[i])['word'])       
    except:
        sents.append([['nan']]) 
len(sents)

In [None]:
# combine all list of list of words for abstracts
all_abstracts = [];    
for i in range(len(sents)):
    all_abstracts = all_abstracts + sents[i]
len(all_abstracts)

In [None]:
# train the bigram models on all titles, keywords, keyword_norms, all abstracts, 1000 full papers
# NOTE(review): presumably this is also the corpus the trigram cell calls n_gram_vocab -- confirm.
n_gram_vocab = title2word+word4bigram+all_abstracts+all_full_texts
# bigram2: very permissive threshold
bigram2 = gensim.models.phrases.Phrases(n_gram_vocab, min_count=1, threshold=0.005)
# bigram1: threshold=1. It is trained here because the cells right below already use it,
# while the cell that trains it sits further down -- on a fresh kernel those cells raised
# NameError. That later cell simply re-trains the same model.
bigram1 = gensim.models.phrases.Phrases(n_gram_vocab, min_count=1, threshold=1)
for i in range(100):
    print(bigram2[all_abstracts[i]])

In [None]:
for i in range(100):
    print(bigram1[title2word[i]])

In [None]:
for i in range(100):
    print(bigram2[title2word[i]])

In [None]:
# Apply the bigram1 phrase model to every tokenized title
title_key_19 = [bigram1[title2word[row]] for row in range(len(title2word))]

In [None]:
# train a bigram model on all titles, keywords, keyword_norms, all abstracts, 1000 full papers
bigram1 = gensim.models.phrases.Phrases(title2word+word4bigram+all_abstracts+all_full_texts, min_count=1, threshold=1) 
for i in range(10):
    print(bigram1[all_abstracts[i]])

In [None]:
# train a trigram model on top of the bigram-transformed corpus
# NOTE(review): n_gram_vocab was not defined anywhere in this notebook; it is assumed to be
# the corpus bigram1 was trained on (titles + keywords + abstracts + full texts) -- confirm.
n_gram_vocab = title2word+word4bigram+all_abstracts+all_full_texts
trigram1 = Phrases(bigram1[n_gram_vocab], min_count=1, threshold=1)

In [None]:
for i in range(100):
    trigrams_ = [t for t in trigram1[bigram1[all_abstracts[i]]]if t.count('_')==2]
    print(trigrams_)

- #### Training with the ensemble of all titles helps: the model was able to pick up zeta_ophiuchi and galactic_halo
- #### Moving forward, use the bigram1 model to fit bigrams on everything
- #### Fit batch-wise
- #### The trigram model did not generate more useful keywords

In [None]:
# abs_list to extract keywords: replace missing abstracts with the placeholder 'nan',
# then normalize every abstract.
# pd.isna() recognises any NaN/None value; `x is np.nan` only matches the np.nan singleton
# object and misses other NaN floats.
abs_list = ['nan' if pd.isna(x) else x for x in abs_list]
norm_abs_all = normalize_corpus(abs_list, lemm = True, stopword_removal = True)
len(norm_abs_all)

In [None]:
df_key_v1['norm_abs'] = norm_abs_all

In [None]:
# Apply the bigram1 model to every sentence of every abstract in `sents`
abs_2gram = [
    [bigram1[sents[doc][pos]] for pos in range(len(sents[doc]))]
    for doc in range(len(sents))
]
len(abs_2gram)

In [None]:
### Merge two adjacent words into one 'a_b' token whenever the gensim bigram model
### produced that token for the sentence. The sentences in `sents` are edited in place.
matchingwords = [];
for i in range(len(sents)):
    for j in range(len(sents[i])):
        words = sents[i][j]
        phrased = abs_2gram[i][j]
        # Walk right-to-left, starting at the last position that still has a right
        # neighbour, so deleting position k+1 never shifts words that are yet to be visited.
        for k in reversed(range(len(words) - 1)):
            mystring = words[k]+'_'+words[k+1]
            if mystring in phrased:
                words[k] = mystring
                # Delete by position: list.remove(value) drops the FIRST equal word, which
                # is the wrong one whenever the same word also occurs earlier in the sentence.
                del words[k+1]
    matchingwords.append(sents[i])

In [None]:
len(matchingwords)

In [None]:
' '.join(matchingwords[1][0])

In [None]:
# abs_list: for every abstract, its sentences as individual strings after bi-gram replacement
abs_list = [
    [' '.join(sentence_words) for sentence_words in matchingwords[doc]]
    for doc in range(len(sents))
]
print(len(abs_list))

In [None]:
# Join the sentences of each abstract back into one string.
# They are separated by a space: plain concatenation fused the last word of a sentence
# with the first word of the next one, creating bogus tokens for the tf-idf model.
abs_para = [' '.join(sentences) for sentences in abs_list]
len(abs_para)

In [None]:
matchingwords[0][0]

In [None]:
df_key_v1['abs_bi_para'] = abs_para
#'abs_bi_para'is abstract as one string after bi-gram replacement

In [None]:
#now do tfidf on abs_para
tvec3 = TfidfVectorizer(min_df=0.005, max_df=0.2)
X19 = tvec3.fit_transform(abs_para)

# The vocabulary is the same for every row, so fetch it once instead of once per document.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(tvec3, 'get_feature_names_out'):
    terms19 = tvec3.get_feature_names_out()
else:
    terms19 = tvec3.get_feature_names()

# One dict per abstract: the 100 highest-weighted terms, in DataFrame.to_dict() layout
# ({'term': {row: term}, 'frequency': {row: tf-idf weight}}).
abs_dict_list = [];
for i in range(X19.shape[0]):
    weights19_0 = np.squeeze(X19[i].toarray())
    df19 = pd.DataFrame({'term': terms19, 'frequency': weights19_0})
    tfidf_dict = df19.sort_values(by='frequency', ascending=False).head(100).to_dict()
    abs_dict_list.append(tfidf_dict)

In [None]:
X19.shape

In [None]:
abs_dict_list[1]

In [None]:
df4plot = pd.DataFrame.from_dict(abs_dict_list[1])
df4p1 = df4plot.sort_values(by='frequency', ascending=False).head(26)
plt.figure(figsize=(12, 9))
plt.xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
plt.barh(df4p1.term, df4p1.frequency, align='center', color='#3F5D7D')

In [None]:
##Post-n-gram stopwords removal: add the terms of post2_stop.txt (one per line) to stop_words
places = []
with open('post2_stop.txt', 'r') as filehandle:
    for line in filehandle:
        # Strip only the line terminator. Slicing off the last character (line[:-1])
        # would truncate the final term when the file does not end with a newline.
        currentPlace = line.rstrip('\n')
        places.append(currentPlace)

stop_words.extend(x for x in places if x not in stop_words)
len(stop_words)

In [None]:
# Post-phrase-detection clean-up. From every sentence of `matchingwords`, drop tokens that
#   * are stop words,
#   * are shorter than 3 characters, or
#   * consist only of punctuation marks and digits.
# The cleaned sentences are also re-joined into strings, grouped per abstract, in abs_list_2.
from string import punctuation
remove_terms2 = punctuation + '0123456789'
stop_word_set = set(stop_words)  # O(1) lookups instead of scanning the list per token

abs_list_2 = [];
for abstract in matchingwords:
    abs_sent_list = [];
    for sentence_words in abstract:
        # Filter in one pass and slice-assign, so the lists inside matchingwords are still
        # updated in place. The previous version called list.remove(value) while walking the
        # list by index, which only worked because every occurrence of a value was removed.
        sentence_words[:] = [
            token for token in sentence_words
            if token not in stop_word_set
            and len(token) >= 3
            and not all(char in remove_terms2 for char in token)
        ]
        abs_sent_list.append(' '.join(sentence_words))
    abs_list_2.append(abs_sent_list)
print(len(abs_list_2))

In [None]:
#abs_para_2 is a list of all abstracts, each as a string, after phrase extraction and more stopwords removal
# Sentences are joined with a space: plain concatenation fused the last word of a sentence
# with the first word of the next one, creating bogus tokens for the tf-idf model.
abs_para_2 = [' '.join(sentences) for sentences in abs_list_2]
len(abs_para_2)

In [None]:
#now do tfidf on abs_para after post-n-gram stopwords removal
tvec3 = TfidfVectorizer(min_df=0.0005, max_df=0.2)
X3 = tvec3.fit_transform(abs_para_2)

# The vocabulary is the same for every row, so fetch it once instead of once per document.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.
if hasattr(tvec3, 'get_feature_names_out'):
    terms3 = tvec3.get_feature_names_out()
else:
    terms3 = tvec3.get_feature_names()

# One dict per abstract: the 100 highest-weighted terms, in DataFrame.to_dict() layout
# ({'term': {row: term}, 'frequency': {row: tf-idf weight}}).
abs_dict_list2 = [];
for i in range(X3.shape[0]):
    weights19_0 = np.squeeze(X3[i].toarray())
    df3 = pd.DataFrame({'term': terms3, 'frequency': weights19_0})
    tfidf_dict = df3.sort_values(by='frequency', ascending=False).head(100).to_dict()
    abs_dict_list2.append(tfidf_dict)

In [None]:
abs_dict_list2[2]

In [None]:
df4plot = pd.DataFrame.from_dict(abs_dict_list2[2])

In [None]:
df4p1 = df4plot.sort_values(by='frequency', ascending=False).head(18)
plt.figure(figsize=(12, 9))
plt.xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
plt.barh(df4p1.term, df4p1.frequency, align='center', color='#3F5D7D')

In [None]:
df_key_v1['abs_tfidf'] = abs_dict_list2
#'abs_tfidf' is output of tfidf model on abstracts , after bi-gram replacement

In [None]:
df_key_v1.columns

In [None]:
# Keyword candidates per abstract: the top tf-idf terms of the post-stop-word-removal model.
# Iterate abs_dict_list2 itself; the loop used to take its length from abs_dict_list
# (the earlier model's output) while indexing abs_dict_list2.
key_4_abs = [list(entry['term'].values()) for entry in abs_dict_list2]

In [None]:
len(key_4_abs)

In [None]:
# Start each paper's keyword list from key_set_1.
# Entries are string reprs when df_key_v1 was re-loaded from CSV, but already lists when the
# column was assigned earlier in this session; ast.literal_eval rejects non-strings.
# Lists are copied so the in-place extensions below do not modify the DataFrame column.
title_key_old = [list(x) if isinstance(x, list) else ast.literal_eval(x) for x in df_key_v1['key_set_1']]
len(title_key_old)

In [None]:
for i in range(10):
    print(len(title_key_old[i]))

In [None]:
# Append each paper's title bigram tokens to its keyword list, skipping tokens already present
for row in range(len(title_key_old)):
    paper_keys = title_key_old[row]
    for token in title_key_19[row]:
        if token not in paper_keys:
            paper_keys.append(token)
for row in range(10):
    print(len(title_key_old[row]))

In [None]:
# Append each paper's abstract tf-idf terms to its keyword list, skipping terms already present
for row in range(len(key_4_abs)):
    paper_keys = title_key_old[row]
    for term in key_4_abs[row]:
        if term not in paper_keys:
            paper_keys.append(term)
for row in range(10):
    print(len(title_key_old[row]))

In [None]:
# Turn the '_' that joins bigram tokens back into a space, in place
for paper_keys in title_key_old:
    for pos, key in enumerate(paper_keys):
        # Only strings are rewritten; anything else is left untouched. This replaces a
        # bare `except` that hid every error, not just the non-string case.
        if isinstance(key, str):
            paper_keys[pos] = key.replace("_", " ")

len(title_key_old)

In [None]:
# Drop stop words from every paper's keyword list.
# One filtering pass per paper is enough; the previous inner loop repeated the identical
# filter once for every element of the list.
stop_word_set = set(stop_words)  # O(1) lookups instead of scanning the list per keyword
for i in range(len(title_key_old)):
    title_key_old[i] = [x for x in title_key_old[i] if x not in stop_word_set]

In [None]:
len(stop_words)

In [None]:
for i in range(10):
    print(len(title_key_old[i]))

In [None]:
title_key_old[4]

In [None]:
df_key_v1['key_0629'] = title_key_old
df_key_v1.columns

In [None]:
%store title_key_old

In [None]:
# NOTE(review): this cell referenced `df2`, which is never defined in this notebook and
# raised NameError on a fresh run. df_key_v1 (the frame saved below) is assumed -- confirm.
df_key_v1.columns

In [None]:
from pathlib import Path

# Save the final keyword list per paper (bibcode + key_0629).
# The Downloads folder is resolved from the current user's home directory instead of
# hard-coding one user's absolute path.
df_key_0629 = df_key_v1[['bibcode','key_0629']]
df_key_0629.to_csv(Path.home() / 'Downloads' / 'df_key_0629.csv', index=False)