In [None]:
import pandas as pd
import pickle
import re
import numpy as np

from nltk import bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer

In [None]:
# Enter your own path to the corpus.
df = pd.read_csv('../Innovation/Innovation-Scopus-has_abstract.tab', delimiter='\t')
# Drop the rows labelled 0 and 1 (the first two rows of the freshly read file).
# NOTE(review): presumably these are non-data header rows of the export -- confirm.
df = df.drop([0, 1])
# Keep only the columns used downstream and remove instances with empty abstracts.
# .copy() makes new_df an independent frame, so adding columns to it later
# (new_df['tokens'] = ...) cannot trigger pandas' SettingWithCopyWarning.
new_df = df[['Title', 'Date', 'Abstract', 'CitedBy']].dropna(subset=['Abstract']).copy()
# Use a context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by open().
with open('topic-corpus.pkl', 'wb') as corpus_file:
    pickle.dump(new_df, corpus_file)

In [None]:
# NLTK's English stop words, as a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))
# A token is a run of word characters and hyphens delimited by word boundaries.
regex = "(?u)\\b[\\w-]+\\b"

# The tokenizers do not depend on the text being processed, so build them once
# here instead of once per abstract inside tokenize().
_word_tokenizer = RegexpTokenizer(regex)
# Re-join abbreviations that the regex splits into single letters
# (e.g. 'u', 's', 'a' -> 'usa'); separator='' concatenates the parts.
_mwe_tokenizer = MWETokenizer([('u', 's', 'a'), ('u', 's'), ('b', 'b')], separator='')
_leading_digit = re.compile("[0-9]")

def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Tokens in document order, after merging the configured multi-word
        expressions and dropping English stop words and any token that
        starts with a digit.
    """
    words = _word_tokenizer.tokenize(text.lower())
    merged = _mwe_tokenizer.tokenize(words)
    # .match() anchors at the start, so only tokens *beginning* with a digit are dropped.
    return [t for t in merged if t not in stop_words and not _leading_digit.match(t)]

# Tokenize every abstract; keep the result both as `tokens` and as a DataFrame column.
tokens = new_df['Abstract'].apply(tokenize)
new_df['tokens'] = tokens

In [None]:
def lemmatize(text):
    """Return the WordNet lemma of every token in ``text``, preserving order.

    ``text`` is an iterable of token strings; the result is a new list.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in text:
        lemmas.append(to_lemma(token))
    return lemmas

# Lemmatize each row's token list, then expose the column as a plain list of documents.
new_df['tokens'] = new_df['tokens'].apply(lemmatize)
tokens = list(new_df['tokens'])

In [None]:
# Corpus-specific stop words, removed in addition to NLTK's English list.
# Stored under its own name: rebinding `stopwords` would shadow the imported
# `nltk.corpus.stopwords` object and break re-running the tokenization cell.
custom_stopwords = ["finding", "research", "purpose", "study", "methodology", "result",
                    "analysis", "method", "paper", "literature", "tourism", "tourist", "innovation",
                    "also", "within", "whereas", "would"]

def remove_stopwords(text):
    """Remove the custom stop words from every document.

    Parameters
    ----------
    text : iterable of iterables of str
        A collection of documents, each a sequence of tokens.

    Returns
    -------
    list of list of str
        One new token list per document, in the original order; the input
        documents are not modified.
    """
    # A set gives constant-time lookups inside the nested loop.
    blocked = set(custom_stopwords)
    return [[token for token in doc if token not in blocked] for doc in text]

# remove_stopwords() expects a list of documents, while each row of the column
# holds a single document.  Applying it row by row makes it treat every token
# as a "document" and split it into characters, so hand it the whole column at
# once and re-attach the result to the original index.
new_df['tokens'] = pd.Series(remove_stopwords(new_df['tokens'].tolist()), index=new_df.index)
tokens = remove_stopwords(tokens)

In [None]:
#bigram_stopwords = ["result show", "descriptive statistical", "analysis cluster", "per cent",
#                    "goal paper", "presented paper", "paper present", "study examines", "study investigates"]

# Collect adjacent-token pairs over all documents and rank them by PMI.
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_documents(tokens)
# Discard pairs that occur fewer than 10 times before scoring.
finder.apply_freq_filter(10)
# Keep the 100 best-scoring pairs, each stored as a single "first second" string.
lexicon = [" ".join(pair) for pair in finder.nbest(bigram_measures.pmi, 100)]

def bgrams(text, phrases=None):
    """Append recognised bigram phrases to each document, in place.

    Parameters
    ----------
    text : list of list of str
        Documents as token lists.  Each list is extended in place.
    phrases : iterable of str, optional
        Accepted bigrams written as ``"first second"``.  Defaults to the
        module-level ``lexicon``.

    Returns
    -------
    list of list of str
        The same ``text`` object, with every adjacent token pair that is an
        accepted phrase appended to the end of its document (once per
        occurrence, in order of appearance).
    """
    # Set membership keeps the per-bigram check O(1) instead of scanning a list.
    known = set(lexicon if phrases is None else phrases)
    for doc in text:
        # Build the additions fully before extending, so the new phrase tokens
        # never take part in the pairing themselves.
        candidates = [first + " " + second for first, second in zip(doc, doc[1:])]
        doc.extend([phrase for phrase in candidates if phrase in known])
    return text

# bgrams() expects a list of documents and extends each one in place, so run it
# once over `tokens`.  Applied per DataFrame row it would receive one row's
# contents instead of the full list of documents.
tokens = bgrams(tokens)
# `tokens` is derived from new_df['tokens'] one entry per row, so it lines up
# with the frame's index; the assertion guards against stale kernel state.
assert len(tokens) == len(new_df), "tokens and new_df are out of sync"
new_df['tokens'] = pd.Series(tokens, index=new_df.index)
# Close the pickle file deterministically instead of leaking the handle.
with open('tokens.pkl', 'wb') as tokens_file:
    pickle.dump(tokens, tokens_file)

### Word Frequencies

Find the 5 most frequent words in the corpus. Change the argument passed to `fdist.most_common` to adjust the number of words displayed.

In [None]:
from nltk.probability import FreqDist

# Flatten the per-document token lists into one corpus-wide list.
flat_tokens = [token for document in tokens for token in document]

# Count every token and display the five most frequent ones.
fdist = FreqDist(flat_tokens)
fdist.most_common(5)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the (non-cumulative) counts of the 30 most frequent tokens.
fdist.plot(30,cumulative=False)
plt.show()

### Context of words

Find the contexts (the preceding and following word) in which all the words from the list frequently occur.

In [None]:
from nltk import Text

# Wrap the flattened corpus in an NLTK Text object to query word contexts.
text = Text(flat_tokens)
# NOTE(review): 'methodology' is in the custom stop-word list defined earlier in
# this notebook, and `tokens` is filtered with that list before `flat_tokens`
# is built, so this word is presumably absent from `text` -- confirm the
# intended query word.
text.common_contexts(['methodology'])

Find pairs of words that commonly co-occur (collocations).

In [None]:
# Print the collocations NLTK finds in the flattened corpus.
text.collocations()