In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import  TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn import preprocessing


import nltk
from nltk.corpus import stopwords


import warnings
warnings.filterwarnings("ignore")


# Read in dataframe of text and filenames
df=pd.read_pickle('assets/df_clean2.pkl')

In [2]:
df.head()

Unnamed: 0,text,filename,year,split
0,proceed yorkshir geolog societi vol upper jura...,Cox et al 1987,1987,test
1,triassicpalynologyofcentralandnorthwesterneuro...,Kuerschner & Herngreen 2010,2010,train
2,journalofsedimentaryresearch currentrippl doi ...,Gani 2017,2017,train
3,ˆˆˆ˙ ˛kˆ c˝ˇ hh˛ hhˇ lk˘ hhˇ hhˇ˚ hhˇ d˜˛ hhˇ ...,Iakovleva Brinkhuis & Cavagnetto 2001,2001,train
4,field excurs novemb tertiari format austin hou...,Wilson 1962,1962,test


### Corpus specific stop words

In [17]:
specific = ['figs', 'fig', 'et', 'al', 'pl','appendix','figure','cm', 'ft', 'sp'\
            , 'pp', 'iv', 'etal', 'ed', 'eds', 'http', 'ma', 'th', 'tion', 'ing',\
           'cf', 'ii', 'www', 'tions', 'strati', 'km', 'com', 'bulletin', 'doi', \
            'org', 'society','springer', 'verlag', 'pa', 'spec', 'pub', 'assoc',\
            'publication','university', 'press', 'geologists', 'geological',\
            'association', 'ph', 'comm', 'pers', 'geol', 'surv', 'bull',\
            'journal', 'soc', 'sci', 'letters', 'lett', 'geophys', 'res',\
            'acad', 'mar', 'acad', 'palaeobotany', 'palaeoclimatology', \
            'palaeogeography','societies', 'bureau', 'economic', 'prof',\
            'palaeoecology','paper', 'file', 'report', 'open', 'london',\
            'america', 'elsevier','amsterdam', 'sepm', 'earthplanet',\
           'paleoclimatol', 'palaeoecol', 'np', 'sc', 'palaeogeogr', 'palaeoclimatol',\
            'american', 'geo', 'rev', 'journal', 'und', 'review', 'samples',\
            'collected', 'allrightsreserved', 'clim', 'elsevierb', 'cosmochim',\
            'sciencereviews', 'levelchanges', 'ne', 'sepmspec', 'publ', 'acta',\
           'internationalassociationofsedimentologists', 'palaeobot', 'polynol',\
           'sedi', 'ment', 'deposi', 'tional', 'odp']


stop_words = nltk.corpus.stopwords.words('english')

### Creating the sparse matrix 

In [3]:
vectorizer = TfidfVectorizer(ngram_range = (2,2),
                             #stop_words = specific + stop_words,
                             max_features = 200
                             )
sparse = vectorizer.fit_transform(df['text'])

In [4]:
sparse.shape

(1830, 200)

In [5]:
# densifying
df_vec = pd.DataFrame(sparse.todense(), 
                  columns=vectorizer.get_feature_names())
df_vec.head()

Unnamed: 0,alberta canada,american associ,associ geolog,associ petroleum,barrier island,base level,bed form,benthic foraminifer,benthic foraminifera,bound ari,...,trough cross,unit state,univers texa,upper cretac,upper wilcox,van wagon,volcan ash,water depth,wave domin,wilcox group
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.05526,0.068325,0.305531,0.0,0.0,0.0,0.0,0.0,0.136959


In [6]:
df_vec.to_pickle('assets/df_vec.pkl')

### Cosine Similarity - dot product of normalized vector

In [7]:
cos = cosine_similarity(df_vec)
cos

array([[1.        , 0.04248589, 0.        , ..., 0.        , 0.        ,
        0.01573449],
       [0.04248589, 1.        , 0.07232913, ..., 0.        , 0.        ,
        0.04880497],
       [0.        , 0.07232913, 1.        , ..., 0.        , 0.        ,
        0.00886777],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.00286516],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.04062185],
       [0.01573449, 0.04880497, 0.00886777, ..., 0.00286516, 0.04062185,
        1.        ]])

### LDA using Sklearn

In [14]:
num_topics = 8

lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=50,
                                verbose=1, evaluate_every=10,
                                learning_method='online',
                                learning_offset=60.,
                                learning_decay = .5,
                                random_state=42)

In [15]:
lda.fit_transform(sparse)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50, perplexity: 301.3808
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50, perplexity: 300.7195
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50, perplexity: 300.4452
iteration: 31 of max_iter: 50
iteration: 32

array([[0.03753319, 0.03747185, 0.03747173, ..., 0.0375102 , 0.73749488,
        0.03753212],
       [0.04465579, 0.04465606, 0.0446558 , ..., 0.04465896, 0.25636096,
        0.04469977],
       [0.03836167, 0.03835891, 0.03854356, ..., 0.0383957 , 0.03836012,
        0.03836795],
       ...,
       [0.03188351, 0.77628236, 0.03198317, ..., 0.03199058, 0.03195093,
        0.03210358],
       [0.05553317, 0.46349631, 0.05538076, ..., 0.05539675, 0.20403734,
        0.05538752],
       [0.0281114 , 0.02804955, 0.02805816, ..., 0.02833781, 0.80321044,
        0.02811703]])

In [16]:
print("Log Likelihood: ", lda.score(sparse)) #higher the better
print("Perplexity: ", lda.perplexity(sparse)) #Lower the better. Perplexity = exp(-1. * log-likelihood per word)

#perplexity might not be the best measure to evaluate topic models because it doesn’t consider the context and semantic associations between words.

Log Likelihood:  -25056.920617093936
Perplexity:  300.16708960714647


### GridSearch LDA using Sklearn

In [11]:
search_params = {
    'n_topics': [2,3,4,5], 
    'learning_decay': [.5,.6, .7, .8], 
    'learning_offset': [60.,70., 80.]
}

In [12]:
lda_grid = LatentDirichletAllocation(max_iter=10, learning_method='online',random_state=42)

grid_model = GridSearchCV(lda_grid, param_grid=search_params)
grid_model.fit(sparse)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_topics': [2, 3, 4, 5], 'learning_decay': [0.5, 0.6, 0.7, 0.8], 'learning_offset': [60.0, 70.0, 80.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
print(grid_model.best_score_)
print(grid_model.best_params_)

-7973.164979311252
{'learning_decay': 0.5, 'learning_offset': 60.0, 'n_topics': 2}
