<a href="https://colab.research.google.com/github/varunhari2020/CourseProject/blob/main/CausalTopicModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
!pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 5.7MB/s 
Collecting funcy
  Downloading https://files.pythonhosted.org/packages/66/89/479de0afbbfb98d1c4b887936808764627300208bb771fcd823403645a36/funcy-1.15-py2.py3-none-any.whl
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97712 sha256=ec1fe1fecb490141c5e93e9d7311390f615354f6dea6cc9402ee8e8f67b2187d
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.15 pyLDAvis-2.1.2


In [4]:
import re
import numpy as np
import numpy.linalg as la
import pandas as pd
from pprint import pprint
import datetime

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import ngrams

# Gensim
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases # TODO: to create bigrams with

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt 

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
stop_words = stopwords.words('english')
stop_words.extend(['mr', 'ms', 'said'])

def tokenize(content, lemmatize=False):
    words = gensim.utils.simple_preprocess(content, deacc=True)  # tokenizes
    return words

def remove_stopwords(content):
    words = []
    for word in content:
        if word in stop_words:
            continue
        words.append(word)
    return words

rows = []
dates = []
articles = []
for month in range(5, 11):
    with open("/content/drive/Data/NYTimes/"+ str(month) + ".txt") as f:
        for i, line in enumerate(f):
            date, article = line.split(",", 1)
            timestamp = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")
            tokenized = tokenize(article)
            destopped = remove_stopwords(tokenized)

            articles.append(destopped)
            dates.append(timestamp)
            rows.append([timestamp, destopped])

df = pd.DataFrame(rows, columns=["date", "content"]) 
unique_dates = sorted(list(set(df["date"])))
print (unique_dates)
df

FileNotFoundError: ignored

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(articles)

# Attempt at filtering out words that appear too frequently
# id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# id2word.filter_extremes(no_above=0.5)


# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in articles]


tfidf = models.TfidfModel(corpus)
corpus = tfidf[corpus]


# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ago', 0.07712049873418031),
  ('awesome', 0.23220574510227418),
  ('backup', 0.2198823985449398),
  ('backups', 0.2515408271170864),
  ('bases', 0.19264069440348208),
  ('bellinger', 0.27548950382241366),
  ('bench', 0.1896343958919212),
  ('bush', 0.007894722475376273),
  ('came', 0.08612993720379283),
  ('catcher', 0.26148042600790294),
  ('clay', 0.2135830725972484),
  ('games', 0.1562902360625982),
  ('girardi', 0.27548950382241366),
  ('homer', 0.21658937110880933),
  ('jim', 0.1245222966630543),
  ('joe', 0.1146922085996351),
  ('leyritz', 0.27548950382241366),
  ('speed', 0.17969479700110466),
  ('stole', 0.20587332073042908),
  ('strength', 0.13402729061444735),
  ('turner', 0.2108175460504276),
  ('two', 0.04788545375938528),
  ('versatility', 0.27548950382241366),
  ('whose', 0.0887458288821732),
  ('yankee', 0.20825706839694694),
  ('yankees', 0.19264069440348208),
  ('years', 0.05159983565074285)]]

In [None]:
k = 10

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=k, 
                                           minimum_phi_value=0.5, # min threshold for word probabilities
                                           passes=5,
                                           alpha='auto',  # assuming that topic distribution is assymetric. Not all topics equally represented in corpus.
                                           update_every=1,
                                           per_word_topics=True)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=articles, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.335340102501334

Coherence Score:  0.498118082888875


In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
for l in lda_model.get_document_topics(corpus)[:10]:
    print (l)

[(1, 0.6156847), (2, 0.01631999), (4, 0.20373307), (6, 0.06801649), (7, 0.028621836), (8, 0.0344656)]
[(1, 0.763563), (2, 0.013859917), (6, 0.033660006), (7, 0.088353015), (9, 0.068763435)]
[(1, 0.79859394), (2, 0.013141152), (6, 0.08126964), (7, 0.037291188), (9, 0.049772587)]
[(1, 0.87178653), (6, 0.029328), (7, 0.067593485)]
[(1, 0.8022314), (2, 0.010433715), (6, 0.14003201), (7, 0.01831812)]
[(1, 0.77750266), (2, 0.101369575), (6, 0.057995867), (7, 0.024414107)]
[(1, 0.84217304), (6, 0.027844958), (7, 0.06244437), (9, 0.039514713)]
[(1, 0.7187414), (2, 0.011190535), (6, 0.03845834), (7, 0.20053901)]
[(1, 0.82175153), (2, 0.058945935), (6, 0.064610116), (7, 0.022253001)]
[(0, 0.0109442), (1, 0.730404), (2, 0.018160004), (3, 0.01012647), (6, 0.169174), (7, 0.031808387)]


In [None]:
# for any given day, you look at all the diff topics and identify the prob of that topic
date_topic_prob = np.zeros((len(unique_dates), k))
for date, article in zip(df["date"], lda_model.get_document_topics(corpus)):
  i = unique_dates.index(date)
  for topic, prob in article:
    date_topic_prob[i][topic] += prob 
date_topic_prob = date_topic_prob/date_topic_prob.max(axis=0)

In [None]:
date_topic_prob

array([[0.00960165, 0.12697915, 0.01179038, ..., 0.20422308, 0.21812294,
        0.0933737 ],
       [0.01475436, 0.09657251, 0.02128822, ..., 0.07538966, 0.        ,
        0.26194383],
       [0.40982177, 0.32274864, 0.11762854, ..., 0.52242613, 0.10376247,
        0.        ],
       ...,
       [0.10482325, 0.33340277, 0.35393836, ..., 0.28412118, 0.03938651,
        0.10970976],
       [0.40246888, 0.95823687, 1.        , ..., 0.71573004, 0.        ,
        0.489494  ],
       [0.10647875, 0.49076954, 0.55658635, ..., 0.4426987 , 0.        ,
        0.31024785]])

In [None]:
# Example of norming by col
# d = np.array([
#     [1000, 10, 0.5],
#     [765, 5, 0.35],
#     [800, 7, 0.09], ])
# d/d.max(axis=0)

array([[1.   , 1.   , 1.   ],
       [0.765, 0.5  , 0.7  ],
       [0.8  , 0.7  , 0.18 ]])

In [None]:
t0 = unique_dates[4]
t0

Timestamp('2000-05-05 05:00:00+0000', tz='UTC')