# Final Project: Data Processing
## DS 5001
### Author: Taylor Tucker


In [175]:
import json
import pandas as pd
import nltk
import numpy as np

# SA
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# PCA
from scipy.linalg import eigh as eig

# LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# word2vec
from gensim.models import Word2Vec

# Data

This data was collected from [Constellate](https://constellate.org). Please see the `manifest.txt` file for more information.

In [176]:
# Index level names used below, ordered from the coarsest unit (text) to the finest (token).
OHCO = ["text_num", "paragraph_num", "sentence_num", "token_num"]

In [177]:
data_path = "../data/raw/sample/articles.jsonl"

# Open the JSONL file and save its lines to a list (one JSON-encoded article per line).
# The encoding is explicit so the read does not depend on the platform default.
with open(data_path, "r", encoding="utf-8") as json_file:
    json_list = list(json_file)

# Parallel lists: one entry per page of every article that is kept
texts = []
text_ids = []
page_ids = []
titles = []
years = []
months = []
days = []
authors = []

for text_num, json_str in enumerate(json_list):
    # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads
    if not json_str.strip():
        continue
    result = json.loads(json_str)

    # Keep only English articles that carry non-empty full text.
    # .get() skips records that lack a field rather than raising KeyError.
    pages = result.get("fullText")
    if not pages or result.get("language") != ["eng"]:
        continue

    # LIB data, identical for every page of the article
    title = result.get("title")
    author = ", ".join(result.get("creator") or [])  # create string of authors

    # Dates may be partial ("YYYY" or "YYYY-MM") or absent; pad the missing parts with None
    raw_date = result.get("datePublished")
    date_parts = raw_date.split("-") if raw_date else []
    year, month, day = (date_parts + [None, None, None])[:3]

    # For each page in the text
    for page_num, text_str in enumerate(pages):
        texts.append(text_str)
        page_ids.append(page_num + 1)   # 1 indexing
        text_ids.append(text_num + 1)   # 1 indexing

        titles.append(title)
        authors.append(author)
        years.append(year)
        months.append(month)
        days.append(day)


In [None]:
# Assemble the per-page lists into one frame: one row per page, with the
# article-level metadata repeated on every page of that article.
data = pd.DataFrame(
    {
        "text_num": text_ids,
        "title": titles,
        "author": authors,
        "pub_year": years,
        "pub_mon": months,
        "pub_day": days,
        "page_num": page_ids,
        "text_str": texts,
    }
)
data.head()

In [None]:
data.shape

### Create LIB table

Using the metadata from the source files, I create the `LIB` table and save it to `LIB.csv` in the `data/processed` directory.

In [None]:
# Drop the page-level columns and collapse the repeated per-page rows,
# leaving one metadata row per article.
page_level_cols = ["page_num", "text_str"]
LIB = data.drop(columns=page_level_cols).drop_duplicates().reset_index(drop=True)
LIB.head()

In [None]:
LIB.shape

In [None]:
# Order the library chronologically and renumber the articles 1..N in that order.
LIB = LIB.sort_values("pub_year").reset_index(drop=True)
n_texts = len(LIB["text_num"].unique())
LIB["text_num"] = list(range(1, n_texts + 1))
LIB = LIB.set_index("text_num")
LIB.head()

In [None]:
LIB.shape

In [184]:
# Persist the library (metadata) table; text_num is written as the index column.
LIB.to_csv("../data/processed/LIB.csv")

In [None]:
data.columns

In [None]:
# Swap the original text_num for the chronological one assigned in LIB, then
# order the pages and restore the original column layout.
# NOTE(review): the join key is the title, so two articles sharing a title would
# be cross-matched — confirm titles are unique in this corpus.
new_ids = LIB.reset_index()[["text_num", "title"]]
data_new = (
    data.merge(new_ids, on="title")
    .sort_values(["pub_year", "page_num"])
    .drop("text_num_x", axis=1)
    .rename(columns={"text_num_y": "text_num"})
    .reset_index(drop=True)
)
column_order = ["text_num", 'title', 'author', 'pub_year', 'pub_mon', 'pub_day', 'page_num', 'text_str']
data_new = data_new[column_order]
data_new

# I: Convert to F1

In order to convert the data to F1 format, I need to reduce the data to minimum discursive units (i.e. tokens).

In [None]:
# Keep only the columns the document table needs (ids and text);
# the bibliographic columns already live in LIB.
lib_only_cols = ["title", "author", "pub_year", "pub_mon", "pub_day"]
data = data_new.drop(columns=lib_only_cols)
data.head()

In [None]:
# Pages are a layout artifact rather than a level of discourse, so stitch each
# article's pages back into a single newline-separated string.
pages_by_article = data.groupby("text_num")["text_str"]
df_full_articles = pages_by_article.apply(lambda pages: "\n".join(pages)).to_frame()
df_full_articles

In [189]:
# DOC table: library metadata joined with each article's full text on text_num,
# with text_num turned back into a regular column before saving.
DOC = LIB.merge(df_full_articles, on="text_num").reset_index()
DOC.to_csv("../data/processed/DOC.csv")

In [None]:
DOC

In [None]:
# Paragraphs are delimited by runs of two or more newlines. expand=True spreads
# the pieces across columns; stack() folds them back into rows and adds the
# paragraph position as a second index level.
paragraph_parts = df_full_articles["text_str"].str.split(r"\n\n+", expand=True)
df_paragraphs = paragraph_parts.stack().to_frame().rename(columns={0: "paragraph_str"})
df_paragraphs.index.names = OHCO[:2]
df_paragraphs

In [None]:
# Clean up paragraphs

# Replace the newlines left inside a paragraph with spaces. The pattern is the
# real newline character matched literally: Series.str.replace defaults to
# regex=False in pandas >= 2.0, where the raw string r'\n' would look for a
# backslash followed by "n" and leave every newline in place.
df_paragraphs["paragraph_str"] = (
    df_paragraphs["paragraph_str"].str.replace("\n", " ", regex=False).str.strip()
)
df_paragraphs = df_paragraphs[~df_paragraphs["paragraph_str"].str.match(r'^\s*$')]   # Filter whitespace paragraphs
df_paragraphs.head()

In [None]:
# Split every paragraph into sentences with NLTK's sentence tokenizer.
# Each paragraph becomes a Series of sentences; stack() turns those into rows
# and adds the sentence position as the third index level.
sentence_columns = df_paragraphs["paragraph_str"].apply(
    lambda paragraph: pd.Series(nltk.sent_tokenize(paragraph))
)
df_sentences = sentence_columns.stack().to_frame().rename(columns={0: "sentence_str"})
df_sentences.index.names = OHCO[:3]
df_sentences.head()

In [None]:
# Build the token-level table by tokenizing every sentence with NLTK's word
# tokenizer. As with sentences, stack() adds the token position as the fourth
# and final index level.
token_columns = df_sentences["sentence_str"].apply(
    lambda sentence: pd.Series(nltk.word_tokenize(sentence))
)
df_tokens = token_columns.stack().to_frame().rename(columns={0: "token_str"})
df_tokens.index.names = OHCO[:4]
df_tokens.head()

In [195]:
# Work on a copy so the annotations added below do not alter df_tokens.
TOKEN = df_tokens.copy()

As we can see, `df_tokens` and thus `TOKEN` contains the data broken up by minimum discursive elements (tokens).

# II: Convert the collection into F2

To convert the data to F2, I will create the `TOKEN` table by adding some NLP elements. I will also be creating a `VOCAB` table. Thus, I'll have the `TOKEN`, `DOC`, `VOCAB`, and `LIB` tables. 

## II.i Create `TOKEN` Table by adding NLP elements

In [None]:
# Part-of-speech tag every token. nltk.pos_tag receives a one-element list and
# returns [(token, tag)], so element 0 is the (token, tag) pair.
# NOTE(review): each token is tagged on its own, so the tagger sees no sentence
# context — confirm this is intended.
TOKEN["pos_tuple"] = TOKEN["token_str"].map(lambda token: tuple(nltk.pos_tag([str(token)])[0]))
TOKEN["pos"] = TOKEN["pos_tuple"].map(lambda pair: pair[1])
TOKEN.head()

In [None]:
# Normalize tokens into terms: lowercase, then strip every non-word character
# and underscore. This must go through the .str accessor with regex=True;
# Series.replace(pattern, "") without regex=True only swaps cells whose entire
# value equals the pattern text, so it would strip nothing.
TOKEN["term_str"] = TOKEN["token_str"].str.lower().str.replace(r"[\W_]", "", regex=True)
TOKEN.head()

In [198]:
# Drop every term containing a character outside the 7-bit ASCII range
# (code point above 127), e.g. stray mathematical symbols.
has_non_ascii = TOKEN["term_str"].apply(lambda term: any(ord(ch) > 127 for ch in term))
TOKEN = TOKEN[~has_non_ascii]

In [199]:
# Filtering out weird hexcodes by filtering only for word characters.
# str.match anchors at the start of the string, so this keeps the terms that
# begin with at least one word character.
TOKEN = TOKEN[TOKEN['term_str'].str.match(r"\w+")]

In [200]:
# Drop any row that has a missing value in any column.
TOKEN = TOKEN.dropna()

In [201]:
# Remove terms that start with a digit (str.match anchors at the start of the string).
TOKEN = TOKEN[~TOKEN["term_str"].str.match(r"\d+")]

In [202]:
# Save the filtered token table; later cells write to this same file again.
TOKEN.to_csv("../data/processed/TOKEN.csv")

## II.ii Create `VOCAB` table using `TOKEN` table

In [None]:
# Create VOCAB table: one row per distinct term with its corpus frequency.
# The count column is named explicitly so the schema ("term_str", "count") does
# not depend on how the installed pandas labels the result of value_counts()
# (the Series name changed to "count" in pandas 2.0).
VOCAB = (
    TOKEN["term_str"]
    .value_counts()            # counting each instance of each term
    .sort_index()              # alphabetical, so term_id follows the sorted terms
    .rename_axis("term_str")
    .rename("count")
    .reset_index()
)
VOCAB.index.name = 'term_id'
VOCAB.head(10)

In [None]:
# Add a column to denote if a term is a number: 1 when the term starts with a
# digit (str.match anchors at the start), else 0. The pattern is a raw string so
# that "\d" is not parsed as an (invalid) string escape sequence.
VOCAB['num'] = VOCAB["term_str"].str.match(r"\d+").astype('int')
VOCAB.head()

In [None]:
# Lookup table of NLTK's English stopwords: indexed by term_str, with a
# constant `dummy` column of 1 marking membership.
stopword_index = pd.Index(nltk.corpus.stopwords.words('english'), name='term_str')
swords = pd.DataFrame({'dummy': 1}, index=stopword_index)
swords.head()

In [None]:
# Flag stopwords: 1 when the term appears in the stopword lookup, 0 otherwise.
is_stopword = VOCAB["term_str"].isin(swords.index)
VOCAB["stopword"] = is_stopword.astype("int")
VOCAB.head()

In [None]:
# TOKEN already carries a POS tag, but VOCAB gets one per term as well.
# nltk.pos_tag returns [(term, tag)]; keep only the tag.
VOCAB["pos"] = VOCAB["term_str"].map(lambda term: nltk.pos_tag([str(term)])[0][1])
VOCAB.head(10)

In [208]:
# Vocabulary restricted to the terms that are not flagged as stopwords.
VOCAB_NO_SWORDS = VOCAB[VOCAB["stopword"] == 0]

In [209]:
# Save the stopword-free vocabulary (note: the file is named VOCAB_STOP.csv).
VOCAB_NO_SWORDS.to_csv("../data/processed/VOCAB_STOP.csv")

In [210]:
# Save the full vocabulary, stopwords included.
VOCAB.to_csv("../data/processed/VOCAB.csv")

In [211]:
# VOCAB = VOCAB_NO_SWORDS

# III: Annotate and Convert to F3

Since the data is already annotated with POS and term strings, we will simply add lemmas and sentiment to this data to convert to F3 format. I will first add the lemmas using the `WordNetLemmatizer` from `nltk`, and then I will create sentiment scores on the document level and associate them with the tokens from that document. 

In [None]:
TOKEN.head()

In [None]:
# Add lemmas to TOKEN table
TOKEN["term_str"] = TOKEN["term_str"].apply(str)

# Build the lemmatizer once and reuse it for every term, instead of
# constructing a new WordNetLemmatizer per token.
lemmatizer = nltk.stem.WordNetLemmatizer()
TOKEN["lemma"] = TOKEN["term_str"].apply(lemmatizer.lemmatize)
TOKEN.sample(10)

In [None]:
# Using DOC to implement sentiment analysis: score each article's full text
# with VADER and keep the "compound" entry of the returned scores.
sa = SentimentIntensityAnalyzer()
DOC['sentiment'] = DOC["text_str"].apply(lambda x: sa.polarity_scores(x)["compound"])
DOC.head()

In [None]:
# Give every token the sentiment score of the article it belongs to.
# After the merge the table is indexed by text_num only; the other OHCO levels
# remain as ordinary columns.
doc_sentiment = DOC[["text_num", "sentiment"]]
TOKEN = (
    TOKEN.reset_index()
    .merge(doc_sentiment, how="inner", on="text_num")
    .set_index("text_num")
    .sort_index()
)
TOKEN.head()

In [216]:
# Overwrite TOKEN.csv with the lemma- and sentiment-annotated table.
TOKEN.to_csv("../data/processed/TOKEN.csv")

# IV: TFIDF and Convert to F4

In this section, I will be calculating the article-level TF-IDF and adding the relevant data to the `VOCAB` and `TOKEN` tables. 

In [None]:
# Filtering out weird hexcodes by filtering only for word characters.
# str.match anchors at the start, so this keeps the terms that begin with a word character.
VOCAB = VOCAB[VOCAB['term_str'].str.match(r"\w+")]
VOCAB.head(20)

In [218]:
# Add term_rank as the first column: 1 for the most frequent term, 2 for the
# next, and so on. Rows end up ordered by descending count, still indexed by term_id.
VOCAB = VOCAB.sort_values("count", ascending=False)
VOCAB.insert(0, "term_rank", np.arange(1, len(VOCAB) + 1))

In [None]:
VOCAB.head()

In [220]:
# Look up each token's term_id through a term_str -> term_id mapping built from VOCAB.
term_to_id = VOCAB.reset_index().set_index("term_str")["term_id"]
TOKEN["term_id"] = TOKEN["term_str"].map(term_to_id)

In [None]:
TOKEN.head()

In [222]:
# Overwrite TOKEN.csv again, now including the term_id column.
TOKEN.to_csv("../data/processed/TOKEN.csv")

In [None]:
# Bag of words at the article level: n = occurrences of each term in each
# article, c = 1 whenever the term occurs there at all.
term_counts = TOKEN.groupby(["text_num", "term_id"])["term_id"].count()
BOW = term_counts.to_frame().rename(columns={"term_id": "n"})
BOW["c"] = BOW["n"].astype("bool").astype("int")
BOW.head()

In [None]:
# Creating DTM: pivot the counts into an article x term matrix
# (rows = text_num, columns = term_id); absent pairs become 0.
DTCM = BOW["n"].unstack().fillna(0).astype("int")
DTCM.head()

In [225]:
# Calculating TF-IDF

# TF: each term's count as a share of all term occurrences in its article,
# so every row (article) sums to 1.
TF = (DTCM.T / DTCM.T.sum()).T

# DF: the number of articles in which each term occurs at least once.
# (Summing the masked counts would give the term's total frequency instead.)
df = (DTCM > 0).sum()
n = DTCM.shape[0]  # number of articles
IDF = np.log10(n / df)

# TF-IDF is the product of the two weights; IDF is aligned to the columns by term_id.
TFIDF = TF * IDF

In [226]:
# Save the article x term TF-IDF matrix.
TFIDF.to_csv("../data/processed/TFIDF.csv")

In [None]:
# Attach the per-article TF-IDF columns (TFIDF transposed to term x article) to
# VOCAB, joined on term_id, then add each term's TF-IDF summed over all articles.
VOCAB_TFIDF = pd.merge(VOCAB, TFIDF.T, on="term_id")
VOCAB_TFIDF["tfidf_sum"] = TFIDF.T.sum(axis=1)
VOCAB_TFIDF.head()

In [None]:
# Show the 25 terms with the highest summed TF-IDF, shaded by value.
summary_cols = ['term_rank', 'term_str', 'pos', 'tfidf_sum']
top_terms = VOCAB_TFIDF[summary_cols].sort_values('tfidf_sum', ascending=False).head(25)
top_terms.style.background_gradient("PuBuGn")

In [None]:
# Print the 20 highest-scoring terms as a LaTeX table.
print(VOCAB_TFIDF[['term_rank','term_str','pos','tfidf_sum']]\
    .sort_values('tfidf_sum', ascending=False).head(20).to_latex())

In [230]:
# Save the vocabulary together with its TF-IDF columns.
VOCAB_TFIDF.to_csv("../data/processed/VOCAB_TFIDF.csv")

In [None]:
# Attach each term's summed TF-IDF to its tokens (joined on term_id) and index by text_num.
TOKEN_TFIDF = pd.merge(TOKEN.reset_index(), VOCAB_TFIDF["tfidf_sum"], on="term_id").set_index("text_num")
TOKEN_TFIDF.head()

In [232]:
# Save the token table together with the summed TF-IDF of each token's term.
TOKEN_TFIDF.to_csv("../data/processed/TOKEN_TFIDF.csv")

# V: PCA, LDA, and word2vec and Convert to F5

In [233]:
# Vocabulary size cap: used to select terms for the PCA input and as max_features for the LDA vectorizer.
n_terms = 10000

In [234]:
# term_ids of the n_terms terms with the HIGHEST summed TF-IDF. The sort must
# be descending; an ascending sort followed by head() keeps the lowest-scoring
# terms instead.
top_TFIDF = (
    VOCAB_TFIDF.sort_values("tfidf_sum", ascending=False)
    .head(n_terms)
    .reset_index()["term_id"]
    .to_list()
)

In [None]:
# Restrict the document-term count matrix to the selected term columns.
filtered_DTM = DTCM.loc[:, top_TFIDF]
filtered_DTM.head()

## V.i PCA

In [None]:
# Term-by-term covariance matrix of the counts (rows and columns are term_ids).
COV = filtered_DTM.cov()
COV.head()

In [237]:
# Getting eigenvalues and eigenvectors.
# `eig` is scipy.linalg.eigh (see the imports), the solver for symmetric matrices.
eig_vals, eig_vecs = eig(COV)


In [None]:
# Wrap the eigenvector matrix in a frame labelled by term_id on both axes
# (the column labels are reused term_ids, not component numbers).
eig_vecs_table = pd.DataFrame(eig_vecs, index=COV.index, columns=COV.index)
eig_vecs_table.head()

In [239]:
# Eigenvalues as a one-column frame, labelled with the same index as COV.
eig_val_table = pd.DataFrame(eig_vals, index=COV.index, columns=["eig_val"])

In [None]:
# Pair every eigenvalue with its eigenvector: transposing puts one eigenvector
# per row, and join() aligns the two tables on their shared index labels.
eig_pairs = eig_val_table.join(eig_vecs_table.T)
eig_pairs.head()

In [241]:
# Explained variance: each eigenvalue as a percentage of the eigenvalue total, rounded to 2 decimals.
eig_pairs["exp_var"] = np.round((eig_pairs["eig_val"] / eig_pairs["eig_val"].sum())*100, 2)

In [None]:
# Keep the ten components that explain the most variance and label them
# PC0, PC1, ... in that order.
top_pcs = eig_pairs.sort_values("exp_var", ascending=False).head(10).reset_index(drop=True)
top_pcs.index = [f"PC{rank}" for rank in range(len(top_pcs))]
top_pcs.head(10)

In [None]:
# Loadings: keep only the term columns of top_pcs and transpose, giving one row
# per term_id and one column per principal component.
loadings = top_pcs[COV.index].T
loadings.index.name = "term_id"
loadings.head()

In [244]:
# Add the human-readable term for every term_id row, looked up in VOCAB.
loadings['term_str'] = [VOCAB.loc[int(term_id)].term_str for term_id in loadings.index]

In [None]:
loadings.head()

In [246]:
def _extreme_terms(pc, positive, n=5):
    """Return the terms with the ``n`` most extreme loadings on component ``pc``.

    positive=True  -> the largest (most positive) loadings, i.e. a descending sort.
    positive=False -> the smallest (most negative) loadings, i.e. an ascending sort.
    """
    ordered = loadings.sort_values(pc, ascending=not positive)
    return ordered.head(n).term_str.to_list()


# "+" lists hold the highest loadings and "-" lists the lowest
# (previously the two sort directions were swapped).
lb0_pos = _extreme_terms('PC0', positive=True)
lb0_neg = _extreme_terms('PC0', positive=False)
lb1_pos = _extreme_terms('PC1', positive=True)
lb1_neg = _extreme_terms('PC1', positive=False)
lb2_pos = _extreme_terms('PC2', positive=True)
lb2_neg = _extreme_terms('PC2', positive=False)
lb3_pos = _extreme_terms('PC3', positive=True)
lb3_neg = _extreme_terms('PC3', positive=False)

In [None]:
# Collect the term lists under their component labels, print them one per
# line, then emit the same data as a LaTeX table (one row per label).
pc_terms = {
    "PC0+": lb0_pos, "PC0-": lb0_neg,
    "PC1+": lb1_pos, "PC1-": lb1_neg,
    "PC2+": lb2_pos, "PC2-": lb2_neg,
    "PC3+": lb3_pos, "PC3-": lb3_neg,
}

for pc_label, terms in pc_terms.items():
    print('Books ' + pc_label, terms)


print(pd.DataFrame(pc_terms).T.to_latex())

In [None]:
# Adding loadings to TOKEN and VOCAB tables.
# term_str is dropped from loadings first because VOCAB_TFIDF already has that column.

VOCAB_PCA = pd.merge(VOCAB_TFIDF, loadings.drop('term_str', axis=1), on="term_id")
VOCAB_PCA

In [None]:
# Attach the PC loadings of each token's term (joined on term_id) and index by text_num.
TOKEN_PCA = pd.merge(TOKEN_TFIDF.reset_index(), loadings.drop("term_str", axis=1), on="term_id").set_index("text_num")
TOKEN_PCA

In [250]:
# Save the PCA-annotated token and vocabulary tables.
TOKEN_PCA.to_csv("../data/processed/TOKEN_PCA.csv")
VOCAB_PCA.to_csv("../data/processed/VOCAB_PCA.csv")

## V.ii LDA

In [None]:
# Build one space-separated string per article from its common nouns only
# (POS tags NN and NNS); these strings are the LDA input documents.
noun_tokens = TOKEN[TOKEN["pos"].str.match(r'^NNS?$')]
ARTICLES = (
    noun_tokens.groupby(OHCO[:1])["term_str"]
    .apply(lambda terms: ' '.join(terms))
    .to_frame()
    .rename(columns={'term_str': 'article_str'})
)
ARTICLES.head()

In [252]:
# Create a vector space model
# Use sklearn's Count Vectorizer to convert our corpus of articles into a document-term vector space,
# capped at n_terms features and excluding sklearn's built-in English stop words.

tfv = CountVectorizer(max_features=n_terms, stop_words="english")
tf = tfv.fit_transform(ARTICLES["article_str"])
TERMS = tfv.get_feature_names_out()  # feature names, used later as the PHI column labels

In [253]:
# Number of LDA topics. The model was always fitted with 20 components while
# n_topics (previously 40) was never used; the constant now holds the value
# actually used and is the single place to change it.
n_topics = 20
lda = LDA(n_components=n_topics, random_state=1819)

In [None]:
# Create Theta dataframe
THETA = pd.DataFrame(lda.fit_transform(tf), index=ARTICLES.index)
THETA.columns.name = "topic_id"
THETA.head()


In [None]:
# Create Phi dataframe
PHI = pd.DataFrame(lda.components_, columns=TERMS)
PHI.index.name = "topic_id"
PHI.columns.name = "term_str"
PHI = PHI.T
PHI.head()

In [None]:
# Taking top ten words for each topic based on weight
TOPICS = PHI.stack().to_frame().rename(columns={0:'weight'}).groupby('topic_id').apply(lambda x: x["weight"].sort_values(ascending=False).head(10).reset_index().drop('topic_id', axis=1)["term_str"])
TOPICS.head()

In [None]:
# Print the first five terms (columns 0-4) of every topic as a LaTeX table,
# with the index and column-axis names blanked out.
latex_topics = TOPICS.reset_index()[['topic_id', 0, 1, 2, 3, 4]]
latex_topics.index.name = ""
latex_topics.columns.name = ""
print(latex_topics.to_latex())

These topics are quite interesting. Topic 0 clearly seems to be related to research and data, while topic 1 appears to be related to astrophysics.

In [None]:
# Concatenating the topic words into a sentence and printing the result
TOPICS['label'] = TOPICS.apply(lambda x: str(x.name) + ' ' + ' '.join(x[:7].astype('str')), 1)
TOPICS[["label"]]

In [259]:
# Each topic's weight summed over all articles (THETA column sums, aligned by topic_id).
TOPICS['doc_weight_sum'] = THETA.sum()

In [None]:
TOPICS.head()

In [None]:
# Horizontal bar chart of the summed document weight per topic, sorted
# ascending so the heaviest topic is drawn last.
topics_by_weight = TOPICS.sort_values('doc_weight_sum', ascending=True)
topics_by_weight.plot.barh(
    x='label',
    y='doc_weight_sum',
    title="Document Importance by DWS",
    figsize=(5, 10),
)

In [262]:
# Save the topic model tables: topic terms and labels, document-topic weights, term-topic weights.
TOPICS.to_csv('../data/processed/TOPICS.csv')
THETA.to_csv('../data/processed/THETA.csv')
PHI.to_csv('../data/processed/PHI.csv')

## V.iii word2vec

In [263]:
# Making lists of words from token table per article
article_corpora = TOKEN[~TOKEN["pos"].str.match("NNPS?")].groupby("text_num")["term_str"].apply(lambda x: x.to_list()).reset_index()["term_str"].to_list()

In [264]:
# Making Word2Vec model out of corpora
article_model = Word2Vec(article_corpora, vector_size=256, window=8, min_count=10, workers=4, seed=1819)

In [265]:
def complete_analogy(A, B, C, model, n=2):
    """Answer the analogy "A is to B as C is to ?" with a word-vector model.

    Queries ``model.wv.most_similar`` with B and C as positive words and A as
    the negative word, and returns the first ``n`` results. If the lookup
    raises KeyError, the error is printed and None is returned.
    """
    try:
        candidates = model.wv.most_similar(positive=[B, C], negative=[A])
    except KeyError as e:
        print("Error:", e)
        return None
    return candidates[:n]

In [None]:
complete_analogy("tool", "work", "computer", article_model)

In [None]:
complete_analogy("good", "computer", "bad", article_model)

In [None]:
complete_analogy("computer", "automation", "people", article_model)

In [None]:
complete_analogy("digital", "computer", "analog", article_model)

In [None]:
complete_analogy("computer", "smart", "people", article_model)

In [None]:
complete_analogy("computer", "data", "people", article_model)

In [None]:
# Building coords frame for TSNE plot: one row per word in the model's
# vocabulary, holding the word and its embedding vector.
vocab_words = list(article_model.wv.key_to_index)
coords = pd.DataFrame({"label": vocab_words})
coords["vector"] = coords["label"].apply(lambda word: article_model.wv.get_vector(word))
coords.head()

In [None]:
# Expand every embedding vector into one column per dimension. The columns are
# left for pandas to number (0..d-1) instead of being hard-coded to 256, so the
# cell keeps working if the model's vector_size is changed.
vec_df = pd.DataFrame(coords["vector"].to_list())
coords = coords.join(vec_df).drop("vector", axis=1).set_index('label')
coords.index.name = "term_str"
coords.head()

In [274]:
# Save the word-embedding table (one row per term, one column per dimension).
coords.to_csv("../data/processed/coords.csv")

## Shapes of our output data

In [None]:
import os

data = [TOKEN_TFIDF, VOCAB_TFIDF, DOC, LIB]

# Report the (rows, columns) of each output table, in the order listed above.
print("Shapes:")
for table_name, table in zip(["TOKEN_TFIDF", "VOCAB_TFIDF", "DOC", "LIB"], data):
    print(table_name + ":", table.shape)

# Summary

In this notebook, we used NLP techniques to convert the raw corpus into various useful tables, which have been saved into the directory `/data/processed/`. This data will be used in the file `data_exploration.ipynb`, as well as in the final report paper, to glean insights into the perception of the dawn of the digital age.