# Final Project: Data Exploration
## DS 5001
### Author: Taylor Tucker


In this document, separate from `data_processing.ipynb`, I will be using the various techniques we have learned in order to analyze the given data. This notebook can be considered an extension of the aforementioned notebook.

In [25]:
import html

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.cluster.hierarchy as sch
import seaborn as sns
from IPython.core.display import HTML
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE

In [26]:
# Index levels, listed from the text level down to the token level.
OHCO = ["text_num", "paragraph_num", "sentence_num", "token_num"]

# Prefixes of OHCO: the keys that identify a token, sentence, paragraph and text.
TOKS, SENTS, PARAS, TEXTS = (OHCO[:depth] for depth in (4, 3, 2, 1))

In [None]:
# Load the DOC table; the first CSV column is used as the index.
DOC = pd.read_csv('../data/processed/DOC.csv', index_col=0)
DOC.head()

In [None]:
# Load the LIB table; the first CSV column is used as the index.
LIB = pd.read_csv("../data/processed/LIB.csv", index_col=0)
LIB.head()

In [None]:
# Load the token-level table; the first CSV column is used as the index.
TOKEN_TFIDF = pd.read_csv("../data/processed/TOKEN_TFIDF.csv", index_col=0)
TOKEN_TFIDF.head()

In [None]:
# Load the vocabulary table. Unlike the loads above, no index column is set here,
# so the frame gets a default integer index.
VOCAB_TFIDF = pd.read_csv("../data/processed/VOCAB_TFIDF.csv")
VOCAB_TFIDF.head()

# VI: Explore the Results

## Hierarchical Cluster Diagrams

In [None]:
# Creating Document-Pair Matrix
PAIRS = pd.DataFrame(index=pd.MultiIndex.from_product([DOC.index.tolist(), DOC.index.tolist()])).reset_index()
PAIRS = PAIRS[PAIRS.level_0 < PAIRS.level_1].set_index(["level_0", "level_1"])
PAIRS.index.names = ["doc_a", "doc_b"]
PAIRS.head()

In [32]:
# Load the TFIDF table with one row per text, indexed by text_num.
TFIDF = pd.read_csv("../data/processed/TFIDF.csv", index_col="text_num")

In [None]:
# Distance metrics to compute for every document pair; each becomes a PAIRS column.
methods = ["euclidean", "cosine", "jaccard", "minkowski"]

# pdist returns one value per row pair of TFIDF in condensed order.
pair_distances = {metric: pdist(TFIDF, metric) for metric in methods}
PAIRS = PAIRS.assign(**pair_distances)

PAIRS.head()

In [34]:
def hca(sims, linkage_method='ward', color_thresh=.3, figsize=(10, 10), labels=None):
    """Draw a left-oriented dendrogram from pairwise document distances.

    Parameters
    ----------
    sims : array-like
        Passed unchanged to ``sch.linkage``. In this notebook the caller
        supplies one distance column of ``PAIRS``.
    linkage_method : str
        Linkage criterion forwarded to ``sch.linkage``.
    color_thresh : float
        Forwarded to ``sch.dendrogram`` as ``color_threshold``; links above it
        are drawn in the grey shade ``'.75'``.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    labels : sequence, optional
        Leaf labels. Defaults to ``DOC["title"].values``.

    Returns
    -------
    None
        The figure is produced as a side effect.
    """
    tree = sch.linkage(sims, method=linkage_method)
    if labels is None:
        labels = DOC["title"].values

    # A single explicit figure/axes pair; the previous extra plt.figure() call
    # left an empty figure behind on every invocation.
    _, ax = plt.subplots(figsize=figsize)
    sch.dendrogram(tree,
                   labels=labels,
                   orientation="left",
                   # NOTE(review): both sort flags are kept as in the original
                   # call; confirm against the SciPy docs which one takes effect.
                   count_sort=True,
                   distance_sort=True,
                   above_threshold_color='.75',
                   color_threshold=color_thresh,
                   ax=ax)
    ax.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# Cluster the documents on their Jaccard distances.
# NOTE(review): figsize=(100, 200) is passed straight to plt.subplots; confirm
# that a canvas this large is intended.
hca(PAIRS["jaccard"], color_thresh=10, figsize=(100, 200))

## Heatmaps

In [36]:
# LIB ordered by publication year, oldest first (ascending is the default order).
lib_sorted = LIB.sort_values(by="pub_year")


## Dispersion Plots

In [None]:
%matplotlib inline
# Dispersion of three key terms along the token sequence stored in TOKEN_TFIDF.
# (Title spelling corrected: "Ocurrences" -> "Occurrences".)
disp_fig = nltk.draw.dispersion_plot(
    TOKEN_TFIDF["term_str"].to_list(),
    ["computer", "analog", "digital"],
    title="Occurrences of Key Words in Corpus",
)

The table `TOKEN_TFIDF` is sorted by year, so this gives us a good representation of usage over time. It is interesting to note that the use of "computer" almost seems to come in waves. 

## Time Series Plots

In [None]:
# Inspect the token-level table before building the time series.
TOKEN_TFIDF

In [None]:
# Per publication year: how many "computer" tokens occur and their mean sentiment.
computer_tokens = TOKEN_TFIDF[TOKEN_TFIDF["term_str"] == "computer"]
comp_year = (
    computer_tokens
    .merge(LIB, on="text_num")
    .loc[:, ["term_str", "pub_year", "sentiment"]]
    .reset_index()
    .drop("text_num", axis=1)
    .groupby("pub_year")
    .agg({'term_str': ['count'], 'sentiment': ['mean']})
)
comp_year.head()

In [40]:
# Flatten the two aggregated columns to simple names, then add centred
# 3-year rolling means of each.
# Only the first two columns are kept before renaming, so re-running this cell
# after the rolling columns exist no longer fails with a length-mismatch error.
comp_year = comp_year.iloc[:, :2].set_axis(["count", "mean_sentiment"], axis=1)
comp_year = comp_year.assign(
    rolling_count=comp_year["count"].rolling(window=3, center=True).mean(),
    rolling_sentiment=comp_year["mean_sentiment"].rolling(window=3, center=True).mean(),
)

In [None]:
# Inspect the yearly counts and sentiment alongside their rolling means.
comp_year

In [None]:
# Two-axis line chart: smoothed yearly count of "computer" (left axis) and
# smoothed mean sentiment (right axis, orange).
plt.style.use('seaborn-v0_8-paper')
fig, ax1 = plt.subplots()
sns.lineplot(data=comp_year, x="pub_year", y="rolling_count", ax=ax1)
ax1.set_ylabel("Count")

# Draw on the twin axes explicitly rather than relying on twinx() having made
# it the current axes.
ax2 = ax1.twinx()
sns.lineplot(data=comp_year, x="pub_year", y="rolling_sentiment", color="orange", ax=ax2)
ax2.set_ylabel("Sentiment")

# Title spelling corrected: "Occurances" -> "Occurrences".
ax1.set_title("Smoothed Average Sentiment and Occurrences of 'Computer' over Time")

plt.show()

## t-SNE Plots

In [None]:
# Load the per-term coordinate table and set how many terms the sampling cell
# below draws for the t-SNE plot.
coords = pd.read_csv("../data/processed/coords.csv")
n_samples = 1000
coords.head()

In [44]:
# Index by term so individual words can be looked up with .loc.
# Guarded so that re-running this cell after the index has been set does not
# raise a KeyError.
if "term_str" in coords.columns:
    coords = coords.set_index("term_str")

In [None]:
# Coordinates of the focus term.
comp_vec = coords.loc["computer", :]

# Draw n_samples - 1 random terms from everything except "computer", then append
# "computer" as the final row. Previously the last sampled row was overwritten,
# which could leave "computer" in the sample twice if it had also been drawn.
# The total row count (n_samples) and the "computer is last" layout are unchanged.
other_terms = coords.drop(index="computer").sample(n_samples - 1, random_state=1819)
sample_coords = pd.concat([other_terms, coords.loc[["computer"]]]).reset_index()
sample_coords.head()

In [None]:
# Feature matrix for t-SNE. Selecting the columns by name (those that came from
# `coords`) instead of "everything after the first column" keeps this cell
# re-runnable: the x / y columns added below are never fed back in.
# A single vectorised conversion also replaces the per-row Python loop.
vecs = sample_coords[coords.columns].to_numpy(dtype="float32")

# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(perplexity=50, n_components=2, init="pca", n_iter=2000, random_state=1819)

tsne_values = tsne.fit_transform(vecs)

sample_coords["x"] = tsne_values[:, 0]
sample_coords["y"] = tsne_values[:, 1]

In [None]:
# Every term is drawn in black except the final row, which is highlighted in red.
n_terms = sample_coords.shape[0]
colors = ["black"] * (n_terms - 1) + ["red"]
sample_coords["color"] = colors
sample_coords.tail()

In [None]:
# Text-only scatter of the t-SNE projection; each label takes its colour from `colors`.
tsne_fig = px.scatter(sample_coords, x="x", y="y", text="term_str", height=1000)
tsne_fig.update_traces(mode="text", textfont_color=colors)
tsne_fig.show()

In [49]:
# tsne_fig.write_image("../media/tsne_official.png")

## PCA Loadings

In [50]:
# Load the token-level PCA table read by plot_pcas below.
TOKEN_PCA = pd.read_csv("../data/processed/TOKEN_PCA.csv")

In [51]:
def plot_pcas(token_pca, pc1, pc2, discrete_text=False,
              range_x=(-0.2, 0.2), range_y=(-0.1, 0.1)):
    """Scatter two principal-component columns against each other.

    Parameters
    ----------
    token_pca : pandas.DataFrame
        Must contain ``text_num``, ``term_str`` and the columns ``PC<pc1>`` and
        ``PC<pc2>``. It is not modified.
    pc1, pc2 : int or str
        Component numbers; the plotted columns are ``"PC" + str(pc1)`` and
        ``"PC" + str(pc2)``.
    discrete_text : bool
        If True, ``text_num`` is cast to ``str`` before colouring; otherwise it
        is cast to ``int``.
    range_x, range_y : tuple
        Axis limits forwarded to ``px.scatter``.

    Returns
    -------
    The figure object returned by ``px.scatter``.
    """
    text_dtype = "str" if discrete_text else "int"
    # Work on a copy: casting in place used to change the caller's DataFrame,
    # so the result of one call depended on which calls came before it.
    plot_data = token_pca.assign(text_num=token_pca["text_num"].astype(text_dtype))

    x_col = "PC" + str(pc1)
    y_col = "PC" + str(pc2)
    return px.scatter(plot_data, x_col, y_col, color="text_num", hover_name="term_str",
                      range_x=range_x, range_y=range_y)

In [None]:
# PC1 against PC2, with text_num cast to int for colouring.
plot_pcas(TOKEN_PCA, 1, 2)

In [None]:
# PC5 against PC8, with text_num cast to str for colouring.
plot_pcas(TOKEN_PCA, 5, 8, discrete_text=True)

## Sentiment

In [None]:
# Load the sentiment lexicon, strip the "nrc_" marker from its column names, and
# derive polarity as positive minus negative.
salex = pd.read_csv("../data/salex/salex_nrc.csv").rename(
    columns=lambda name: name.replace("nrc_", "")
)
salex["polarity"] = salex["positive"] - salex["negative"]
salex.head()

In [None]:
# Inspect the token-level table before merging it with LIB.
TOKEN_TFIDF

In [None]:
# Attach the LIB columns to every token row via an inner join on text_num.
lib_flat = LIB.reset_index()
tokens_flat = TOKEN_TFIDF.reset_index()
articles = lib_flat.merge(tokens_flat, how="inner", on="text_num")
articles

In [None]:
# Left-join the lexicon on term_str so tokens without a lexicon entry are kept.
articles = articles.merge(salex, on="term_str", how="left")
articles

In [None]:
# Spot-check ten random rows of the lexicon columns after the merge.
articles[salex.columns].sample(10)

In [None]:
# List the merged columns before dropping the ones not needed for sentiment work.
articles.columns

In [None]:
# Columns holding the per-token emotion scores plus the derived polarity.
emotion_cols = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'polarity']

# Columns dropped from the merged frame before the sentiment analysis.
unused_cols = ["author", "pub_year", "pub_mon", "pub_day", "pos_tuple", 'lemma', 'sentiment', 'term_id', 'tfidf_sum', "negative", "positive", "title"]
articles = articles.drop(columns=unused_cols)

# Missing scores (left by the left join on the lexicon) are filled with 0.0.
articles[emotion_cols] = articles[emotion_cols].fillna(0.0)
articles = articles.set_index(SENTS)
articles.head()

### Article Emotions

In [None]:
# Horizontal bar chart of the mean score per emotion column, smallest first.
plt.style.use('seaborn-v0_8-paper')
mean_emotions = articles[emotion_cols].mean().sort_values()
mean_emotions.plot.barh()

### Visualize Emotion

In [62]:
def class_word(x):
    """Wrap one token in a ``<span>`` whose CSS class encodes its sentiment.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide ``term_str``, ``polarity`` and ``token_str``.

    Returns
    -------
    str
        ``<span class='sentN'>token</span>`` where N is 2 when the term
        contains the substring "computer" (so "computers" also matches),
        otherwise the sign of ``polarity`` (-1, 0 or 1).
    """
    if "computer" in str(x["term_str"]):
        val = 2
    else:
        polarity = x["polarity"]
        # A missing polarity is treated as neutral; int(np.sign(NaN)) would raise.
        val = 0 if pd.isna(polarity) else int(np.sign(polarity))

    # Escape the token so characters such as "<" or "&" from the corpus cannot
    # break the generated markup.
    ts = html.escape(str(x["token_str"]), quote=False)

    return "<span class='sent{}'>{}</span>".format(val, ts)

In [63]:
# One HTML <span> per token row, built by class_word.
articles["html"] = articles.apply(class_word, axis=1)

In [None]:
# Inspect the token table now that the html column has been added.
articles

In [None]:
# Mean of every emotion column within each sentence.
sentence_groups = articles.groupby(SENTS)
sents = sentence_groups[emotion_cols].mean()
sents

In [66]:
# Rebuild each sentence twice: as plain space-joined terms and as the
# space-joined HTML spans produced for its tokens.
tokens_by_sentence = articles.groupby(SENTS)
join_with_space = lambda parts: parts.str.cat(sep=" ")
sents["sent_str"] = tokens_by_sentence["term_str"].apply(join_with_space)
sents['html_str'] = tokens_by_sentence["html"].apply(join_with_space)

def sample_sentences(df, emo="polarity", n=10, random_state=None):
    """Display a random sample of sentences as a colour-coded HTML table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column named by ``emo`` and an ``html_str`` column.
    emo : str
        Column whose value, rounded to 4 decimals, is shown and decides the row
        colour: green above zero, red below zero, grey at zero.
    n : int
        Number of sentences to show. Capped at ``len(df)`` so that small frames
        no longer make ``DataFrame.sample`` raise.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for a repeatable sample.

    Returns
    -------
    None
        The style sheet and the table are rendered with ``display``.
    """
    sampled = df.sample(min(n, len(df)), random_state=random_state)

    rows = []
    for idx, raw_valence, word in zip(sampled.index, sampled[emo], sampled["html_str"]):
        valence = round(raw_valence, 4)
        if valence > 0:
            color = '#ccffcc'
        elif valence < 0:
            color = '#ffcccc'
        else:
            color = '#f2f2f2'
        rows.append("""<tr style="background-color:{0};padding:.5rem 1rem;font-size:110%;">
        <td>{1}</td><td>{3}</td><td width="400" style="text-align:left;">{2}</td>
        </tr>""".format(color, valence, word, idx))

    # The sentN classes match the spans emitted by class_word.
    display(HTML('<style>#sample1 td{font-size:120%;vertical-align:top;} .sent-1{color:red;font-weight:bold;} .sent1{color:green;font-weight:bold;} .sent2{color:yellow;font-weight:bold;}</style>'))
    display(HTML('<table id="sample1"><tr><th>Sentiment</th><th>ID</th><th width="600">Sentence</th></tr>'+''.join(rows)+'</table>'))

In [None]:
# Show one random sample of sentences, coloured by polarity (the default emo).
sample_sentences(sents)

In [None]:
# Draw a second random sample; no seed is passed, so the rows differ between runs.
sample_sentences(sents)

### Computer Article Sentiment over Time

In [None]:
# Re-attach the LIB columns and keep only the term, its publication year and
# the emotion scores.
year_cols = ["term_str", "pub_year"] + emotion_cols
articles_yrs = articles.merge(LIB, how="inner", on="text_num").loc[:, year_cols]
articles_yrs

In [None]:

def plot_sentiment_time(df, emo="polarity", rolling=False, rolling_val=3, ax=None):
    """Plot the yearly mean of one emotion column as a line chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pub_year`` and the column named by ``emo``.
    emo : str
        Column averaged per publication year; also used, capitalised, as the
        y-axis label.
    rolling : bool
        If True, the yearly means are smoothed with a centred rolling mean.
    rolling_val : int
        Window size of that rolling mean.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, seaborn draws on the current axes.

    Returns
    -------
    The matplotlib Axes returned by ``sns.lineplot``.
    """
    yearly = df.groupby("pub_year")[emo].mean().reset_index()

    if rolling:
        yearly[emo] = yearly[emo].rolling(window=rolling_val, center=True).mean()

    plt.style.use('seaborn-v0_8-paper')
    # sns.lineplot returns an Axes (not a Figure), hence the name.
    axes = sns.lineplot(data=yearly, x="pub_year", y=emo, ax=ax)
    axes.set_title("Emotion over Time")
    axes.set_xlabel("Publishing Year")
    axes.set_ylabel(emo.capitalize())
    return axes

# Yearly mean of the "trust" score, smoothed with the default 3-year window.
plot_sentiment_time(articles_yrs, emo="trust", rolling=True)