In [25]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [26]:
# Filename stem: the input is "<stem>_similarities.csv" and every derived
# artefact written later reuses the same stem.
data_similarities_file = "news"
# Directory (relative to the notebook) that holds the input CSV and receives
# all generated files.
data_source = "news"

In [27]:
# Load the similarity table for the configured source.
data = pd.read_csv(f"{data_source}/{data_similarities_file}_similarities.csv")

# Cast year/month to int via float first.
# NOTE(review): presumably the CSV can store them as float-like values
# (e.g. "2018.0"), which a direct int cast would reject -- confirm.
data["year"] = data["year"].astype(float).astype(int)
data["month"] = data["month"].astype(float).astype(int)

# only keep the rows with year >= 2018
data = data[data["year"] >= 2018]

# Fill NaN values with the mean of each column (means are taken over the
# rows that survived the year filter). numeric_only=True keeps this working
# if the CSV ever carries a non-numeric column: pandas >= 2.0 raises a
# TypeError from DataFrame.mean() in that case instead of skipping it.
data = data.fillna(data.mean(numeric_only=True))

# sort the data by year and month; the set_index/reset_index round trip also
# moves both columns to the front and restores a plain 0..n-1 row index,
# which later cells rely on when aligning with the PCA output.
data = data.set_index(["year", "month"]).sort_index().reset_index()

# Only the "similarity_*" columns are used as PCA features.
columns_with_similarity = [col for col in data.columns if col.startswith("similarity_")]
data_features = data[columns_with_similarity]

Unnamed: 0,year,month,similarity_lex_avg_word_length,mean_lex_avg_word_length,similarity_lex_avg_sent_length_by_char,mean_lex_avg_sent_length_by_char,similarity_lex_avg_sent_length_by_word,mean_lex_avg_sent_length_by_word,similarity_lex_special_char_count,mean_lex_special_char_count,...,similarity_liwc_relig,mean_liwc_relig,similarity_liwc_death,mean_liwc_death,similarity_liwc_assent,mean_liwc_assent,similarity_liwc_nonfl,mean_liwc_nonfl,similarity_liwc_filler,mean_liwc_filler
0,2011,7,,5.28125,,45.6,,7.4,,0.0,...,,0.081081,,0.0,,0.0,,0.0,,0.0
1,2011,8,0.233142,5.91391,4697.464035,124.754762,102.422473,19.764286,0.00098,0.032719,...,7.9e-05,0.002381,0.0,0.0,0.0,0.0,2e-05,0.00119,0.0,0.0
2,2011,9,0.416519,6.250958,5156.67815,125.62381,184.630789,20.367857,0.005571,0.05893,...,0.000278,0.006609,0.0,0.0,0.0,0.0,1.2e-05,0.000928,0.0,0.0
3,2011,10,0.605618,6.238844,2045.848595,93.772876,51.479161,14.620915,0.002959,0.028932,...,0.000507,0.013897,0.0,0.0,0.0,0.0,2.6e-05,0.001225,0.0,0.0
4,2011,11,0.418261,6.045334,3566.793745,107.626245,86.878135,17.57318,0.002748,0.033068,...,0.000359,0.006531,0.0,0.0,0.0,0.0,1e-06,0.000227,0.0,0.0


In [31]:
# Write the current `data` frame to "<stem>_clean_similarities.csv" inside the
# source directory; index=False keeps the row index out of the file.
clean_similarities_path = f"{data_source}/{data_similarities_file}_clean_similarities.csv"
data.to_csv(clean_similarities_path, index=False)

In [32]:
# Standardise every feature column before PCA so that no feature dominates
# purely because of its scale.
scaler = StandardScaler()
data_features = scaler.fit_transform(data_features)

# A float n_components asks PCA to keep as many components as are needed to
# reach that fraction of explained variance (here 85%).
pca = PCA(n_components=0.85)
principalComponents = pca.fit_transform(data_features)

# Report how much variance each retained component carries, the total, and
# how many components were kept.
explained = pca.explained_variance_ratio_
print("Explained variance ratio for each component:", explained, sep="\n")
print("\nTotal explained variance ratio:", explained.sum(), sep="\n")
print(
    "\nNumber of components needed to explain 85% of variance:",
    pca.n_components_,
    sep="\n",
)

Explained variance ratio for each component:
[0.23303152 0.09474809 0.04154215 0.03345316 0.03214925 0.02790614
 0.02652047 0.02406604 0.02256349 0.02138934 0.01983519 0.01942404
 0.01885284 0.01793673 0.01677081 0.01571898 0.01517242 0.01451482
 0.01383936 0.01341952 0.0129758  0.01221734 0.01191436 0.0117087
 0.01150631 0.01075176 0.01044019 0.01032904 0.00968929 0.00955412
 0.00914137 0.00858644]

Total explained variance ratio:
0.8516690769928481

Number of components needed to explain 85% of variance:
32


In [33]:
# Loadings matrix: one row per similarity feature, one column per retained
# principal component (pca.components_ is components x features, hence .T).
pc_labels = [f"PC{i}" for i in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    columns=pc_labels,
    index=columns_with_similarity,
)

# Square every loading so sign no longer matters and larger always means a
# larger contribution. The vectorised power replaces DataFrame.applymap,
# which is deprecated since pandas 2.1 and emitted a FutureWarning here.
loadings = loadings**2

# for each principal component, write its most important features and their
# squared loadings to "<stem>_pca.txt"
with open(f"{data_source}/{data_similarities_file}_pca.txt", "w") as f:
    for i, pc_label in enumerate(pc_labels, start=1):
        f.write(f"PCA {i}\n")
        # the five features with the highest contribution
        f.write(f"{loadings[pc_label].nlargest(5).to_string()}\n")
        f.write("-" * 50 + "\n")

  loadings = loadings.applymap(lambda x: x**2)


In [34]:
# Wrap the PCA scores in a frame with one "PC<k>" column per retained
# component, then append the year/month of each row. The assignment aligns on
# the row index, so `data` must carry the same 0..n-1 index as the score matrix.
pc_columns = [f"PC{k}" for k in range(1, pca.n_components_ + 1)]
new_data = pd.DataFrame(principalComponents, columns=pc_columns).assign(
    year=data["year"],
    month=data["month"],
)

# Save the scores as "<stem>_pca.csv" in the source directory, without the index.
new_data.to_csv(f"{data_source}/{data_similarities_file}_pca.csv", index=False)

In [35]:
# new_data["time"] = new_data["year"].astype(str) + "-" + new_data["month"].astype(str)

In [36]:
# # for each of the PC columns, draw a line plot
# for i in range(1, pca.n_components_ + 1):
#     fig = px.line(
#         new_data,
#         x="time",
#         y=f"PC{i}",
#         title=f"Principal Component {i}",
#         labels={"time": "Time", f"PC{i}": f"Principal Component {i}"},
#     )
#     fig.show()