In [25]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [26]:
# File stem shared by every CSV/TXT this notebook reads or writes
# (e.g. "<stem>_similarities.csv", "<stem>_pca.csv").
data_similarities_file = "cl_cv_papers"
# Directory, relative to the notebook, that holds those files.
data_source = "papers"

In [27]:
# Read the per-month similarity table produced upstream.
similarities_csv = f"{data_source}/{data_similarities_file}_similarities.csv"
data = pd.read_csv(similarities_csv)
# Cast the date parts to int via float; presumably the CSV stores them in a
# float-like form (e.g. 2007.0) that a direct int cast would reject -- TODO confirm.
data = data.assign(
    year=data["year"].astype(float).astype(int),
    month=data["month"].astype(float).astype(int),
)
data.head()

Unnamed: 0,year,month,similarity_lex_avg_word_length,mean_lex_avg_word_length,similarity_lex_avg_sent_length_by_char,mean_lex_avg_sent_length_by_char,similarity_lex_avg_sent_length_by_word,mean_lex_avg_sent_length_by_word,similarity_lex_special_char_count,mean_lex_special_char_count,...,similarity_liwc_relig,mean_liwc_relig,similarity_liwc_death,mean_liwc_death,similarity_liwc_assent,mean_liwc_assent,similarity_liwc_nonfl,mean_liwc_nonfl,similarity_liwc_filler,mean_liwc_filler
0,2007,5,0.341657,7.242399,1688.271245,146.315946,34.419985,21.648887,0.004122,0.146537,...,1.5e-05,0.000867,1.301408e-07,2e-05,3e-06,0.000145,1.1e-05,0.000972,2e-06,0.000178
1,2007,6,0.278744,7.193838,1619.753472,151.791667,28.753472,22.458333,0.004201,0.135119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2007,7,0.04301,6.961461,3612.5,164.5,128.0,26.0,3.3e-05,0.107026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000386,0.013889,0.0,0.0
3,2007,8,,7.351852,,143.5,,21.0,,0.095238,...,,0.011905,,0.0,,0.0,,0.011905,,0.0
4,2007,9,0.320511,7.550521,704.964493,155.626263,20.953704,22.388889,0.000543,0.147322,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Restrict the analysis to rows dated 2018 or later.
data = data.loc[data["year"].ge(2018)]

In [29]:
# 1) Replace every NaN with the mean of its column.
# 2) Order the rows chronologically by (year, month); going through the index
#    and back also leaves a fresh 0..n-1 RangeIndex on the frame.
data = (
    data.fillna(data.mean())
    .set_index(["year", "month"])
    .sort_index()
    .reset_index()
)
# The PCA input is every column whose name starts with "similarity_".
columns_with_similarity = data.columns[
    data.columns.str.startswith("similarity_")
].tolist()
data_features = data[columns_with_similarity]

In [30]:
# Show the cleaned frame (NaNs mean-imputed, rows ordered by year and month).
data

Unnamed: 0,year,month,similarity_lex_avg_word_length,mean_lex_avg_word_length,similarity_lex_avg_sent_length_by_char,mean_lex_avg_sent_length_by_char,similarity_lex_avg_sent_length_by_word,mean_lex_avg_sent_length_by_word,similarity_lex_special_char_count,mean_lex_special_char_count,...,similarity_liwc_relig,mean_liwc_relig,similarity_liwc_death,mean_liwc_death,similarity_liwc_assent,mean_liwc_assent,similarity_liwc_nonfl,mean_liwc_nonfl,similarity_liwc_filler,mean_liwc_filler
0,2018,1,0.206911,7.243522,1002.532135,153.769020,19.474501,22.481583,0.001751,0.152994,...,0.000010,0.001844,1.701667e-06,0.000192,1.264216e-06,0.000138,0.000005,0.000662,0.000003,0.000331
1,2018,2,0.201342,7.232860,962.357332,152.551368,20.056789,22.398542,0.001863,0.152748,...,0.000011,0.001863,7.973867e-07,0.000099,1.172148e-07,0.000025,0.000005,0.000754,0.000002,0.000261
2,2018,3,0.206050,7.243884,2759.539863,159.308124,55.978232,23.283996,0.001819,0.157493,...,0.000012,0.002268,1.380895e-06,0.000132,1.086183e-06,0.000099,0.000009,0.000906,0.000002,0.000331
3,2018,4,0.197349,7.252651,2046.874101,155.983781,43.696324,22.751282,0.001867,0.156761,...,0.000014,0.002185,1.175372e-06,0.000115,1.688866e-06,0.000137,0.000007,0.001022,0.000003,0.000386
4,2018,5,0.189660,7.272180,971.289730,157.031027,19.812239,22.921228,0.001995,0.158799,...,0.000012,0.001910,5.397115e-07,0.000060,5.543952e-07,0.000076,0.000009,0.001242,0.000002,0.000218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,2024,7,0.208091,7.570956,1683.041999,163.048669,30.542547,22.552175,0.002442,0.181090,...,0.000017,0.003261,1.379813e-06,0.000078,4.133734e-06,0.000201,0.000003,0.000505,0.000003,0.000475
79,2024,8,0.199637,7.588980,1410.598432,163.264160,23.188148,22.520564,0.002429,0.181487,...,0.000015,0.003118,1.399755e-06,0.000135,1.626375e-06,0.000144,0.000004,0.000546,0.000004,0.000532
80,2024,9,0.207122,7.574413,801.308426,162.762355,15.376336,22.491951,0.002331,0.180819,...,0.000017,0.003333,1.534584e-06,0.000110,1.902829e-06,0.000162,0.000004,0.000567,0.000004,0.000561
81,2024,10,0.199337,7.578550,1509.635991,164.356020,26.359782,22.689774,0.002275,0.178959,...,0.000021,0.003469,1.401350e-06,0.000102,1.177266e-06,0.000105,0.000004,0.000569,0.000004,0.000583


In [31]:
# Persist the cleaned, chronologically ordered table next to the raw one.
clean_similarities_csv = (
    f"{data_source}/{data_similarities_file}_clean_similarities.csv"
)
data.to_csv(clean_similarities_csv, index=False)

In [32]:
# Standardise the similarity features, then fit a PCA that keeps as many
# components as are needed to reach the 0.85 explained-variance target.
data_features = StandardScaler().fit_transform(data_features)
pca = PCA(n_components=0.85)
principalComponents = pca.fit_transform(data_features)

# Report: per-component variance ratios, their total, and the component count.
print(
    "Explained variance ratio for each component:",
    pca.explained_variance_ratio_,
    sep="\n",
)
print(
    "\nTotal explained variance ratio:",
    pca.explained_variance_ratio_.sum(),
    sep="\n",
)
print(
    "\nNumber of components needed to explain 85% of variance:",
    pca.n_components_,
    sep="\n",
)

Explained variance ratio for each component:
[0.21060099 0.04014095 0.03816853 0.02931182 0.02642738 0.02504387
 0.02385658 0.02228046 0.02053492 0.01964607 0.01913249 0.01831109
 0.01761816 0.01710608 0.0163671  0.01575633 0.01561931 0.01462544
 0.01427476 0.01390193 0.01369704 0.01347473 0.01305547 0.01302365
 0.01269485 0.01203844 0.01199575 0.0116204  0.01124421 0.01066039
 0.01060199 0.01038124 0.00977572 0.00927385 0.00897691 0.00879072
 0.00843773 0.00827853 0.00812188 0.00791654 0.00749577 0.00738591
 0.00702629]

Total explained variance ratio:
0.8546922678227705

Number of components needed to explain 85% of variance:
43


In [33]:
# Loadings table: one row per similarity feature, one column per principal
# component (PC1..PCn), taken from the fitted PCA's component matrix.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)],
    index=columns_with_similarity,
)
# Square every loading so the sign is discarded and features can be ranked by
# magnitude. The vectorised power replaces DataFrame.applymap, which is
# deprecated and emitted a warning on each run.
loadings = loadings**2
# For each principal component, write the five features with the largest
# squared loading, together with those values, to a plain-text report.
with open(f"{data_source}/{data_similarities_file}_pca.txt", "w") as f:
    for i in range(1, pca.n_components_ + 1):
        f.write(f"PCA {i}\n")
        # top five features for this component, highest contribution first
        f.write(f"{loadings[f'PC{i}'].nlargest(5).to_string()}\n")
        f.write("-" * 50 + "\n")


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



In [34]:
# Wrap the projected samples in a frame with columns PC1..PCn, then attach the
# year/month of each row (aligned on the shared 0..n-1 index of `data`).
pc_column_names = [f"PC{k}" for k in range(1, pca.n_components_ + 1)]
new_data = pd.DataFrame(principalComponents, columns=pc_column_names).assign(
    year=data["year"],
    month=data["month"],
)

# Save the component scores alongside the other outputs.
new_data.to_csv(f"{data_source}/{data_similarities_file}_pca.csv", index=False)

In [35]:
# Build a "<year>-<month>" text label per row, used as the x axis when plotting.
new_data["time"] = (
    new_data["year"].astype(str).str.cat(new_data["month"].astype(str), sep="-")
)

In [36]:
# One interactive line chart per principal component: score over time.
for pc_number in range(1, pca.n_components_ + 1):
    pc_column = f"PC{pc_number}"
    pc_title = f"Principal Component {pc_number}"
    fig = px.line(
        new_data,
        x="time",
        y=pc_column,
        title=pc_title,
        labels={"time": "Time", pc_column: pc_title},
    )
    fig.show()