In [2]:
import requests
import numpy as np
import sklearn as sk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.random import rand

API_KEY = "pub_4dbeb1c8439e481cb04f5e6f2aaafe55"

url = "https://newsdata.io/api/1/news"

params = {
    "apikey": API_KEY,
    "q": "Ukraine",
    "language": "en"
}

response = requests.get(url, params=params)

data = response.json()

print(data.keys())

dict_keys(['status', 'totalResults', 'results', 'nextPage'])


In [3]:
articles = data['results']
# print(articles)
new_data = []
for article in articles:
  new_data.append(article.get('title') + " " + article.get('description'))

In [4]:
vectorizer = TfidfVectorizer(stop_words='english')
vec_data = vectorizer.fit_transform(new_data)

In [5]:
cov_matrix = B = vec_data.T @ vec_data

def find_q_vector(cov_matrix, rand_vector, k, list_of_eigenvectors):
  for i in range(k):
    rand_vector = cov_matrix @ rand_vector
    rand_vector = rand_vector / np.linalg.norm(rand_vector)
    rand_vector = reorthogonalize(rand_vector, list_of_eigenvectors)
  return rand_vector

def reorthogonalize(vector_to_orth, list_of_vectors):
    for vector in list_of_vectors:
        vector_to_orth = vector_to_orth - (vector.T @ vector_to_orth) * vector

    vector_to_orth = vector_to_orth / np.linalg.norm(vector_to_orth)

    return vector_to_orth

def find_eigenvector(cov_matrix, k, n):

  #Create list of eigenvectors for future
  list_of_eigenvectors = []

  for _ in range(n):
    #Create random_vector
    rand_vector = np.random.rand(cov_matrix.shape[1])
    rand_vector = rand_vector / np.linalg.norm(rand_vector)

    #Reorthogonalize
    # rand_vector = reorthogonalize(rand_vector, list_of_eigenvectors)

    #Power mehod
    lambda_vector = find_q_vector(cov_matrix, rand_vector, k, list_of_eigenvectors)

    #Reorthogonalize again
    lambda_vector = reorthogonalize(lambda_vector, list_of_eigenvectors)

    list_of_eigenvectors.append(lambda_vector)

  return list_of_eigenvectors


In [12]:
eigenvectors = find_eigenvector(cov_matrix, 100, 20)

In [7]:
# ── Setup ─────────────────────────────────────────────────────────────────────

np.random.seed(42)

# Replace with your actual vec_data if you have it
vec_data = np.random.rand(200, 100)

N_COMPONENTS = 5
K_POWER_ITER = 50

cov_matrix = vec_data.T @ vec_data

# ── Get your eigenvectors ─────────────────────────────────────────────────────

your_eigenvectors = find_eigenvector(cov_matrix, k=K_POWER_ITER, n=N_COMPONENTS)

# ── Get GROUND TRUTH from numpy ───────────────────────────────────────────────
# numpy's eigh is exact for symmetric matrices (cov_matrix is symmetric)
# returns eigenvalues ascending, so we flip to get descending

true_eigenvalues, true_eigenvectors = np.linalg.eigh(cov_matrix)
true_eigenvalues  = true_eigenvalues[::-1][:N_COMPONENTS]
true_eigenvectors = true_eigenvectors[:, ::-1][:, :N_COMPONENTS].T  # shape: (n, features)

# Your eigenvalues via Rayleigh quotient
your_eigenvalues = np.array([v @ cov_matrix @ v for v in your_eigenvectors])


# ── TEST 1: Are they actual eigenvectors? ─────────────────────────────────────
# A true eigenvector satisfies: cov_matrix @ v = λ * v
# So residual = ||cov_matrix @ v - λ * v|| should be ~0

print("=" * 60)
print("  TEST 1: Eigenvector Residual")
print("  cov @ v - λv should be ~0 for a true eigenvector")
print("  < 0.001 = ✅   < 0.01 = ⚠️    > 0.01 = ❌")
print("=" * 60)

for i in range(N_COMPONENTS):
    v  = your_eigenvectors[i]
    lam = your_eigenvalues[i]
    residual = np.linalg.norm(cov_matrix @ v - lam * v)
    flag = "✅" if residual < 0.001 else "⚠️ " if residual < 0.01 else "❌"
    print(f"  v{i+1}  residual = {residual:.6f}  {flag}")


  TEST 1: Eigenvector Residual
  cov @ v - λv should be ~0 for a true eigenvector
  < 0.001 = ✅   < 0.01 = ⚠️    > 0.01 = ❌
  v1  residual = 0.000000  ✅
  v2  residual = 0.549440  ❌
  v3  residual = 0.677793  ❌
  v4  residual = 0.251662  ❌
  v5  residual = 0.416259  ❌


In [24]:
#Finding eigenvalues:
eigenvectors = np.array(eigenvectors)

eigenvalues = np.array([v @ cov_matrix @ v for v in eigenvectors])
eigeval_eigenvec_sorted = sorted(zip(eigenvalues, eigenvectors), key=lambda x: x[0], reverse=True)


In [41]:
#Finding topics:
vocab = vectorizer.get_feature_names_out()
def get_sentances(list_of_eigenvectors):
    list_of_eigenvectors = [pair[1] for pair in eigeval_eigenvec_sorted]
    list_of_topics = []
    for el in list_of_eigenvectors:
        topic_word_weigths = zip(el, vocab)
        sorted_weigths_sentances = sorted(topic_word_weigths, key=lambda x: x[0], reverse=True)
        topic = ''
        for word in sorted_weigths_sentances[:10]:
            topic += ' ' + word[1]
        list_of_topics.append(topic)
    return list_of_topics

In [43]:
result = get_sentances(eigeval_eigenvec_sorted)
result

[' forces iranread cortina afpid 18 games anthem concluded appeared envoy',
 ' envoy 344 category absence 1771400648258569500wed 18 february jared dubious fell',
 ' invasion cities main 07 launched attacks iranread 100 february 30kyiv',
 ' fell market main 30kyiv 11 experts iran conflictrussiaukraine envoys juggle',
 ' attack kushner author foreign 02 games exchange expressed force market',
 ' eastern 344 games forces kushner hundreds attacks flown killed juggle',
 ' hosts end deployed adding 2026 february jared iranread discussions 07',
 ' juggle 02 launches cautious absence increasingly cities brokered article fighting',
 ' kyiv attacks afpid began 2026 jared interview efforts discussions games',
 ' conflict dollar 07 02 126 hours 344 barrage law attack',
 ' belgorod attack 02 efforts frontier 2014 brokered 100 air 2022',
 ' jared exchange 1771400648258569500wed 100 border began day decade 126 launched',
 ' exerting market devastated dollar article 30kyiv category games belgorod 344'