In [6]:
import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])

# Compute by formula:
#   r = sum(dx * dy) / (sqrt(sum(dx^2)) * sqrt(sum(dy^2)))
# where dx and dy are the deviations of x and y from their means.
x_mean = x.mean()
y_mean = y.mean()

x_dev = x - x_mean
y_dev = y - y_mean
numerator = np.sum(x_dev * y_dev)
denominator = np.sqrt(np.sum(x_dev ** 2)) * np.sqrt(np.sum(y_dev ** 2))

# NOTE: despite its name, cov_x_y holds the Pearson correlation coefficient
# (the cross-deviation sum normalised by both spreads), not the covariance.
cov_x_y = numerator / denominator

print("Correlation Coefficient: ", cov_x_y)

# Compute with numpy
r = np.corrcoef(x, y)  # Returns the 2x2 correlation matrix
print("Correlation Coefficient - 1: ", r[0, 1])

Correlation Coefficient:  0.9999999999999998
Correlation Coefficient - 1:  0.9999999999999999


In [8]:
# Strictly decreasing series 10, 8, 6, 4, 2, paired with the `x` array that
# an earlier cell defines.
y2 = list(range(10, 0, -2))

r2 = np.corrcoef(x, y2)  # correlation matrix; the off-diagonal entry is r
print("Correlation Coefficient - 2: ", r2[0, 1])

Correlation Coefficient - 2:  -0.9999999999999999


In [10]:
# One hundred evenly spaced points on [0, 10] and their sine: a clearly
# non-linear relationship between x3 and y3.
x3 = np.linspace(start=0, stop=10, num=100)
y3 = np.sin(x3)

r3 = np.corrcoef(x3, y3)  # correlation matrix; the off-diagonal entry is r
print("Correlation Coefficient - 3: ", r3[0, 1])

Correlation Coefficient - 3:  -0.07589466694797196


In [11]:
# A feature and a label that rise together, though not perfectly linearly.
feature = [
    1.1,
    1.9,
    3.2,
    4.5,
    5.1,
]
label = [
    1.0,
    2.0,
    3.0,
    4.1,
    5.3,
]

# Entry (0, 1) of the 2x2 correlation matrix is the feature/label coefficient.
r4 = np.corrcoef(feature, label)[0, 1]
print("Correlation Coefficient - 4: ", r4)

Correlation Coefficient - 4:  0.9900317180760644


In [12]:
# Heights 150..190 and weights 50..90 in steps of 10; every weight equals
# height - 100, so the relationship is exactly linear.
height = list(range(150, 191, 10))
weight = list(range(50, 91, 10))

# Entry (0, 1) of the 2x2 correlation matrix is the height/weight coefficient.
r5 = np.corrcoef(height, weight)[0, 1]
print("Correlation Coefficient - 5: ", r5)

Correlation Coefficient - 5:  1.0


In [14]:
# Two four-component vectors whose components grow together.
embed_A = [
    0.3,
    0.5,
    0.7,
    0.8,
]
embed_B = [
    0.9,
    1.4,
    2.1,
    2.4,
]

# Entry (0, 1) of the 2x2 correlation matrix is the embed_A/embed_B coefficient.
r6 = np.corrcoef(embed_A, embed_B)[0, 1]
print("Correlation Coefficient - 6: ", r6)

Correlation Coefficient - 6:  0.9974174463431612


In [16]:
# Two independently drawn standard-normal samples of 100 values each.
# The draws come from a locally seeded Generator rather than the unseeded
# global RNG, so X, y and r7 are identical on every Restart & Run All.
# NOTE: this rebinds `y`, which the first cell also defines.
rng = np.random.default_rng(42)
X = rng.normal(loc=0, scale=1, size=100)
y = rng.normal(loc=0, scale=1, size=100)

r7 = np.corrcoef(X, y)[0][1]
print("Correlation Coefficient - 7: ", r7)

Correlation Coefficient - 7:  0.034315278228610115


In [18]:
# Compare a perfectly linear target with the same target plus Gaussian noise
# (mean 0, standard deviation 10).
# The noise comes from a locally seeded Generator rather than the unseeded
# global RNG, so r_x_y_noisy is identical on every Restart & Run All.
rng = np.random.default_rng(42)

x = np.arange(100)
y_clean = x  # same array object as x, not a copy
y_noisy = x + rng.normal(0, 10, 100)

r_x_y_clean = np.corrcoef(x, y_clean)[0][1]
r_x_y_noisy = np.corrcoef(x, y_noisy)[0][1]

print("Correlation Coefficient - X & y_clean: ", r_x_y_clean)
print("Correlation Coefficient - X & y_noisy: ", r_x_y_noisy)

Correlation Coefficient - X & y_clean:  0.9999999999999999
Correlation Coefficient - X & y_noisy:  0.9566415582776722


In [19]:
# Every sales figure equals 5 * temperature - 10, so the two series are
# exactly linearly related even though they are not sorted.
temperature = [
    22,
    24,
    23,
    25,
    26,
]
sales = [
    100,
    110,
    105,
    115,
    120,
]

# Entry (0, 1) of the 2x2 correlation matrix is the temperature/sales coefficient.
r9 = np.corrcoef(temperature, sales)[0, 1]
print("Correlation Coefficient - 9: ", r9)

Correlation Coefficient - 9:  1.0


In [21]:
# Toy corpus: three short documents to be ranked against the query.
doc1, doc2, doc3 = (
    "deep learning for natural language processing",
    "transformer models improve language understanding",
    "convolutional neural networks for image classification",
)

# Free-text query the documents are scored against.
query = "language models for text understanding"

In [20]:
import numpy as np
import math
from collections import Counter

In [29]:
# Text preprocessing
def tokenize(text):
    """Lower-case ``text`` and split it into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens; an empty or all-whitespace string
        gives an empty list.
    """
    lowered = text.lower()
    return lowered.split()


# The query is appended after the documents so that it is tokenized with them
# and contributes to the same vocabulary.
docs = [doc1, doc2, doc3]
all_docs = [*docs, query]
tokenized_docs = list(map(tokenize, all_docs))

# Build the vocabulary: every distinct word across all texts, sorted
vocab = sorted({word for tokens in tokenized_docs for word in tokens})

# NOTE(review): vocab is already de-duplicated, so every count printed here
# is 1. If corpus-wide word frequencies were intended, count the flattened
# tokens of tokenized_docs instead.
count = Counter(vocab)
print(count)


# Compute TF
def compute_tf(doc_tokens, vocabulary=None):
    """Return the term-frequency vector of one tokenized document.

    Args:
        doc_tokens: Sequence of tokens making up the document.
        vocabulary: Ordered iterable of words defining the vector's
            dimensions. Defaults to the module-level ``vocab``.

    Returns:
        A list with one float per vocabulary word: the number of times the
        word occurs in the document divided by the document's token count.
        An empty document yields a vector of zeros.
    """
    # Materialize once so a one-shot iterable can still be traversed below.
    words = list(vocab if vocabulary is None else vocabulary)
    total = len(doc_tokens)
    if total == 0:
        # No tokens means no word has any frequency; returning zeros avoids
        # the ZeroDivisionError the division below would raise.
        return [0.0 for _ in words]
    counts = Counter(doc_tokens)
    return [counts[word] / total for word in words]


# Compute IDF
def compute_idf(docs_tokens, vocabulary=None):
    """Return the inverse-document-frequency weight of every vocabulary word.

    The weight of a word is ``log(N / (1 + df)) + 1``, where ``N`` is the
    number of documents and ``df`` is the number of documents containing the
    word. Because of the ``1 + df`` denominator, a word present in every
    document gets a weight slightly below 1.

    Args:
        docs_tokens: Sized collection of tokenized documents (each a
            sequence of hashable tokens).
        vocabulary: Ordered iterable of words to weight. Defaults to the
            module-level ``vocab``.

    Returns:
        A list of floats, one per vocabulary word, in vocabulary order.

    Raises:
        ValueError: If there are words to weight but no documents, since
            the logarithm of zero is undefined.
    """
    words = list(vocab if vocabulary is None else vocabulary)
    if not words:
        return []

    n_docs = len(docs_tokens)
    if n_docs == 0:
        raise ValueError("compute_idf requires at least one document")

    # Build one set per document up front so each membership test below is a
    # hash lookup instead of a linear scan of the token list.
    doc_sets = [set(doc) for doc in docs_tokens]

    idf = []
    for word in words:
        df = sum(word in tokens for tokens in doc_sets)
        idf.append(math.log(n_docs / (1 + df)) + 1)
    return idf


# Compute TF-IDF
def compute_tfidf(tf, idf):
    """Multiply term frequencies by IDF weights, position by position.

    Args:
        tf: Iterable of term-frequency values.
        idf: Iterable of IDF weights, aligned with ``tf``.

    Returns:
        A list of the pairwise products. The pairing stops at the shorter
        of the two inputs.
    """
    weighted = []
    for tf_value, idf_value in zip(tf, idf):
        weighted.append(tf_value * idf_value)
    return weighted


# One TF vector per text, one shared IDF vector, then their TF-IDF products.
tf_vectors = list(map(compute_tf, tokenized_docs))
idf_vector = compute_idf(tokenized_docs)
tfidf_vectors = [
    compute_tfidf(tf_vector, idf_vector) for tf_vector in tf_vectors
]

# Get the query and document vectors: the query is the last entry, the
# documents are everything before it.
query_vector = np.array(tfidf_vectors[-1])
doc_vectors = list(map(np.array, tfidf_vectors[:-1]))


# Compute the Pearson correlation coefficient (np.corrcoef)
def compute_pearson_corr(x, y):
    """Return the Pearson correlation of ``x`` and ``y``, or 0 if either is constant.

    Args:
        x: First sequence of values.
        y: Second sequence of values, the same length as ``x``.

    Returns:
        Entry (0, 1) of ``np.corrcoef(x, y)``, or the integer 0 when the
        standard deviation of ``x`` or of ``y`` is zero.
    """
    both_vary = np.std(x) != 0 and np.std(y) != 0
    if both_vary:
        return np.corrcoef(x, y)[0, 1]
    # A zero standard deviation would make the coefficient undefined.
    return 0


# Compute each document's correlation with the query, then rank
correlations = [compute_pearson_corr(query_vector, vec) for vec in doc_vectors]

# Pair every document with its score and order the pairs best-first.
results = list(zip(docs, correlations))
results.sort(key=lambda pair: pair[1], reverse=True)

# Results
for i, (doc, score) in enumerate(results, start=1):
    print(f"{i}. Score: {score:.4f} -> '{doc}'")

Counter({'classification': 1, 'convolutional': 1, 'deep': 1, 'for': 1, 'image': 1, 'improve': 1, 'language': 1, 'learning': 1, 'models': 1, 'natural': 1, 'networks': 1, 'neural': 1, 'processing': 1, 'text': 1, 'transformer': 1, 'understanding': 1})
1. Score: 0.2506 -> 'transformer models improve language understanding'
2. Score: -0.2041 -> 'deep learning for natural language processing'
3. Score: -0.3628 -> 'convolutional neural networks for image classification'
