In [1]:
!pip install gensim
import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
# Duplicate of the gensim downloader import in the first cell (harmless).
import gensim.downloader as api

# Load the "glove-wiki-gigaword-50" model through the gensim downloader.
# Later cells use `model` for word lookups (`w in model`, `model[w]`) and
# for `model.vector_size`.
model = api.load("glove-wiki-gigaword-50")  # only 66MB (original author's note)




In [8]:
def get_sentence_vector(text, model):
    """Average the embeddings of the in-vocabulary words of ``text``.

    The text is lower-cased and split on whitespace; every token for which
    ``token in model`` is false is skipped.

    Args:
        text: Input string to embed.
        model: Object supporting ``token in model``, ``model[token]`` and
            exposing ``vector_size``.

    Returns:
        The element-wise mean of the vectors that were found, or a zero
        vector of length ``model.vector_size`` when no token was found.
    """
    known = [model[token] for token in text.lower().split() if token in model]
    if not known:
        # No token is in the vocabulary: fall back to the zero vector.
        return np.zeros(model.vector_size)
    # Mean over the word axis yields a single sentence-level vector.
    return np.mean(known, axis=0)


In [9]:
def cosine_similarity_safe(vec1, vec2):
    """Cosine similarity that tolerates zero-length inputs.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        ``dot(vec1, vec2) / (‖vec1‖ * ‖vec2‖)``, or ``0.0`` when either
        vector has norm 0 (the cosine is undefined in that case).
    """
    length_a = np.linalg.norm(vec1)
    length_b = np.linalg.norm(vec2)
    if length_a != 0 and length_b != 0:
        return np.dot(vec1, vec2) / (length_a * length_b)
    # At least one input is a zero vector: report 0 instead of dividing by 0.
    return 0.0


In [10]:
df = pd.read_csv("sanatcilar_biyografi_lemmatized.csv")

# The text column is named 'Lemmatized Words'; build one embedding per row from it.
# Missing cells (NaN after read_csv) are embedded as empty text, which
# get_sentence_vector maps to the zero vector. Passing them through str()
# would instead look up the literal token "nan" in the model.
# NOTE(review): the token-count check later in this notebook reports exactly
# one token per row, so each "sentence vector" is a single word vector.
# Confirm the CSV is meant to hold one lemma per row.
df['Vector'] = df['Lemmatized Words'].apply(
    lambda x: get_sentence_vector("" if pd.isna(x) else str(x), model)
)


In [11]:
def get_top5_similar(input_vector, df, top_n=5):
    """Rank the rows of ``df`` by cosine similarity to ``input_vector``.

    Args:
        input_vector: Query embedding, compared against every entry of
            ``df['Vector']`` with ``cosine_similarity_safe``.
        df: DataFrame with a 'Vector' column (one embedding per row) and a
            'Lemmatized Words' column.
        top_n: How many of the best matches to return. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        A DataFrame with the 'Lemmatized Words' and 'Similarity' columns of
        the ``top_n`` most similar rows, most similar first.

    Side effects:
        Adds (or overwrites) a 'Similarity' column on the ``df`` passed in.
    """
    # One score per row, in row order, stored on the caller's frame.
    df['Similarity'] = [
        cosine_similarity_safe(input_vector, vec) for vec in df['Vector']
    ]
    ranked = df.sort_values(by='Similarity', ascending=False)
    return ranked.head(top_n)[['Lemmatized Words', 'Similarity']]


In [12]:
# Example query: embed a free-text description with the same averaging
# function that was used for the rows of df.
input_text = "Frida Kahlo ressam Meksikalı"
input_vector = get_sentence_vector(input_text, model)

# Rank every row of df against the query and show the five best matches.
# Note that get_top5_similar also writes a 'Similarity' column onto df.
top5 = get_top5_similar(input_vector, df)
print(top5)


       Lemmatized Words  Similarity
0                 frida    0.883045
161653            frida    0.883045
86                frida    0.883045
400               frida    0.883045
129386            frida    0.883045


In [13]:
# Sanity check: summary statistics of the number of whitespace-separated
# tokens in each row of 'Lemmatized Words' (values are passed through str()).
print(df['Lemmatized Words'].apply(lambda x: len(str(x).split())).describe())


count    197614.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: Lemmatized Words, dtype: float64


In [14]:
# List the columns currently present on df.
print(df.columns)


Index(['Lemmatized Words', 'Vector', 'Similarity'], dtype='object')
