In [30]:
"""
Created on Tue Jan 16 23:47:38 2024

@author: tahat
"""

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from collections import Counter
from subprocess import check_output 

In [31]:
# Load the spreadsheet and pull the 'Cümle' column out as a plain Python list;
# every later cell works from this list.
df = pd.read_excel('output_cleaned_no_empty_rows.xlsx')
sentences = df.loc[:, 'Cümle'].to_list()

In [32]:
# Whitespace-tokenise every sentence. A non-string cell (e.g. a missing value)
# becomes an empty token list, so `corpus` keeps exactly one entry per row of
# `sentences`.
corpus = [
    sentence.split() if isinstance(sentence, str) else []
    for sentence in sentences
]

# Sanity check: the two lengths printed below must be equal.
print(len(corpus))
print(len(sentences))

11744
11744


In [33]:
# Train a Word2Vec model on the tokenised corpus and persist it to disk.
# sg=1 selects the skip-gram training algorithm (see the gensim docs).
model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,
)
model.save('word2vec.model')

In [34]:
# Count word frequencies over the whole corpus and keep the 20 most common.
# Only string cells are joined: converting everything with str() would turn a
# missing value into the literal token "nan" and count it as a word.
preprocessed_corpus = " ".join(
    sentence for sentence in sentences if isinstance(sentence, str)
)
word_frequencies = Counter(preprocessed_corpus.split())
top_words = word_frequencies.most_common(20)

In [35]:
# Report: a heading line followed by one "<word>: <count> kez" line per entry
# of top_words.
report_lines = ["En Fazla Geçen 20 Kelime:\n"]
report_lines.extend(f"{word}: {frequency} kez\n" for word, frequency in top_words)
report = "".join(report_lines)

print(report)

En Fazla Geçen 20 Kelime:
movie: 2101 kez
like: 1281 kez
film: 900 kez
oppenheimer: 895 kez
really: 730 kez
nolan: 690 kez
scene: 649 kez
think: 619 kez
bomb: 605 kez
one: 602 kez
felt: 597 kez
time: 520 kez
people: 514 kez
didnt: 502 kez
much: 482 kez
see: 477 kez
would: 472 kez
im: 434 kez
also: 431 kez
good: 418 kez



In [36]:
# For each of the most frequent words, look up its 5 nearest neighbours in the
# trained model and collect everything into one printable report.
report_parts = []
for rank, (word, frequency) in enumerate(top_words, start=1):
    report_parts.append(f"\nEn Benzer {rank}. kelime '{word}' ({frequency} kez geçiyor):\n")

    try:
        neighbours = model.wv.most_similar(word, topn=5)
    except KeyError:
        # The word is not in the model's vocabulary.
        report_parts.append("Bu kelimenin Word2Vec modelde karşılığı yok.\n")
    else:
        report_parts.extend(
            f"{neighbour}: Benzerlik Skoru - {score}\n"
            for neighbour, score in neighbours
        )

similar_words_report = "".join(report_parts)

# Print the results
print(similar_words_report)



En Benzer 1. kelime 'movie' (2101 kez geçiyor):
boring: Benzerlik Skoru - 0.9614768624305725
honestly: Benzerlik Skoru - 0.9590595364570618
fast: Benzerlik Skoru - 0.9555848836898804
overall: Benzerlik Skoru - 0.9549501538276672
film: Benzerlik Skoru - 0.9532986879348755

En Benzer 2. kelime 'like' (1281 kez geçiyor):
feels: Benzerlik Skoru - 0.8877969980239868
trailer: Benzerlik Skoru - 0.8835818767547607
little: Benzerlik Skoru - 0.8826219439506531
feel: Benzerlik Skoru - 0.8741145133972168
bit: Benzerlik Skoru - 0.8673768639564514

En Benzer 3. kelime 'film' (900 kez geçiyor):
probably: Benzerlik Skoru - 0.9756174683570862
heard: Benzerlik Skoru - 0.9710142016410828
worst: Benzerlik Skoru - 0.9708309769630432
favorite: Benzerlik Skoru - 0.968697726726532
honestly: Benzerlik Skoru - 0.9685655832290649

En Benzer 4. kelime 'oppenheimer' (895 kez geçiyor):
communist: Benzerlik Skoru - 0.9542362689971924
perspective: Benzerlik Skoru - 0.9382928609848022
lewis: Benzerlik Skoru - 0.93825

In [37]:
# The five selected sentences, addressed by row position in the data frame
selected_rows = [3628, 5202, 5904, 7781, 7927]
selected_sentences = df['Cümle'].iloc[selected_rows].tolist()

In [38]:
# Tag every sentence with its position in `sentences` (as a string) so a tag
# returned by the model can be mapped back to its sentence.
# A non-string cell (e.g. a missing value) has no .lower(); give it an empty
# token list rather than crashing, and still emit its tag so the positions
# stay aligned with `sentences`.
tagged_data = [
    TaggedDocument(
        words=word_tokenize(sentence.lower()) if isinstance(sentence, str) else [],
        tags=[str(idx)],
    )
    for idx, sentence in enumerate(sentences)
]

# Train the Doc2Vec model.
# NOTE(review): this rebinds `model`, which an earlier cell assigned to the
# Word2Vec model; cells that expect the Word2Vec model must run before this one.
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Save the model (optional)
model.save("doc2vec_model")

In [39]:

# For each selected row, infer a vector for its sentence and print the five
# most similar training sentences.
for selected_row in selected_rows:
    # Positional lookup in `sentences`: the document tags returned below are
    # used as indices into this same list, so both sides of the comparison are
    # addressed the same way (a label-based df.loc lookup only agrees with this
    # while the frame has its default 0..n-1 index).
    selected_sentence = sentences[selected_row]

    # Infer a vector for the query sentence and find its nearest documents
    query_vector = model.infer_vector(word_tokenize(selected_sentence.lower()))
    similar_sentences = model.dv.most_similar([query_vector], topn=5)

    # Show the selected sentence followed by its most similar sentences
    print(f"\nTeslim edilen derlemedeki {selected_row}. cümle: {selected_sentence}")
    print("Bu cümleye benzeyen cümleler:")
    for idx, (sentence_index, similarity_score) in enumerate(similar_sentences, start=1):
        similar_sentence = sentences[int(sentence_index)]
        print(f"En benzer {idx}. cümle (Benzerlik Skoru: {similarity_score}): {similar_sentence}")

# The former post-loop block repeated the query for the last selected sentence
# only (via the leaked loop variable) and printed a second, unlabelled result
# list; it has been removed as copy-paste residue.


Teslim edilen derlemedeki 3628. cümle: still good movie wasnt horrifying visually awesome hoping
Bu cümleye benzeyen cümleler:
En benzer 1. cümle (Benzerlik Skoru: 0.8060975670814514): negative sounding movie enjoyed thought good every time see nolan movie think hes one idea away making something great instead makes good movies still leagues ahead average movie hitting theater
En benzer 2. cümle (Benzerlik Skoru: 0.7529678344726562): theres three maybe four really good movies could expand upon throughout nolans trying tell point left reeling times
En benzer 3. cümle (Benzerlik Skoru: 0.7258433699607849): think best rdj performance need watch chaplin best performance
En benzer 4. cümle (Benzerlik Skoru: 0.7154932618141174): think overall good movie im completely honest wasnt invested first acts
En benzer 5. cümle (Benzerlik Skoru: 0.7149530649185181): watched movie felt like best christopher nolan movies years felt massive many actors yet felt intimate movie peeled layer oppenheimer we