In [1]:
import gensim
from gensim.models import Word2Vec
import pandas as pd

In [2]:
# Hyper-parameter grid: every combination of embedding size, context window
# and architecture, ordered by vector_size, then window, then model_type.
parameters = [
    {'model_type': architecture, 'window': context_window, 'vector_size': embedding_dim}
    for embedding_dim in (100, 300)
    for context_window in (2, 4)
    for architecture in ('cbow', 'skipgram')
]

In [3]:
import ast

# Read each preprocessed CSV export and immediately convert its token column,
# which is stored as the text of a Python literal, back into Python objects.
df_lemmatized = pd.read_csv(
    "C:/Users/yunus/Desktop/Bahar Dersleri/Yapay Zeka Proje/arac-arizadeseni-eslestirme/data/processed/preprocessed_data_lemmatized_only.csv"
).assign(
    comments_processed=lambda frame: frame['comments_processed'].apply(ast.literal_eval)
)
df_stemmed = pd.read_csv(
    "C:/Users/yunus/Desktop/Bahar Dersleri/Yapay Zeka Proje/arac-arizadeseni-eslestirme/data/processed/preprocessed_data_stemmed_only.csv"
).assign(
    comments_stemmed=lambda frame: frame['comments_stemmed'].apply(ast.literal_eval)
)

In [5]:
# Preview the lemmatized DataFrame (the notebook renders this bare expression).
df_lemmatized

Unnamed: 0,pdate,pdate 2,marka,cheader,cheader 2,comments,comments_processed
0,Jul 20,2017,Fiesta,Transmission Failed,"39,234 miles","Since 2011, there have been numerous issues. T...","[[since, 2011, there, have, been, numerous, is..."
1,Mar 18,2017,Fiesta,Transmission Failed,"50,013 miles","I have had this car for just over a year, but ...","[[i, have, had, this, car, for, just, over, a,..."
2,Jan 06,2017,Fiesta,Transmission Failed,"37,000 miles",I will NEVER be purchasing a Ford vehicle or i...,"[[i, will, never, be, purchasing, a, ford, veh..."
3,Oct 20,2016,Fiesta,Transmission Failed,"106,000 miles",I was reading a comment posted by another 2011...,"[[i, wa, reading, a, comment, posted, by, anot..."
4,Mar 14,2015,Fiesta SE 4 cyl,Transmission Failed,"42,000 miles",We bought this Fiesta in February 2015. We hav...,"[[we, bought, this, fiesta, in, february, 2015..."
...,...,...,...,...,...,...,...
443,Oct 01,2011,Corolla CE 1.8L,Engine Uses Excessive Oil,"95,000 miles",This is BS. I google 2002 corolla burning oil ...,"[[this, is, b], [i, google, 2002, corolla, bur..."
444,May 07,2010,Corolla S,Engine Uses Excessive Oil,"136,000 miles",I purchased a 2002 Toyota Corolla S in 2001. G...,"[[i, purchased, a, 2002, toyota, corolla, s, i..."
445,Oct 01,2007,Corolla LE 1zz-Fe,Engine Uses Excessive Oil,"93,206 miles",I purchased this 2002 Toyota Corolla new. It s...,"[[i, purchased, this, 2002, toyota, corolla, n..."
446,Oct 15,2010,Corolla,Engine Uses Excessive Oil,"90,000 miles",I bought my Corolla used from a person I knew....,"[[i, bought, my, corolla, used, from, a, perso..."


In [6]:
# Flatten each comment's list of sentences into one corpus-wide list of
# tokenized sentences (the input later handed to Word2Vec).
tokenized_corpus_lemmatized = []
for comment_sentences in df_lemmatized["comments_processed"]:
    tokenized_corpus_lemmatized.extend(comment_sentences)

tokenized_corpus_stemmed = []
for comment_sentences in df_stemmed["comments_stemmed"]:
    tokenized_corpus_stemmed.extend(comment_sentences)

In [11]:
from gensim.models import Word2Vec

def train_and_save_model(corpus, param, model_prefix, min_count=1, workers=4, seed=1):
    """Train one Word2Vec model for a parameter set and save it to disk.

    Args:
        corpus: Tokenized sentences, passed to Word2Vec as ``sentences``.
        param: Dict with the keys 'model_type' ('cbow' or 'skipgram'),
            'vector_size' and 'window'.
        model_prefix: Leading part of the output filename.
        min_count: Forwarded to Word2Vec; defaults to 1, the value that was
            previously hard-coded here.
        workers: Forwarded to Word2Vec; defaults to 4, the value that was
            previously hard-coded here.
        seed: Forwarded to Word2Vec. NOTE(review): 1 is believed to be the
            library default, so behavior is unchanged - confirm against the
            installed gensim version. Multi-worker training may still not be
            bit-for-bit reproducible.

    Returns:
        The trained Word2Vec model, so callers can use it without reloading
        the saved file.

    Raises:
        ValueError: If ``param['model_type']`` is neither 'cbow' nor
            'skipgram'. Previously any other value silently trained a
            skip-gram model and saved it under the mistyped name.
    """
    model_type = param['model_type']
    vector_size = param['vector_size']
    window = param['window']

    # Word2Vec's `sg` flag: 0 selects CBOW, 1 selects skip-gram.
    sg_by_model_type = {'cbow': 0, 'skipgram': 1}
    if model_type not in sg_by_model_type:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'cbow' or 'skipgram'"
        )
    sg = sg_by_model_type[model_type]

    model = Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg,
        seed=seed,
    )

    # The filename encodes the configuration so each run writes a distinct file.
    model_filename = f"{model_prefix}_{model_type}_vs{vector_size}_w{window}.model"
    model.save(model_filename)
    print(f"Model saved as {model_filename}")
    return model


In [12]:
# Train and save one model per parameter set: first on the lemmatized corpus,
# then on the stemmed corpus.
for corpus, model_prefix in (
    (tokenized_corpus_lemmatized, "lemmatized_model"),
    (tokenized_corpus_stemmed, "stemmed_model"),
):
    for param in parameters:
        train_and_save_model(corpus, param, model_prefix)

Model saved as lemmatized_model_cbow_vs100_w2.model
Model saved as lemmatized_model_skipgram_vs100_w2.model
Model saved as lemmatized_model_cbow_vs100_w4.model
Model saved as lemmatized_model_skipgram_vs100_w4.model
Model saved as lemmatized_model_cbow_vs300_w2.model
Model saved as lemmatized_model_skipgram_vs300_w2.model
Model saved as lemmatized_model_cbow_vs300_w4.model
Model saved as lemmatized_model_skipgram_vs300_w4.model
Model saved as stemmed_model_cbow_vs100_w2.model
Model saved as stemmed_model_skipgram_vs100_w2.model
Model saved as stemmed_model_cbow_vs100_w4.model
Model saved as stemmed_model_skipgram_vs100_w4.model
Model saved as stemmed_model_cbow_vs300_w2.model
Model saved as stemmed_model_skipgram_vs300_w2.model
Model saved as stemmed_model_cbow_vs300_w4.model
Model saved as stemmed_model_skipgram_vs300_w4.model


In [14]:
# Load the sixteen saved model files.
# train_and_save_model writes each file under a bare relative filename, so the
# same relative names are used here. The previous absolute, user-specific
# paths only pointed at those files when the working directory happened to be
# that exact folder, and they broke on any other machine.
model_1 = Word2Vec.load("lemmatized_model_cbow_vs100_w2.model")
model_2 = Word2Vec.load("lemmatized_model_skipgram_vs100_w2.model")
model_3 = Word2Vec.load("lemmatized_model_cbow_vs100_w4.model")
model_4 = Word2Vec.load("lemmatized_model_skipgram_vs100_w4.model")
model_5 = Word2Vec.load("lemmatized_model_cbow_vs300_w2.model")
model_6 = Word2Vec.load("lemmatized_model_skipgram_vs300_w2.model")
model_7 = Word2Vec.load("lemmatized_model_cbow_vs300_w4.model")
model_8 = Word2Vec.load("lemmatized_model_skipgram_vs300_w4.model")
model_9 = Word2Vec.load("stemmed_model_cbow_vs100_w2.model")
model_10 = Word2Vec.load("stemmed_model_skipgram_vs100_w2.model")
model_11 = Word2Vec.load("stemmed_model_cbow_vs100_w4.model")
model_12 = Word2Vec.load("stemmed_model_skipgram_vs100_w4.model")
model_13 = Word2Vec.load("stemmed_model_cbow_vs300_w2.model")
model_14 = Word2Vec.load("stemmed_model_skipgram_vs300_w2.model")
model_15 = Word2Vec.load("stemmed_model_cbow_vs300_w4.model")
model_16 = Word2Vec.load("stemmed_model_skipgram_vs300_w4.model")

# Print the words most similar to a query word (by default the 3 words most
# similar to 'car') together with their similarity scores.
def print_similar_words(model, model_name, word='car', topn=3):
    """Print the ``topn`` words most similar to ``word`` for one model.

    Args:
        model: Object exposing ``wv.most_similar(word, topn=...)`` that
            returns (word, score) pairs, e.g. a loaded Word2Vec model.
        model_name: Label printed in the header line for this model.
        word: Query word; defaults to 'car', the value previously hard-coded.
        topn: Number of neighbours to print; defaults to 3, the value
            previously hard-coded.

    Returns:
        None. The results are only printed.

    Note:
        Errors raised by ``most_similar`` are not caught here and propagate
        to the caller.
    """
    similarity = model.wv.most_similar(word, topn=topn)
    print(f"\n{model_name} Modeli - '{word}' ile En Benzer {topn} Kelime:")
    for neighbour, score in similarity:
        print(f"Kelime: {neighbour}, Benzerlik Skoru: {score}")
        
# Print the similar words for all 16 models.
# Each label must describe the file loaded into that variable: model_2 holds
# the lemmatized skip-gram (window 2, dim 100) model and model_3 the
# lemmatized CBOW (window 4, dim 100) model, so their labels say exactly that.
print_similar_words(model_1, "Lemmatized CBOW Window 2 Dim 100")
print_similar_words(model_2, "Lemmatized Skipgram Window 2 Dim 100")
print_similar_words(model_3, "Lemmatized CBOW Window 4 Dim 100")
print_similar_words(model_4, "lemmatized skipgram window4 dim 100")
print_similar_words(model_5, "lemmatized cbow window2 dim 300")
print_similar_words(model_6, "lemmatizedskipgramwindow2dim300")
print_similar_words(model_7, "lemmatized_cbow_window4_dim300")
print_similar_words(model_8, "lemmatized_skipgram_window4_dim300.model")
print_similar_words(model_9, "stemmed_cbow_window2_dim100")
print_similar_words(model_10, "stemmed_skipgram_window2_dim100")
print_similar_words(model_11, "stemmed_cbow_window4_dim100")
print_similar_words(model_12, "stemmed_skipgram_window4_dim100")
print_similar_words(model_13, "stemmed_cbow_window2_dim300")
print_similar_words(model_14, "stemmed_skipgram_window2_dim300")
print_similar_words(model_15, "stemmed_cbow_window4_dim300")
print_similar_words(model_16, "stemmed_skipgram_window4_dim300")


Lemmatized CBOW Window 2 Dim 100 Modeli - 'car' ile En Benzer 3 Kelime:
Kelime: vehicle, Benzerlik Skoru: 0.9996932744979858
Kelime: but, Benzerlik Skoru: 0.9995226860046387
Kelime: and, Benzerlik Skoru: 0.9995170831680298

Stemmed Skipgram Window 4 Dim 100 Modeli - 'car' ile En Benzer 3 Kelime:
Kelime: vehicle, Benzerlik Skoru: 0.9973431825637817
Kelime: truck, Benzerlik Skoru: 0.9946492910385132
Kelime: in, Benzerlik Skoru: 0.994309663772583

Lemmatized Skipgram Window 2 Dim 300 Modeli - 'car' ile En Benzer 3 Kelime:
Kelime: vehicle, Benzerlik Skoru: 0.9998424649238586
Kelime: and, Benzerlik Skoru: 0.9997907280921936
Kelime: from, Benzerlik Skoru: 0.9997629523277283

lemmatized skipgram window4 dim 100 Modeli - 'car' ile En Benzer 3 Kelime:
Kelime: time, Benzerlik Skoru: 0.9910517930984497
Kelime: first, Benzerlik Skoru: 0.9907370805740356
Kelime: vehicle, Benzerlik Skoru: 0.9885488748550415

lemmatized cbow window2 dim 300 Modeli - 'car' ile En Benzer 3 Kelime:
Kelime: and, Benzerl