In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Make sure every NLTK resource this cell relies on is installed *before* it is
# used: sent_tokenize/word_tokenize need the Punkt models ("punkt", or
# "punkt_tab" on recent NLTK releases), WordNetLemmatizer needs WordNet
# ("omw-1.4" is additionally required by some NLTK versions), and the
# stop-word list needs the "stopwords" corpus. Packages that are already
# up to date are not downloaded again.
for resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4", "stopwords"):
    nltk.download(resource, quiet=True)

# Read the whole biography file into memory
with open("sanatcilar_biyografi.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Split the raw text into sentences
sentences = sent_tokenize(text)

# Create the lemmatizer and the stemmer used by preprocess_sentence
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# English stop words, stored as a set for fast membership tests
# (switch to 'turkish' if the input text is Turkish)
stop_words = set(stopwords.words('english'))

# Kelimeleri tokenleştirip, lemmatize etme ve stemleme
def preprocess_sentence(sentence):
    """Tokenize one sentence and return its lemmatized and stemmed tokens.

    The sentence is split with ``word_tokenize``. A token is kept only when
    ``str.isalpha()`` is true for it and its lower-cased form is not in the
    module-level ``stop_words`` set; kept tokens are lower-cased.

    Args:
        sentence: The sentence text to process.

    Returns:
        A ``(lemmatized_tokens, stemmed_tokens)`` tuple of two lists. Both are
        derived from the same filtered tokens, so they have equal length.
    """
    kept = []
    for raw_token in word_tokenize(sentence):
        # Drop punctuation, numbers and anything else that is not purely alphabetic
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)

    # Lemmas and stems are both computed from the filtered tokens, not from each other
    lemmas = list(map(lemmatizer.lemmatize, kept))
    stems = list(map(stemmer.stem, kept))
    return lemmas, stems

# Preprocess every sentence once, then split the (lemmas, stems) pairs into
# two parallel corpora (one token list per sentence in each).
processed_pairs = [preprocess_sentence(sentence) for sentence in sentences]
tokenized_corpus_lemmatized = [lemmas for lemmas, _ in processed_pairs]
tokenized_corpus_stemmed = [stems for _, stems in processed_pairs]

# Preview at most the first five sentences in both forms
preview = zip(tokenized_corpus_lemmatized[:5], tokenized_corpus_stemmed[:5])
for number, (lemmas, stems) in enumerate(preview, start=1):
    print(f"Cümle {number} - Lemmatized: {lemmas}")
    print(f"Cümle {number} - Stemmed: {stems}")
    print("\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\esra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cümle 1 - Lemmatized: ['frida', 'kahlo', 'magdalena', 'carmen', 'frida', 'kahlo', 'calderón', 'spanish', 'pronunciation', 'ˈfɾiða', 'ˈkalo', 'july', 'july', 'mexican', 'painter', 'known', 'many', 'portrait', 'work', 'inspired', 'nature', 'artifact', 'mexico']
Cümle 1 - Stemmed: ['frida', 'kahlo', 'magdalena', 'carmen', 'frida', 'kahlo', 'calderón', 'spanish', 'pronunci', 'ˈfɾiða', 'ˈkalo', 'juli', 'juli', 'mexican', 'painter', 'known', 'mani', 'portrait', 'work', 'inspir', 'natur', 'artifact', 'mexico']


Cümle 2 - Lemmatized: ['inspired', 'country', 'popular', 'culture', 'employed', 'naïve', 'folk', 'art', 'style', 'explore', 'question', 'identity', 'postcolonialism', 'gender', 'class', 'race', 'mexican', 'society']
Cümle 2 - Stemmed: ['inspir', 'countri', 'popular', 'cultur', 'employ', 'naïv', 'folk', 'art', 'style', 'explor', 'question', 'ident', 'postcoloni', 'gender', 'class', 'race', 'mexican', 'societi']


Cümle 3 - Lemmatized: ['painting', 'often', 'strong', 'autobiographical',

In [3]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Make sure every NLTK resource this cell relies on is installed *before* it is
# used: sent_tokenize/word_tokenize need the Punkt models ("punkt", or
# "punkt_tab" on recent NLTK releases), WordNetLemmatizer needs WordNet
# ("omw-1.4" is additionally required by some NLTK versions), and the
# stop-word list needs the "stopwords" corpus. Packages that are already
# up to date are not downloaded again.
for resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4", "stopwords"):
    nltk.download(resource, quiet=True)

# The corpus is the artist-biography file (used here instead of Wikipedia);
# read its whole content into memory
with open("sanatcilar_biyografi.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Split the raw text into sentences
sentences = sent_tokenize(text)

# Create the lemmatizer and the stemmer used by preprocess_sentence
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# English stop words, stored as a set for fast membership tests
# (switch to 'turkish' if the input text is Turkish)
stop_words = set(stopwords.words('english'))

# 🔹 Kelimeleri işleme fonksiyonu
def preprocess_sentence(sentence):
    """Tokenize one sentence and return its lemmatized and stemmed tokens.

    The sentence is split with ``word_tokenize``. A token is kept only when
    ``str.isalpha()`` is true for it and its lower-cased form is not in the
    module-level ``stop_words`` set; kept tokens are lower-cased.

    Args:
        sentence: The sentence text to process.

    Returns:
        A ``(lemmatized_tokens, stemmed_tokens)`` tuple of two lists. Both are
        derived from the same filtered tokens, so they have equal length.
    """
    kept = []
    for raw_token in word_tokenize(sentence):
        # Drop punctuation, numbers and anything else that is not purely alphabetic
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)

    # Lemmas and stems are both computed from the filtered tokens, not from each other
    lemmas = list(map(lemmatizer.lemmatize, kept))
    stems = list(map(stemmer.stem, kept))
    return lemmas, stems

# Preprocess every sentence once, then split the (lemmas, stems) pairs into
# the two parallel training corpora (one token list per sentence in each).
processed_pairs = [preprocess_sentence(sentence) for sentence in sentences]
tokenized_corpus_lemmatized = [lemmas for lemmas, _ in processed_pairs]
tokenized_corpus_stemmed = [stems for _, stems in processed_pairs]

# 📌 Word2Vec modeli eğitmek için parametreler
# Full grid of Word2Vec settings: 2 vector sizes x 2 window sizes x 2
# architectures = 8 configurations. The loop nesting keeps the ordering
# "architecture varies fastest, vector size slowest".
parameters = [
    {'model_type': model_type, 'window': window, 'vector_size': vector_size}
    for vector_size in (100, 300)
    for window in (2, 4)
    for model_type in ('cbow', 'skipgram')
]

# 🔹 Word2Vec modelini eğitme ve kaydetme fonksiyonu
def train_and_save_model(corpus, params, model_name):
    """Train one Word2Vec model on ``corpus`` and save it to disk.

    Args:
        corpus: Iterable of tokenized sentences (each a list of string tokens).
        params: Dict with the required keys ``'model_type'`` (``'cbow'`` or
            ``'skipgram'``), ``'window'`` and ``'vector_size'``. The optional
            keys ``'min_count'`` (default 1), ``'seed'``, ``'workers'`` and
            ``'epochs'`` are forwarded to ``Word2Vec`` when present.
        model_name: Prefix of the output file name.

    Returns:
        The trained ``Word2Vec`` model. It is also saved in the current
        working directory as
        ``{model_name}_{model_type}_window{window}_dim{vector_size}.model``.

    Raises:
        ValueError: If ``params['model_type']`` is neither ``'cbow'`` nor
            ``'skipgram'``.
    """
    sg_by_type = {'cbow': 0, 'skipgram': 1}
    model_type = params['model_type']
    # Reject unknown architectures explicitly: otherwise a typo such as
    # 'skip-gram' would silently train CBOW and be saved under a misleading name.
    if model_type not in sg_by_type:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(sg_by_type)}"
        )

    # Optional training knobs are passed through only when the caller sets them,
    # so the library defaults stay in effect otherwise.
    # NOTE(review): a fixed 'seed' alone may not make runs reproducible when
    # several worker threads are used - confirm against the gensim docs.
    optional = {key: params[key] for key in ('seed', 'workers', 'epochs') if key in params}

    model = Word2Vec(
        corpus,
        vector_size=params['vector_size'],
        window=params['window'],
        min_count=params.get('min_count', 1),
        sg=sg_by_type[model_type],
        **optional,
    )

    # Build the file stem once so the saved file and the log line cannot drift apart
    base_name = f"{model_name}_{model_type}_window{params['window']}_dim{params['vector_size']}"
    model.save(f"{base_name}.model")
    print(f"{base_name} model saved!")
    return model

# Train and save every configuration, first on the lemmatized corpus and then
# on the stemmed corpus; the file-name prefix records which corpus was used.
corpora_by_prefix = (
    ("lemmatized_model", tokenized_corpus_lemmatized),
    ("stemmed_model", tokenized_corpus_stemmed),
)
for name_prefix, token_lists in corpora_by_prefix:
    for param in parameters:
        train_and_save_model(token_lists, param, name_prefix)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\esra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


lemmatized_model_cbow_window2_dim100 model saved!
lemmatized_model_skipgram_window2_dim100 model saved!
lemmatized_model_cbow_window4_dim100 model saved!
lemmatized_model_skipgram_window4_dim100 model saved!
lemmatized_model_cbow_window2_dim300 model saved!
lemmatized_model_skipgram_window2_dim300 model saved!
lemmatized_model_cbow_window4_dim300 model saved!
lemmatized_model_skipgram_window4_dim300 model saved!
stemmed_model_cbow_window2_dim100 model saved!
stemmed_model_skipgram_window2_dim100 model saved!
stemmed_model_cbow_window4_dim100 model saved!
stemmed_model_skipgram_window4_dim100 model saved!
stemmed_model_cbow_window2_dim300 model saved!
stemmed_model_skipgram_window2_dim300 model saved!
stemmed_model_cbow_window4_dim300 model saved!
stemmed_model_skipgram_window4_dim300 model saved!
