In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pandas as pd

# Fetch the NLTK resources used by the preprocessing code below.
# 'punkt_tab' is what sent_tokenize/word_tokenize load on NLTK >= 3.8.2
# (the pickled 'punkt' models are no longer used there); 'punkt' is still
# requested so that older NLTK releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def load_text(file_path):
    """Read the file at ``file_path`` as UTF-8 and return its whole content as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def preprocess_sentences(text):
    """Split Turkish text into sentences and return cleaned token lists.

    Every sentence is lower-cased using Turkish casing rules, tokenized, and
    stripped of non-alphabetic tokens and Turkish stopwords. The surviving
    tokens are then passed through ``WordNetLemmatizer`` and ``PorterStemmer``.

    Args:
        text: Raw document text.

    Returns:
        A ``(lemmatized_sentences, stemmed_sentences)`` tuple. Both lists have
        one entry per detected sentence; each entry is a list of tokens and
        may be empty when every token of the sentence was filtered out.
    """
    # Turkish stopword list shipped with NLTK.
    stop_words = set(stopwords.words('turkish'))
    # NOTE(review): WordNetLemmatizer and PorterStemmer are English-language
    # tools, so on Turkish tokens the lemmatizer is mostly a no-op and the
    # stemmer applies English suffix rules. They are kept so the output
    # columns stay the same; a Turkish stemmer/lemmatizer is worth considering.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Plain str.lower() turns 'İ' (U+0130) into 'i' + U+0307 (combining dot),
    # which makes isalpha() fail and silently dropped every word starting
    # with 'İ'; it also turns 'I' into 'i' rather than the Turkish 'ı'.
    # Map both capitals explicitly before lower-casing the rest.
    turkish_casing = str.maketrans({'\u0130': 'i', 'I': '\u0131'})

    lemmatized_sentences = []
    stemmed_sentences = []

    # Use the Turkish Punkt model instead of the English default so sentence
    # boundaries follow Turkish abbreviations and conventions.
    for sentence in sent_tokenize(text, language='turkish'):
        lowered = sentence.translate(turkish_casing).lower()
        tokens = word_tokenize(lowered, language='turkish')
        # Drop punctuation/numbers and stopwords.
        filtered_tokens = [w for w in tokens if w.isalpha() and w not in stop_words]

        lemmatized_sentences.append([lemmatizer.lemmatize(w) for w in filtered_tokens])
        stemmed_sentences.append([stemmer.stem(w) for w in filtered_tokens])

    return lemmatized_sentences, stemmed_sentences

# Load both corpora and run them through the shared preprocessing pipeline.
kuran_text = load_text("kuran.txt")
incil_text = load_text("incil.txt")

kuran_lemma_sentences, kuran_stem_sentences = preprocess_sentences(kuran_text)
incil_lemma_sentences, incil_stem_sentences = preprocess_sentences(incil_text)

# One row per sentence; each token list is flattened into a space-separated string.
df_kuran = pd.DataFrame({
    'Lemmatized': list(map(' '.join, kuran_lemma_sentences)),
    'Stemmed': list(map(' '.join, kuran_stem_sentences)),
})

df_incil = pd.DataFrame({
    'Lemmatized': list(map(' '.join, incil_lemma_sentences)),
    'Stemmed': list(map(' '.join, incil_stem_sentences)),
})

# Persist both tables as UTF-8 CSV files without the index column.
for _frame, _csv_name in (
    (df_kuran, "kuran_stopwords_cumleler.csv"),
    (df_incil, "incil_stopwords_cumleler.csv"),
):
    _frame.to_csv(_csv_name, index=False, encoding='utf-8')

print("✅ Stopwords çıkarılmış CSV dosyaları başarıyla kaydedildi: kuran_stopwords_cumleler.csv ve incil_stopwords_cumleler.csv")


In [None]:
!pip install scikit-learn

In [None]:

def load_text(file_path):
    """Return the entire content of the UTF-8 text file at ``file_path``."""
    # NOTE(review): a function with the same name and behavior is already
    # defined in the first cell; this definition shadows it when run.
    with open(file_path, 'r', encoding='utf-8') as source:
        return source.read()

kuran_text = load_text("kuran.txt")
incil_text = load_text("incil.txt")

# Print the first 500 characters of each text as a quick sanity check.
for _label, _sample in (
    ("Kur'an Metni Örneği:", kuran_text[:500]),
    ("İncil Metni Örneği:", incil_text[:500]),
):
    print(_label, _sample)


In [None]:
import pandas as pd

# Load the per-sentence CSV files and list their column names.
# NOTE(review): no cell visible in this notebook writes these two files (the
# preprocessing cell writes *_stopwords_cumleler.csv) — confirm their origin.
kuran_df = pd.read_csv('kuran_cumleler1.csv', encoding='utf-8')
incil_df = pd.read_csv('incil_cumleler1.csv', encoding='utf-8')

for _caption, _frame in (("Kuran Sütunları:", kuran_df), ("İncil Sütunları:", incil_df)):
    print(_caption, _frame.columns)


In [None]:
import pandas as pd

# Reload the sentence tables, discarding rows whose 'Lemmatized' cell is missing.
kuran_df = pd.read_csv("kuran_cumleler1.csv", encoding="utf-8").dropna(subset=['Lemmatized'])
incil_df = pd.read_csv("incil_cumleler1.csv", encoding="utf-8").dropna(subset=['Lemmatized'])


In [None]:
import pandas as pd
import gzip
import os
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the sentence tables and keep only rows that have lemmatized text.
kuran_df = pd.read_csv("kuran_cumleler1.csv", encoding="utf-8").dropna(subset=['Lemmatized'])
incil_df = pd.read_csv("incil_cumleler1.csv", encoding="utf-8").dropna(subset=['Lemmatized'])

kuran_sentences = kuran_df['Lemmatized'].to_list()
incil_sentences = incil_df['Lemmatized'].to_list()

# Each corpus gets its own vectorizer, so the two are fitted independently.
vectorizer_kuran = TfidfVectorizer()
vectorizer_incil = TfidfVectorizer()

tfidf_kuran = vectorizer_kuran.fit_transform(kuran_sentences)
tfidf_incil = vectorizer_incil.fit_transform(incil_sentences)

# NOTE(review): hard-coded absolute path; only valid on machines with a D: drive.
output_directory = "D:/"

def save_sparse_tfidf_gz(matrix, file_path):
    """Write a sparse matrix to a gzip-compressed UTF-8 text file, one row per line.

    Each line holds the stored entries of one row as space-separated
    ``column:value`` pairs, with values formatted to six decimal places.
    A row without stored entries produces an empty line.
    """
    def _format_row(row):
        # COO form exposes the row's column indices and values as parallel arrays.
        entries = row.tocoo()
        pairs = (f"{col}:{val:.6f}" for col, val in zip(entries.col, entries.data))
        return ' '.join(pairs) + '\n'

    with gzip.open(file_path, 'wt', encoding='utf-8') as out:
        out.writelines(_format_row(row) for row in matrix)

# Destination files for the two TF-IDF matrices.
kuran_file = os.path.join(output_directory, 'kuran_tfidf_sparse.csv.gz')
incil_file = os.path.join(output_directory, 'incil_tfidf_sparse.csv.gz')

for _matrix, _target in ((tfidf_kuran, kuran_file), (tfidf_incil, incil_file)):
    save_sparse_tfidf_gz(_matrix, _target)

# Report where the files were written.
print(
    "Dosyalar başarıyla kaydedildi:",
    f"Kuran: {kuran_file}",
    f"İncil: {incil_file}",
    sep="\n",
)


In [None]:
from gensim.models import Word2Vec
import os

datasets = {
    'kuran': kuran_df,
    'incil': incil_df
}
columns = ['Lemmatized', 'Stemmed']
output_dir = "D:/"

# The original loop trained eight models per dataset/column with identical
# hyperparameters, so the numbered files differed only by training noise.
# Each index now maps to a distinct configuration; index 1 keeps the original
# settings (CBOW, window=5, vector_size=100).
# NOTE(review): the alternative values (skip-gram, window=10, vector_size=300)
# are assumptions — adjust them to the grid the experiment actually calls for.
w2v_configs = [
    {'sg': sg, 'window': window, 'vector_size': vector_size}
    for sg in (0, 1)  # 0 = CBOW, 1 = skip-gram
    for window in (5, 10)
    for vector_size in (100, 300)
]

for dataset_name, df in datasets.items():
    for column in columns:
        # Tokenize once per column; every configuration trains on the same sentences.
        sentences = [sentence.split() for sentence in df[column].dropna()]

        for i, config in enumerate(w2v_configs, start=1):
            model_name = f"{dataset_name}_{column.lower()}_w2v_model_{i}.model"
            model_path = os.path.join(output_dir, model_name)

            print(f"{model_name} eğitiliyor...")

            model = Word2Vec(
                sentences,
                min_count=1,
                workers=4,
                epochs=10,
                **config
            )

            model.save(model_path)
            print(f"{model_name} kaydedildi ✅")


ImportError: cannot import name 'triu' from 'scipy.linalg.special_matrices' (C:\Users\veyse\AppData\Local\Programs\Python\Python313\Lib\site-packages\scipy\linalg\special_matrices.py)