In [None]:
import pandas as pd

# Load the dataset. The code below reads column 'A' as whitespace-separated
# text and column 'B' as the label ("etiket") of each row.
df = pd.read_excel("/content/zuli003.xlsx")

# Build the vocabulary from every whitespace-separated token in column 'A'.
corpus = df['A'].str.cat(sep=' ')
word_list = corpus.split()
word_set = set(word_list)
# Sort the vocabulary: iterating a set of strings yields an order that can
# differ between interpreter runs (string hash randomisation), which would
# reshuffle the columns and the word -> index mapping on every kernel restart.
words = sorted(word_set)
word_dict = {word: index for index, word in enumerate(words)}

# Bag-of-words representation: one row per document, one column per word.
# Rows are collected in a list and the frame is built once, instead of
# enlarging the DataFrame one row at a time inside the loop.
bow_rows = []
for text in df['A']:
    word_count = dict.fromkeys(words, 0)
    for word in text.split():
        word_count[word] += 1
    bow_rows.append(word_count)

bow_representation = pd.DataFrame(bow_rows, columns=words, dtype=int)

# Keep the labels next to the features.
# NOTE(review): a token literally equal to 'etiket' would be overwritten here.
bow_representation['etiket'] = df['B']

# Optional export (disabled):
#bow_representation.to_excel("bow_representation.xlsx", index=False)

In [None]:
# Preview the first five rows of the bag-of-words table as plain text.
print(bow_representation.head(n=5))

   değil  avrupa  50  2022  arap  29  müdür  reddet  2  aile  ...  al  ulus  \
0      0       1   0     0     0   0      0       1  0     0  ...   0     0   
1      0       0   0     0     0   0      0       0  0     0  ...   0     0   
2      0       0   0     0     0   0      0       0  0     0  ...   0     0   
3      0       0   0     1     0   0      0       0  0     0  ...   0     0   
4      0       0   1     0     0   0      1       0  0     1  ...   0     0   

   polis  göre  program  0000  öjenik  para  vatan  etiket  
0      1     0        0     0       0     0      0       0  
1      0     0        0     1       0     0      0       0  
2      0     1        0     0       0     0      0       0  
3      0     0        1     0       0     0      0       0  
4      0     0        0     0       0     1      0       1  

[5 rows x 92 columns]


In [None]:
import numpy as np

# Co-occurrence matrix: entry [a, b] is the number of (position, position)
# token pairs within the same document whose words are a and b. Every token
# is also paired with itself, so the diagonal is non-zero for each word that
# occurs.
#
# Step 1: per-document word counts (documents x vocabulary). Tokens missing
# from word_dict are skipped.
doc_term_counts = np.zeros((len(df['A']), len(words)), dtype=int)
for row, text in enumerate(df['A']):
    for word in text.split():
        if word in word_dict:
            doc_term_counts[row, word_dict[word]] += 1

# Step 2: summing count(a) * count(b) over documents gives the same totals as
# looping over every pair of token positions, in a single matrix product.
co_occurrence_matrix = doc_term_counts.T @ doc_term_counts

# Optional export (disabled):
#np.savetxt("co_occurrence_matrix.csv", co_occurrence_matrix, delimiter=",")

print(co_occurrence_matrix)


[[1 0 0 ... 0 0 1]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 1 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 1 ... 0 1 0]
 [1 0 0 ... 0 0 1]]


In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# Split the dataset into the raw texts and their labels.
texts, labels = df['A'], df['B']

# Hash every text into a fixed-width vector; n_features sets the vector size.
vectorizer = HashingVectorizer(n_features=1000)
hashed_features = vectorizer.transform(texts)

# Densify the hashed vectors into a DataFrame and attach the labels as the
# 'etiket' column in the same step.
hashed_df = pd.DataFrame(hashed_features.toarray()).assign(etiket=labels)

# Optional export (disabled):
#hashed_df.to_csv("hashed_representation.csv", index=False)
print(hashed_df.head(n=5))

     0    1    2    3    4    5    6    7    8    9  ...  991  992  993  994  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   995  996  997       998  999  etiket  
0  0.0  0.0  0.0  0.000000  0.0       0  
1  0.0  0.0  0.0  0.000000  0.0       0  
2  0.0  0.0  0.0  0.182574  0.0       0  
3  0.0  0.0  0.0  0.000000  0.0       0  
4  0.0  0.0  0.0  0.000000  0.0       1  

[5 rows x 1001 columns]


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Pull the label column out of the dataset.
labels = df['B']

# Reshape the labels into a single column (n_samples x 1) before encoding.
label_column = labels.values.reshape(len(labels), 1)

# Fit the one-hot encoder on the labels, then encode them.
encoder = OneHotEncoder()
encoder.fit(label_column)
encoded_features = encoder.transform(label_column)

# Dense DataFrame with one column per encoded category.
one_hot_columns = encoder.get_feature_names_out()
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=one_hot_columns)

# Show the result.
print(encoded_df)

   x0_0  x0_1
0   1.0   0.0
1   1.0   0.0
2   1.0   0.0
3   1.0   0.0
4   0.0   1.0
5   1.0   0.0
6   1.0   0.0
7   1.0   0.0
8   1.0   0.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Documents to vectorise.
texts = df['A']

# Fit the TF-IDF vectoriser on the texts and transform them in one call.
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(texts)

# Dense DataFrame with one column per feature name reported by the vectoriser.
tfidf_matrix = tfidf_features.toarray()
tfidf_terms = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix, columns=tfidf_terms)

# Show the first five rows.
print(tfidf_df.head(n=5))

       0000  05000630        12  15001630      2022   29        50      aihm  \
0  0.000000  0.000000  0.000000       0.0  0.000000  0.0  0.000000  0.453831   
1  0.355955  0.355955  0.000000       0.0  0.000000  0.0  0.000000  0.000000   
2  0.000000  0.000000  0.188044       0.0  0.000000  0.0  0.000000  0.000000   
3  0.000000  0.000000  0.000000       0.0  0.350951  0.0  0.000000  0.000000   
4  0.000000  0.000000  0.000000       0.0  0.000000  0.0  0.323774  0.000000   

       aile   al  ...  zannet  ziya    çelenk     çocuk  çık  öjenik  \
0  0.000000  0.0  ...     0.0   0.0  0.000000  0.000000  0.0     0.0   
1  0.000000  0.0  ...     0.0   0.0  0.000000  0.000000  0.0     0.0   
2  0.000000  0.0  ...     0.0   0.0  0.000000  0.564131  0.0     0.0   
3  0.000000  0.0  ...     0.0   0.0  0.000000  0.000000  0.0     0.0   
4  0.323774  0.0  ...     0.0   0.0  0.323774  0.000000  0.0     0.0   

    öğrenci     öğret       ırk     şekil  
0  0.000000  0.000000  0.147235  0.000000 