Kaynak: Dr. Deniz Kılınç

Kaynak Kod: https://github.com/denopas/TextProcessing

In [1]:
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Widest cell content (in characters) pandas may show before truncating;
# set high so the long sample sentences are displayed in full.
MAX_COLWIDTH = 8000
pd.options.display.max_colwidth = MAX_COLWIDTH

# Fetch the NLTK stop-word corpus used later for the Turkish stop-word list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kodiks/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Sample documents in Turkish: eight short news-style snippets, kept exactly
# as raw text (mixed case, digits, punctuation) so the normalization step
# has something to clean.
docs = [
    'Açıklama projenin ortaklarından Rus enerji devi Gazprom dan geldi. Yıllık 63 milyar metreküp enerji',
    'ilk günündeki 20 yarış heyecanlıydı, 109 puan toplayan Türkiye, 12 ülke arasında 9. oldu ve yarış tamamlandı',
    'Cortananın yeni işletim sistemi Windows 10 un önemli bir parçası olduğunu belirten Microsoft ; Google Android ve iOS cihazlarındaki Dijital',
    'Teknoloji devi Google, Android in MMM sürümüyle birlikte bir çok sistemsel hatasının düzeltileceğini',
    'Siroz hastalığı ile ilgili detaylara dikkat çekerek, sağlıklı bir karaciğere sahip olmak hastalık için',
    'Hastalık çoğu kez yıllarca doğru tanı konmaması veya ciddiye alınmaması sebebi ile kısırlaştırıcı etki yapabiliyor, kronik ağrı,',
    'ilk 4 etaptan galibiyetle ayrılan 18 yaşındaki Razgatlıoğlu, Almanya daki yarışta 3. sırayı alarak ',
    'Helal gıda pazarı sanki 860 milyar doların üzerinde',
]

In [3]:
# Turkish stop words shipped with the NLTK 'stopwords' corpus; norm_doc
# drops every token that appears in this list.
stop_word_list = nltk.corpus.stopwords.words('turkish')

# Tokenizer instance that norm_doc uses to split a document into tokens.
WPT = nltk.WordPunctTokenizer()

# Display the stop-word list as the cell output.
stop_word_list

['acaba',
 'ama',
 'aslında',
 'az',
 'bazı',
 'belki',
 'biri',
 'birkaç',
 'birşey',
 'biz',
 'bu',
 'çok',
 'çünkü',
 'da',
 'daha',
 'de',
 'defa',
 'diye',
 'eğer',
 'en',
 'gibi',
 'hem',
 'hep',
 'hepsi',
 'her',
 'hiç',
 'için',
 'ile',
 'ise',
 'kez',
 'ki',
 'kim',
 'mı',
 'mu',
 'mü',
 'nasıl',
 'ne',
 'neden',
 'nerde',
 'nerede',
 'nereye',
 'niçin',
 'niye',
 'o',
 'sanki',
 'şey',
 'siz',
 'şu',
 'tüm',
 've',
 'veya',
 'ya',
 'yani']

In [4]:
def norm_doc(single_doc):
    """Normalize one Turkish document before vectorization.

    Steps, in order:

    1. Remove digit runs that start the text or follow a space.
    2. Delete the punctuation characters ``,`` ``.`` and ``;``.
    3. Lowercase using Turkish casing rules and strip outer whitespace.
    4. Tokenize with the module-level ``WPT`` tokenizer.
    5. Drop tokens contained in the module-level ``stop_word_list``.
    6. Re-join the remaining tokens with single spaces.

    Parameters
    ----------
    single_doc : str
        Raw document text.

    Returns
    -------
    str
        The normalized document.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions. The "^"
    # alternative also removes a number at the very beginning of the text,
    # which the space-only pattern silently kept.
    single_doc = re.sub(r"(?:^| )\d+", " ", single_doc)
    single_doc = re.sub(r"[,.;]", "", single_doc)

    # str.lower() is not Turkish-aware: it maps "I" to "i" (Turkish needs the
    # dotless "ı") and "İ" to "i" followed by the combining dot U+0307, which
    # the tokenizer then splits off as a separate token. Map both explicitly
    # before lowercasing. Trade-off: a foreign word written with a capital
    # "I" is lowercased with "ı" as well.
    single_doc = single_doc.replace("I", "ı").replace("İ", "i")
    single_doc = single_doc.lower().strip()

    tokens = WPT.tokenize(single_doc)

    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(stop_word_list)
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Rebuild the document from the surviving tokens.
    return ' '.join(filtered_tokens)

# np.vectorize wraps norm_doc so it can be applied to the whole list of
# documents in one call; the result is a NumPy array of normalized strings.
norm_docs = np.vectorize(norm_doc)
normalized_documents = norm_docs(docs)
print(normalized_documents)

['açıklama projenin ortaklarından rus enerji devi gazprom dan geldi yıllık milyar metreküp enerji'
 'ilk günündeki yarış heyecanlıydı puan toplayan türkiye ülke arasında oldu yarış tamamlandı'
 'cortananın yeni işletim sistemi windows un önemli bir parçası olduğunu belirten microsoft google android ios cihazlarındaki dijital'
 'teknoloji devi google android in mmm sürümüyle birlikte bir sistemsel hatasının düzeltileceğini'
 'siroz hastalığı ilgili detaylara dikkat çekerek sağlıklı bir karaciğere sahip olmak hastalık'
 'hastalık çoğu yıllarca doğru tanı konmaması ciddiye alınmaması sebebi kısırlaştırıcı etki yapabiliyor kronik ağrı'
 'ilk etaptan galibiyetle ayrılan yaşındaki razgatlıoğlu almanya daki yarışta sırayı alarak'
 'helal gıda pazarı milyar doların üzerinde']


In [5]:
# 1. Term counting steps
from sklearn.feature_extraction.text import CountVectorizer

# min_df=0.0 / max_df=1.0 are document-frequency bounds expressed as
# proportions; with these values no term is filtered out by frequency.
BoW_Vector = CountVectorizer(min_df=0.0, max_df=1.0)

# Learn the vocabulary from the normalized documents and count each term
# per document in one step.
BoW_Matrix = BoW_Vector.fit_transform(normalized_documents)

In [6]:
# Display the repr of the matrix produced by BoW_Vector.fit_transform.
BoW_Matrix

<8x87 sparse matrix of type '<class 'numpy.int64'>'
	with 95 stored elements in Compressed Sparse Row format>

In [8]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is converted to a plain list so `features` keeps its original type.
if hasattr(BoW_Vector, "get_feature_names_out"):
    features = list(BoW_Vector.get_feature_names_out())
else:
    features = BoW_Vector.get_feature_names()

# Vocabulary size learned by the vectorizer.
print(len(features))

87


In [9]:
# Show the first ten entries of the feature-name list.
features[:10]

['alarak',
 'almanya',
 'alınmaması',
 'android',
 'arasında',
 'ayrılan',
 'açıklama',
 'ağrı',
 'belirten',
 'bir']

In [40]:
# Fetch all features (vocabulary terms) in BoW_Vector.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(BoW_Vector, "get_feature_names_out"):
    features = list(BoW_Vector.get_feature_names_out())
else:
    features = BoW_Vector.get_feature_names()
print ("features[0]:" + features[0])
print ("features[10]:" +features[10])

# Build the dense array from the fitted vectorizer instead of calling
# BoW_Matrix.toarray() on the name being reassigned: once BoW_Matrix is a
# NumPy array it has no .toarray(), so the old form failed when the cell was
# run a second time. transform() on the same documents yields the same counts.
BoW_Matrix = BoW_Vector.transform(normalized_documents).toarray()
print(BoW_Matrix)

# Show the document-by-term matrix, one column per vocabulary term.
BoW_df = pd.DataFrame(BoW_Matrix, columns = features)
BoW_df

features[0]:alarak
features[10]:birlikte
[[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 2 0 0 0 1 1 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1
  1 0 0 0 2 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 1 1 0 0 0 0 1 0 0 0 0 1 0 0]
 [0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
  0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 

Unnamed: 0,alarak,almanya,alınmaması,android,arasında,ayrılan,açıklama,ağrı,belirten,bir,...,yarışta,yaşındaki,yeni,yıllarca,yıllık,çekerek,çoğu,önemli,ülke,üzerinde
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,1,1,...,0,0,1,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
5,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
6,1,1,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# 2. TFxIdf calculation steps
from sklearn.feature_extraction.text import TfidfVectorizer

Tfidf_Vector = TfidfVectorizer(min_df = 0., max_df = 1., use_idf = True)

# Fit on the normalized documents and convert the sparse result to a dense
# NumPy array so it can be printed and loaded into a DataFrame.
Tfidf_Matrix = Tfidf_Vector.fit_transform(normalized_documents).toarray()

# Round once for display; Tfidf_Matrix itself keeps full precision.
Tfidf_rounded = np.round(Tfidf_Matrix, 3)
print(Tfidf_rounded)

# Fetch all features (vocabulary terms) in Tfidf_Vector.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(Tfidf_Vector, "get_feature_names_out"):
    features = list(Tfidf_Vector.get_feature_names_out())
else:
    features = Tfidf_Vector.get_feature_names()

# Show the document-by-term matrix of TF-IDF weights.
Tfidf_df = pd.DataFrame(Tfidf_rounded, columns = features)
Tfidf_df

[[0.    0.    0.    0.    0.    0.    0.263 0.    0.    0.    0.    0.
  0.    0.    0.    0.263 0.    0.221 0.    0.    0.    0.    0.    0.527
  0.    0.    0.    0.263 0.263 0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.263 0.
  0.221 0.    0.    0.    0.    0.263 0.    0.    0.263 0.    0.    0.263
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.263 0.    0.
  0.    0.    0.   ]
 [0.    0.    0.    0.    0.27  0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.27  0.    0.    0.    0.    0.
  0.27  0.    0.226 0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.27  0.    0.    0.    0.    0.    0.    0.27  0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.27  0.    0.    0.27
  0.27  0.    0.    0.    0.54  0.    0.    0.  

Unnamed: 0,alarak,almanya,alınmaması,android,arasında,ayrılan,açıklama,ağrı,belirten,bir,...,yarışta,yaşındaki,yeni,yıllarca,yıllık,çekerek,çoğu,önemli,ülke,üzerinde
0,0.0,0.0,0.0,0.0,0.0,0.0,0.263,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.263,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0
2,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.251,0.181,...,0.0,0.0,0.251,0.0,0.0,0.0,0.0,0.251,0.0,0.0
3,0.0,0.0,0.0,0.257,0.0,0.0,0.0,0.0,0.0,0.222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216,...,0.0,0.0,0.0,0.0,0.0,0.298,0.0,0.0,0.0,0.0
5,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.27,0.0,0.0,...,0.0,0.0,0.0,0.27,0.0,0.0,0.27,0.0,0.0,0.0
6,0.306,0.306,0.0,0.0,0.0,0.306,0.0,0.0,0.0,0.0,...,0.306,0.306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419
