## Turkish NLP Zemberek

In [1]:
import jpype
# JVM başlat
jpype.startJVM(jpype.getDefaultJVMPath(),
         "-Djava.class.path=/Users/uzaycetin/Documents/driver/zemberek-tum-2.0.jar", "-ea")

# Türkiye Türkçesine göre çözümlemek için gerekli sınıfı hazırla
Tr = jpype.JClass("net.zemberek.tr.yapi.TurkiyeTurkcesi")
# tr nesnesini oluştur
tr = Tr()
# Zemberek sınıfını yükle
Zemberek = jpype.JClass("net.zemberek.erisim.Zemberek")
# zemberek nesnesini oluştur
zemberek = Zemberek(tr)

In [2]:
def kokbul(word = 'karasal'):
    kok = word
    ornSonuc = zemberek.kelimeCozumle(kok)
    try:
        kok = ornSonuc[0].kok().icerik()
    except:
        pass
    return kok

In [3]:
txt = 'gezinti gelsene'
[kokbul(w) for w in txt.split()]

['gezinti', 'gel']

In [4]:
# Importing the libraries
import numpy as np
import pandas as pd
import re
import pickle 
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

# stopwordsleri sil
from nltk.corpus import stopwords
stop = stopwords.words('Turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/uzaycetin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Get twitter 17000 Data

In [5]:
# Turkish Stopwords
with open('17bintweet/turkce-stop-words.txt') as file:  
    stw = file.read() 
stw = stw.split()
stw = [s.lower() for s in stw] 
stop += stw

In [6]:
def preprocessing(text):
    text = text.lower()
    # get rid of non-alphanumerical characters
    text = re.sub(r'\W', ' ', text) 
    # get rid of spaces
    text = re.sub(r'\s+', ' ', text) 
    # Correct mistakes 
    # zemberek!!
    
    # and do the stemming
    kelimeler = [kokbul(w) for w in text.split()]
    return " ".join([word for word in kelimeler if word not in stop])

In [7]:
preprocessing(txt)

'gezinti gel'

In [8]:
test_data = pd.read_excel('17bintweet/test_tweets.xlsx', header=None)
test_data.head(3)

Unnamed: 0,0,1
0,Turkcell'e kızgınım. Ve bu kızgınlık sanırım a...,olumsuz
1,turkcell kadar şerefsiz misiniz ya,olumsuz
2,Burdan Turkcell'e sesleniyorum o 3 tl haram olsun,olumsuz


In [9]:
train_data = pd.read_excel('17bintweet/train_tweets.xlsx', header=None)
train_data.head(3)

Unnamed: 0,0,1
0,Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Turkc...,olumsuz
1,20 dk 1 GB internet 500 mb sadece kaşar turkce...,olumsuz
2,Ayrıca turkcell superonline reklamı kadar da k...,olumsuz


In [10]:
test_data[0] = test_data[0].apply(preprocessing)
train_data[0] = train_data[0].apply(preprocessing)

train_data.head(3)

Unnamed: 0,0,1
0,ulan wifi bağ bağ turkcell internet paket bit ...,olumsuz
1,20 dk 1 gb internet 500 mb sadece kaşar turkce...,olumsuz
2,turkcell superonline reklam kötü reklam gör,olumsuz


## Doc2Vec starts

In [11]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

## Label documents

In [12]:
# Creating the corpus for train_data
train_tagged_data = []
for i in range(len(train_data)):
    train_tagged_data.append(TaggedDocument(word_tokenize(train_data.iloc[i,0]), tags=[train_data.iloc[i,1]]))

In [13]:
# Creating the corpus for test_data
test_tagged_data = []
for i in range(len(test_data)):
    test_tagged_data.append(TaggedDocument(word_tokenize(test_data.iloc[i,0]), tags=[test_data.iloc[i,1]]))

In [14]:
train_tagged_data[:3]

[TaggedDocument(words=['ulan', 'wifi', 'bağ', 'bağ', 'turkcell', 'internet', 'paket', 'bit', 'mesaj', 'at', 'öde'], tags=['olumsuz']),
 TaggedDocument(words=['20', 'dk', '1', 'gb', 'internet', '500', 'mb', 'sadece', 'kaşar', 'turkcell', 'düş', 'oç', 'çocuk'], tags=['olumsuz']),
 TaggedDocument(words=['turkcell', 'superonline', 'reklam', 'kötü', 'reklam', 'gör'], tags=['olumsuz'])]

In [15]:
test_tagged_data[:3]

[TaggedDocument(words=['turkcell', 'kızgın', 'kızgın', 'san', 'ayrıl', 'sonlanıcak', 'gel', 'fark', 'operatör', '30', 'fazla', 'fiyat', 'teklif', 'et'], tags=['olumsuz']),
 TaggedDocument(words=['turkcell', 'şeref', 'mis'], tags=['olumsuz']),
 TaggedDocument(words=['burdan', 'turkcell', 'sesle', '3', 'tl', 'haram', 'ol'], tags=['olumsuz'])]

In [16]:
len(test_tagged_data), len(train_tagged_data)

(3457, 13832)

In [17]:
train_tagged_data[0].words

['ulan',
 'wifi',
 'bağ',
 'bağ',
 'turkcell',
 'internet',
 'paket',
 'bit',
 'mesaj',
 'at',
 'öde']

## Train doc2vec dm=1 ‘distributed memory’ (PV-DM) 

In [18]:
max_epochs = 40
model = Doc2Vec(dm_mean=1, 
                vector_size=300, 
                negative=5, 
                workers=3,
                window=5,
                alpha=0.065, 
                min_alpha=0.065,
                min_count=2,
                dm =1)
  
model.build_vocab(train_tagged_data)

for epoch in range(max_epochs):
    if epoch% (max_epochs/5) == 0: print('iteration {0}/{1}'.format(epoch, max_epochs))
        
    model.train(train_tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    
    # decrease the learning rate
    model.alpha -= 0.002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

#model.save("d2vz.model")
print("Model Saved")

iteration 0/40




iteration 8/40
iteration 16/40
iteration 24/40
iteration 32/40
Model Saved


In [19]:
len(model.wv.vocab)

7131

In [20]:
len(model.docvecs) # The number of trained document tags is available from:

3

In [21]:
model.wv.vocab

{'ulan': <gensim.models.keyedvectors.Vocab at 0x63b5843c8>,
 'wifi': <gensim.models.keyedvectors.Vocab at 0x63b5845f8>,
 'bağ': <gensim.models.keyedvectors.Vocab at 0x63b584630>,
 'turkcell': <gensim.models.keyedvectors.Vocab at 0x63b584400>,
 'internet': <gensim.models.keyedvectors.Vocab at 0x63b5845c0>,
 'paket': <gensim.models.keyedvectors.Vocab at 0x63b584668>,
 'bit': <gensim.models.keyedvectors.Vocab at 0x63b584588>,
 'mesaj': <gensim.models.keyedvectors.Vocab at 0x63b5846a0>,
 'at': <gensim.models.keyedvectors.Vocab at 0x63b5846d8>,
 'öde': <gensim.models.keyedvectors.Vocab at 0x63b584710>,
 '20': <gensim.models.keyedvectors.Vocab at 0x63b584748>,
 'dk': <gensim.models.keyedvectors.Vocab at 0x63b584780>,
 '1': <gensim.models.keyedvectors.Vocab at 0x63b5847b8>,
 'gb': <gensim.models.keyedvectors.Vocab at 0x63b5847f0>,
 '500': <gensim.models.keyedvectors.Vocab at 0x63b584828>,
 'mb': <gensim.models.keyedvectors.Vocab at 0x63b584860>,
 'sadece': <gensim.models.keyedvectors.Vocab at

In [34]:
model.most_similar('turkcell')

  """Entry point for launching an IPython kernel.


[('anlatiyor', 0.23928648233413696),
 ('ol', 0.22458797693252563),
 ('sak', 0.2237774282693863),
 ('by', 0.22273793816566467),
 ('odunc', 0.22208234667778015),
 ('turkcellakillikadinlarkulubu', 0.22112055122852325),
 ('öbürülaöbürü', 0.219574436545372),
 ('escobar', 0.21668019890785217),
 ('eyjwywdlijoiag9yb3njb3blxc9kzxrhawwilcjxcyi6imlkpteyjnr5cgu9myj9',
  0.21526719629764557),
 ('alirken', 0.21001583337783813)]

In [35]:
#to find the vector of a document which is not in training data
test_data = word_tokenize("2".lower())
v1 = model.infer_vector(test_data)
print(test_data)
print("V1_infer", v1)

['2']
V1_infer [-0.03737827 -0.08217598  0.05897779  0.04295481  0.04406261 -0.01183352
  0.0726761  -0.01144092  0.04881437 -0.00684601  0.04838407 -0.06048566
 -0.04084711  0.06238113  0.03074199  0.05869396 -0.02505491 -0.01250112
 -0.01824988  0.04761232 -0.09492981 -0.02362792 -0.09096128  0.08197915
  0.01207124  0.00329877  0.04168056  0.06442592 -0.03951911  0.04573657
 -0.07926989  0.02268605  0.06632239 -0.04250176 -0.03479092  0.12830956
  0.0067775  -0.0044645   0.13353355  0.0271051  -0.00940917  0.04173548
 -0.0339273  -0.02016051 -0.02423552 -0.08493184 -0.02600931  0.08246565
 -0.10996543  0.04070801 -0.02299405  0.05146294  0.11879726  0.03539389
 -0.09574382  0.00767232 -0.0229358   0.05753192 -0.02410503  0.0766781
 -0.01903024  0.04721741 -0.00343341 -0.03622256 -0.03278359  0.06925772
  0.0458094  -0.06590406  0.05477072  0.04022017  0.00090872  0.06538519
  0.05885214 -0.01943279 -0.04467616 -0.02703006  0.01983195  0.06660437
  0.0164433   0.04222489 -0.00774265 

## Load pre-trained model

In [36]:
#model= Doc2Vec.load("d2vz.model")

# Run Logistic Regression on vector represenations of words

In [37]:
def vec_for_learning(model, tagged_docs):
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in tagged_docs])
    return targets, regressors

In [38]:
y_train, X_train = vec_for_learning(model, train_tagged_data)
y_test, X_test = vec_for_learning(model, test_tagged_data)

In [39]:
sentiment = {'olumsuz':0, 'notr':1, 'olumlu':2}
inv_sentiment = {v:k for k, v in sentiment.items()}
inv_sentiment

{0: 'olumsuz', 1: 'notr', 2: 'olumlu'}

In [40]:
y_train = np.array([sentiment[s] for s in y_train])
y_test = np.array([sentiment[s] for s in y_test])
y_train

array([0, 0, 0, ..., 0, 1, 2])

In [41]:
y_test

array([0, 0, 0, ..., 1, 1, 2])

In [42]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.40873589817761063
Testing F1 score: 0.40445415788027606


In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cm

array([[411, 622, 344],
       [229, 602, 333],
       [163, 353, 400]])

In [45]:
len(y_pred)

3457

## To-Do

Zemberek correct words