## Turkish NLP with Zemberek

In [1]:
import jpype
import os

# Location of the Zemberek jar. Set the ZEMBEREK_JAR environment variable to
# override it without editing this cell; the fallback is the original path.
ZEMBEREK_JAR = os.environ.get(
    "ZEMBEREK_JAR",
    "/Users/uzaycetin/Documents/driver/zemberek-tum-2.0.jar",
)

# Start the JVM only once per kernel: JPype raises if startJVM() is called
# while a JVM is already running, which would break re-running this cell.
if not jpype.isJVMStarted():
    jpype.startJVM(jpype.getDefaultJVMPath(),
                   "-Djava.class.path=" + ZEMBEREK_JAR, "-ea")

# Load the class needed to analyse text as Turkish (as spoken in Turkey)
Tr = jpype.JClass("net.zemberek.tr.yapi.TurkiyeTurkcesi")
# Create the language object
tr = Tr()
# Load the Zemberek class
Zemberek = jpype.JClass("net.zemberek.erisim.Zemberek")
# Create the zemberek analyser object
zemberek = Zemberek(tr)

In [2]:
def kokbul(word='karasal'):
    """Return the root of ``word`` according to Zemberek.

    The word is analysed with the module-level ``zemberek`` object and the
    root of the first analysis result is returned. When no root can be
    extracted from the result, ``word`` is returned unchanged.
    """
    ornSonuc = zemberek.kelimeCozumle(word)
    try:
        return ornSonuc[0].kok().icerik()
    except Exception:
        # Best-effort stemming: fall back to the surface form. Catch
        # Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        return word

In [3]:
# Sample sentence (also reused by a later cell); show the root of each token
txt = 'gezinti gelsene'
list(map(kokbul, txt.split()))

['gezinti', 'gel']

In [4]:
# Importing the libraries
import numpy as np
import pandas as pd
import re
import pickle 
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# Start the stop-word list from NLTK's Turkish list
from nltk.corpus import stopwords
# NLTK stores its stop-word lists under lowercase language names; 'Turkish'
# only resolves on case-insensitive filesystems, so use the lowercase id.
stop = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/uzaycetin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the 17,000-tweet Twitter dataset

In [18]:
# Extend the stop-word list with the project's own Turkish stop-word file.
# The encoding is given explicitly so the read does not depend on the
# platform default (assumes the file is UTF-8 — TODO confirm).
with open('17bintweet/turkce-stop-words.txt', encoding='utf-8') as file:
    stw = file.read()
stw = stw.split()
stw = [s.lower() for s in stw]

# Append only words that are not already present, so re-running this cell
# does not keep growing `stop` with duplicates.
_known_stop = set(stop)
for s in stw:
    if s not in _known_stop:
        stop.append(s)
        _known_stop.add(s)

In [19]:
def preprocessing(text):
    """Normalise a raw text into a space-separated string of roots.

    Steps: lowercase, replace non-word characters with spaces, collapse
    whitespace, reduce every token to its root with ``kokbul`` and drop the
    tokens found in the module-level ``stop`` list.
    """
    # Map the Turkish dotted capital 'İ' to 'i' before lowercasing:
    # str.lower() turns it into 'i' + a combining dot (U+0307), which the \W
    # substitution below would replace with a space, splitting the word.
    text = text.replace('İ', 'i').lower()
    # Replace every non-word character with a space
    text = re.sub(r'\W', ' ', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # TODO: spelling correction with Zemberek is not implemented yet

    # Stem each token, then filter out the stop words
    kelimeler = [kokbul(w) for w in text.split()]
    return " ".join([word for word in kelimeler if word not in stop])

In [20]:
# Try the preprocessing function on the sample sentence `txt`
preprocessing(txt)

'gezinti gel'

In [21]:
# Load the test tweets; the sheet is read without a header row, so the
# columns are labelled 0 and 1
test_tweets_path = '17bintweet/test_tweets.xlsx'
test_data = pd.read_excel(test_tweets_path, header=None)
test_data.head(n=3)

Unnamed: 0,0,1
0,Turkcell'e kızgınım. Ve bu kızgınlık sanırım a...,olumsuz
1,turkcell kadar şerefsiz misiniz ya,olumsuz
2,Burdan Turkcell'e sesleniyorum o 3 tl haram olsun,olumsuz


In [22]:
# Load the training tweets; the sheet is read without a header row, so the
# columns are labelled 0 and 1
train_tweets_path = '17bintweet/train_tweets.xlsx'
train_data = pd.read_excel(train_tweets_path, header=None)
train_data.head(n=3)

Unnamed: 0,0,1
0,Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Turkc...,olumsuz
1,20 dk 1 GB internet 500 mb sadece kaşar turkce...,olumsuz
2,Ayrıca turkcell superonline reklamı kadar da k...,olumsuz


In [23]:
# Run the preprocessing function over column 0 of both splits.
# NOTE: column 0 is overwritten in place, so re-running this cell would
# preprocess already-processed text; reload the Excel files first.
for _split in (test_data, train_data):
    _split[0] = _split[0].apply(preprocessing)

train_data.head(3)

Unnamed: 0,0,1
0,ulan wifi bağ bağ turkcell internet paket bit ...,olumsuz
1,20 dk 1 gb internet 500 mb sadece kaşar turkce...,olumsuz
2,turkcell superonline reklam kötü reklam gör,olumsuz


## Doc2Vec starts

In [24]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

## Label documents

In [25]:
# Build the training corpus: one TaggedDocument per row, with the tokenised
# text of column 0 as words and the value of column 1 as its single tag
train_tagged_data = [
    TaggedDocument(word_tokenize(text), tags=[label])
    for text, label in zip(train_data[0], train_data[1])
]

In [26]:
# Build the test corpus: one TaggedDocument per row, with the tokenised
# text of column 0 as words and the value of column 1 as its single tag
test_tagged_data = [
    TaggedDocument(word_tokenize(text), tags=[label])
    for text, label in zip(test_data[0], test_data[1])
]

In [27]:
# Peek at the first three tagged training documents
train_tagged_data[:3]

[TaggedDocument(words=['ulan', 'wifi', 'bağ', 'bağ', 'turkcell', 'internet', 'paket', 'bit', 'mesaj', 'at', 'öde'], tags=['olumsuz']),
 TaggedDocument(words=['20', 'dk', '1', 'gb', 'internet', '500', 'mb', 'sadece', 'kaşar', 'turkcell', 'düş', 'oç', 'çocuk'], tags=['olumsuz']),
 TaggedDocument(words=['turkcell', 'superonline', 'reklam', 'kötü', 'reklam', 'gör'], tags=['olumsuz'])]

In [28]:
# Peek at the first three tagged test documents
test_tagged_data[:3]

[TaggedDocument(words=['turkcell', 'kızgın', 'kızgın', 'san', 'ayrıl', 'sonlanıcak', 'gel', 'fark', 'operatör', '30', 'fazla', 'fiyat', 'teklif', 'et'], tags=['olumsuz']),
 TaggedDocument(words=['turkcell', 'şeref', 'mis'], tags=['olumsuz']),
 TaggedDocument(words=['burdan', 'turkcell', 'sesle', '3', 'tl', 'haram', 'ol'], tags=['olumsuz'])]

In [29]:
# Corpus sizes as (test, train)
len(test_tagged_data), len(train_tagged_data)

(3457, 13832)

In [30]:
# Token list of the first training document
train_tagged_data[0].words

['ulan',
 'wifi',
 'bağ',
 'bağ',
 'turkcell',
 'internet',
 'paket',
 'bit',
 'mesaj',
 'at',
 'öde']

## Train doc2vec dm=1 ‘distributed memory’ (PV-DM) 

In [31]:
max_epochs = 20   # number of outer training rounds
vec_size = 200    # passed to Doc2Vec as the vector size

# `vector_size` replaces the deprecated `size` keyword, which was removed
# in gensim 4.
model = Doc2Vec(vector_size=vec_size,
                window=5,
                alpha=0.065,
                min_alpha=0.065,
                min_count=2,
                dm=1)

model.build_vocab(train_tagged_data)

for epoch in range(max_epochs):
    # Report progress five times over the whole run
    if epoch % (max_epochs/5) == 0: print('iteration {0}/{1}'.format(epoch, max_epochs))

    # `model.epochs` replaces the deprecated `model.iter` attribute.
    model.train(train_tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)

    # decrease the learning rate
    model.alpha -= 0.002

    # fix the learning rate for the next round, no decay inside train()
    model.min_alpha = model.alpha

# NOTE(review): gensim's documentation recommends a single train() call that
# manages the alpha decay itself; the manual loop is kept here so the
# training schedule stays the same.

#model.save("d2vz.model")
# Nothing is written to disk while the save call above is commented out, so
# the message must not claim that the model was saved.
print("Training finished (model not saved; uncomment model.save above to persist it)")



iteration 0/20
iteration 4/20
iteration 8/20
iteration 12/20
iteration 16/20
Model Saved


In [32]:
# Number of entries in the model's word vocabulary
len(model.wv.vocab)

3010

In [33]:
# Number of trained document tags. The training documents are tagged with
# the value of column 1 (the sentiment label) rather than a per-document id,
# so this counts distinct labels, not documents.
len(model.docvecs)

3

In [34]:
# Inspect the vocabulary mapping (word -> Vocab entry); the output is long
model.wv.vocab

{'ulan': <gensim.models.keyedvectors.Vocab at 0x633d8b978>,
 'wifi': <gensim.models.keyedvectors.Vocab at 0x633d8b940>,
 'bağ': <gensim.models.keyedvectors.Vocab at 0x633d8ba20>,
 'turkcell': <gensim.models.keyedvectors.Vocab at 0x633d8bba8>,
 'internet': <gensim.models.keyedvectors.Vocab at 0x633d8b9e8>,
 'paket': <gensim.models.keyedvectors.Vocab at 0x633d8bbe0>,
 'bit': <gensim.models.keyedvectors.Vocab at 0x633d8bc18>,
 'mesaj': <gensim.models.keyedvectors.Vocab at 0x633d8bc50>,
 'at': <gensim.models.keyedvectors.Vocab at 0x633d8bc88>,
 'öde': <gensim.models.keyedvectors.Vocab at 0x633d8bcc0>,
 '20': <gensim.models.keyedvectors.Vocab at 0x633d8bcf8>,
 'dk': <gensim.models.keyedvectors.Vocab at 0x633d8bd30>,
 '1': <gensim.models.keyedvectors.Vocab at 0x633d8bd68>,
 'gb': <gensim.models.keyedvectors.Vocab at 0x633d8bda0>,
 '500': <gensim.models.keyedvectors.Vocab at 0x633d8bdd8>,
 'mb': <gensim.models.keyedvectors.Vocab at 0x633d8be10>,
 'sadece': <gensim.models.keyedvectors.Vocab at

In [37]:
# Words closest to 'turkcell'. The query goes through the word vectors
# (`model.wv`); the model-level `most_similar` shortcut is deprecated.
model.wv.most_similar('turkcell')

  """Entry point for launching an IPython kernel.


[('ol', 0.2846031188964844),
 ('kurt', 0.2792709469795227),
 ('ver', 0.2715378999710083),
 ('yap', 0.2708706259727478),
 ('http', 0.2701042890548706),
 ('ara', 0.26698189973831177),
 ('https', 0.26491039991378784),
 ('uzay', 0.2596946358680725),
 ('alkisliyorum', 0.24467743933200836),
 ('kısa', 0.24346014857292175)]

In [38]:
# Infer the vector of a document that is not in the training data.
# The tokens get their own name so that the `test_data` DataFrame loaded
# earlier is not overwritten.
sample_tokens = word_tokenize("2".lower())
v1 = model.infer_vector(sample_tokens)
print(sample_tokens)
print("V1_infer", v1)

['2']
V1_infer [-0.0658396   0.00154664  0.02480551 -0.06743868  0.04208385 -0.04714711
 -0.17452891 -0.01235577  0.00375876 -0.03324285  0.03270067  0.05474114
 -0.09169555 -0.05932066  0.0708919  -0.05236538  0.05560438 -0.01366046
  0.03248015 -0.10350001 -0.01048315 -0.1171841  -0.02561115  0.05914393
  0.0650729  -0.07150536  0.08035857  0.071274   -0.01021082  0.02001089
  0.06207767 -0.01640065 -0.27542648 -0.06176654  0.03634369 -0.07913997
 -0.03512963 -0.10112834  0.08481842 -0.0294768  -0.13642262  0.05355902
 -0.04676862  0.09340989  0.03122408  0.04879354 -0.00085117  0.20424935
 -0.11274365 -0.03550681  0.14094974  0.07018941  0.05713382 -0.01350317
  0.03761854  0.04132842 -0.0503211   0.09951602 -0.01948802  0.00539477
  0.02446734 -0.12434271  0.1400659  -0.08675782 -0.00951768  0.140519
  0.11797898 -0.04553182 -0.03896495  0.07474744 -0.0650594   0.03321582
 -0.0294863  -0.07953437  0.12629327 -0.00639837  0.10256018  0.00398439
  0.07784899 -0.00953191 -0.10588035 -

## Load pre-trained model

In [39]:
#model= Doc2Vec.load("d2vz.model")

# Run Logistic Regression on vector representations of documents

In [40]:
def vec_for_learning(model, tagged_docs):
    """Turn tagged documents into classifier targets and feature vectors.

    For every document, the first tag is used as the target and
    ``model.infer_vector(doc.words, steps=20)`` as the feature vector.

    Returns:
        A ``(targets, regressors)`` pair of tuples in document order.
        Both tuples are empty when ``tagged_docs`` is empty (unpacking
        ``zip(*[])`` would raise ValueError in that case).
    """
    targets = []
    regressors = []
    for doc in tagged_docs:
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words, steps=20))
    return tuple(targets), tuple(regressors)

In [52]:
# Infer a feature vector for every training and test document.
# Note the return order: targets first, then vectors.
y_train, X_train = vec_for_learning(model, train_tagged_data)
y_test, X_test = vec_for_learning(model, test_tagged_data)

In [53]:
# Integer code for each sentiment label, plus the reverse lookup
sentiment = {'olumsuz': 0, 'notr': 1, 'olumlu': 2}
inv_sentiment = dict(zip(sentiment.values(), sentiment.keys()))
inv_sentiment

{0: 'olumsuz', 1: 'notr', 2: 'olumlu'}

In [54]:
# Encode the string labels as integer class ids via the `sentiment` mapping.
# NOTE: y_train / y_test are overwritten, so this cell only runs once per
# fresh set of string labels.
y_train = np.array(list(map(sentiment.__getitem__, y_train)))
y_test = np.array(list(map(sentiment.__getitem__, y_test)))
y_train

array([0, 0, 0, ..., 0, 1, 2])

In [55]:
# Encoded test labels
y_test

array([0, 0, 0, ..., 1, 1, 2])

In [56]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the inferred training vectors, then predict the
# labels of the test vectors
logreg = LogisticRegression(C=1e5, n_jobs=1).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [57]:
from sklearn.metrics import accuracy_score, f1_score

# Compute both test metrics first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % test_accuracy)
print('Testing F1 score: {}'.format(test_f1))

Testing accuracy 0.43390222736476713
Testing F1 score: 0.43095281928906826


In [58]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true test labels (first argument) against predictions
cm = confusion_matrix(y_test, y_pred)
cm

array([[731, 436, 210],
       [486, 467, 211],
       [300, 314, 302]])

In [59]:
# Number of predictions produced for the test set
len(y_pred)

3457

## To-Do

Use Zemberek to correct misspelled words before stemming.