In [None]:
import numpy as np
import codecs
import os
import sys
import re
from sklearn.model_selection import train_test_split, KFold
from TurkishStemmer import TurkishStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from unicode_tr import unicode_tr
from collections import Counter
from random import shuffle
from math import log, inf
from operator import itemgetter

In [8]:
stemmer=TurkishStemmer()

## Data Retrieval and Preprocessing

In [9]:
def findfiles(path,flist):
    dirs = os.listdir(path)
    for df in dirs:
        if os.path.isdir(path+"/"+df):
            findfiles(path+"/"+df,flist)
        else:
            flist.append(path+"/"+df)

In [10]:
path = './42bin_haber/news'
categories = os.listdir(path)[1:]
categories = ['ekonomi','kultur-sanat','magazin','saglik','siyaset','spor','teknoloji']
news_files = {}
for cat in categories:
    flist = []
    findfiles(path+"/"+cat,flist)
    news_files[cat] = flist[1:]

In [11]:
len(categories)

7

In [12]:
def preprocess(words,stop_words = stopwords.words('turkish'),url_regex=None):
    #Remove URLS
    if url_regex:
        words = [word for word in words if not re.match(url_regex,word)]

    #Remove trash characters
    words = [re.sub("\xad|\x95|\x80|\x82|\x93|\x94|\x91|\x92|\x96|^\'+|^\*+|^-+|\'+$", "", word) for word in words]

    #Remove nonalphanumeric
    words = [word for word in words if not re.match("\W", word)]

    #Lower all words
    words = [unicode_tr(word.strip()).lower() for word in words if word.strip()!=""]
    
    #Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    #Stemming
    words = [stemmer.stem(word) for word in words]
    
    return words

In [13]:
category_corpus = {}
test_files = {}
all_words = []
X = []
y = []
for cat in categories:
    for f in news_files[cat]:
        with open(f) as file:
            lines = file.readlines()
            lines = ''.join(lines)

            words = word_tokenize(lines)
            words = preprocess(words)

            X.append(words)
            y.append(cat)

## k-fold partitioning with k=5

In [14]:
def shuffle_together(x,y):
    together = list(zip(x,y))
    shuffle(together)
    x,y = zip(*together)
    return list(x),list(y)

In [15]:
X_trains =[]
X_tests =[]
y_trains = []
y_tests = []
X_shuffled, y_shuffled = shuffle_together(X,y)

for i in range(5):
    print(i)
    test_start = int(i*len(X_shuffled)/5)
    test_end = int((i+1)*len(X_shuffled)/5)
    
    X_train = X_shuffled[:test_start]+X_shuffled[test_end:]
    X_test = X_shuffled[test_start:test_end]
    y_train = y_shuffled[:test_start]+y_shuffled[test_end:]
    y_test = y_shuffled[test_start:test_end]
    
    X_trains.append(X_train)
    X_tests.append(X_test)
    y_trains.append(y_train)
    y_tests.append(y_test)

0
1
2
3
4


## Naïve Bayes

In [16]:
def get_unigrams(words):
    return Counter(words)

In [17]:
def get_conditional_prob(test_words, class_words, class_prob):
    prob = 0
    total = class_words['Total']
    unique = len(class_words.keys())-1
    for word in test_words:
        # calculates p(word|class) with Laplace Smoothing
        prob += log((class_words[word]+1)/(total+unique))
      
    # This line considers class probability p(class)
    prob += log(class_prob)
    return prob

In [18]:
naive_results = []

# K-fold training
for i,X_train in enumerate(X_trains):
    print('in'+str(i))
    y_train = y_trains[i]
    
    # class probabilities
    P={cat: y_train.count(cat) for cat in categories}
    total = len(y_train)
    P={cat: P[cat]/total for cat in categories}
    
    category_corpus = {}
    for cat in categories:
        category_corpus[cat] = []
        
    for j,x in enumerate(X_train):
        category_corpus[y_train[j]] += x

    category_unigrams = {}
    for cat in categories:
        category_unigrams[cat] = get_unigrams(category_corpus[cat])
        category_unigrams[cat]['Total'] = sum(category_unigrams[cat].values())
    
    X_test = X_tests[i]
    y_test = y_tests[i]
    predictions= []
    print('out'+str(i))
    for x in X_test:        
        max_prob = -inf
        pred_cat = ""
        for cat in categories:
            pred_prob = get_conditional_prob(x, category_unigrams[cat], P[cat])
            if pred_prob > max_prob:
                max_prob = pred_prob
                pred_cat = cat

        predictions.append(pred_cat)
    
    # Append accuracy of i'th fold
    naive_results.append(sum(1 for x,y in zip(y_test, predictions) if x == y) / len(y_test))

in0
out0
in1
out1
in2
out2
in3
out3
in4
out4


In [19]:
print(naive_results)
np.mean(naive_results)

[0.90591841546805, 0.9217165762791795, 0.9198302287196416, 0.9207734024994105, 0.9198302287196416]


0.9176137703371847

## MLP - Word2Vec

In [20]:
def get_texts(X):
    frequency = defaultdict(int)
    for x in X:
        for token in x:
            frequency[token] +=1

    texts = [ [token for token in doc if frequency[token] > 1] 
            for doc in X]
    
    return texts

In [21]:
def get_averaged_doc(texts):
    averaged_doc = []
    shape = word_vectors.get_vector(texts[0][0]).shape
    new_words = []
    for doc in texts:
        temp = np.zeros(shape)
        counter = 0
        for token in doc:
            try:
                temp += word_vectors.get_vector(token)
            except:
                counter +=1
        
        new_words.append(counter)
        temp = temp/(len(doc)-counter)
        averaged_doc.append(temp)
    
    return np.array(averaged_doc),new_words

In [22]:
def get_accuracy(averaged,model,y):
    predictions = model.predict(averaged)
    _predictions = (predictions.argmax(1)[:,None] == np.arange(predictions.shape[1])).astype(int)
    accuracy = 1 - np.sum(np.abs(_predictions - y))/(2*len(y))
    return accuracy

In [25]:
from sklearn.preprocessing import LabelBinarizer
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout

mlp_accuracy = []
no_of_outputs = 7

for i,X_train in enumerate(X_trains):
    print(i)
    y_train = y_trains[i]
    y_test = y_tests[i]
    X_test = X_tests[i]
    
    encoder = LabelBinarizer()
    encoder.fit(y_train)

    y_train_t = encoder.transform(y_train)
    y_test_t = encoder.transform(y_test)

    texts_train = get_texts(X_train)
    texts_test  = get_texts(X_test)

    print("w2v")
    model = Word2Vec(texts_train, size=100, window=4, min_count=1)
    model.train(texts_train, total_examples=len(texts_train), epochs=50)
    model.wv.save("trained_word_vectors_train.pkl")
    word_vectors = KeyedVectors.load("trained_word_vectors_train.pkl")

    averaged_train,_ = get_averaged_doc(texts_train)
    averaged_test, new_words = get_averaged_doc(texts_test)

    print("mlp")
    model = Sequential()
    model.add(Dense(12, input_dim=100, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(no_of_outputs, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    test_accuracies = []
    train_accuracies = []

    for i in range(20):    
        print(i+1,end=" ")
        test_accuracies.append(get_accuracy(averaged_test,model,y_test_t))
        train_accuracies.append(get_accuracy(averaged_train,model,y_train_t))

        model.fit(averaged_train, y_train_t, epochs=1, batch_size=10)
        
    mlp_accuracy.append(test_accuracies[-1])

0
w2v
mlp
1 Epoch 1/1
2 Epoch 1/1
3 Epoch 1/1
4 Epoch 1/1
5 Epoch 1/1
6 Epoch 1/1
7 Epoch 1/1
8 Epoch 1/1
9 Epoch 1/1
10 Epoch 1/1
11 Epoch 1/1
12 Epoch 1/1
13 Epoch 1/1
14 Epoch 1/1
15 Epoch 1/1
16 Epoch 1/1
17 Epoch 1/1
18 Epoch 1/1
19 Epoch 1/1
20 Epoch 1/1
1
w2v
mlp
1 Epoch 1/1
2 Epoch 1/1
3 Epoch 1/1
4 Epoch 1/1
5 Epoch 1/1
6 Epoch 1/1
7 Epoch 1/1
8 Epoch 1/1
9 Epoch 1/1
10 Epoch 1/1
11 Epoch 1/1
12 Epoch 1/1
13 Epoch 1/1
14 Epoch 1/1
15 Epoch 1/1
16 Epoch 1/1
17 Epoch 1/1
18 Epoch 1/1
19 Epoch 1/1
20 Epoch 1/1
2
w2v
mlp
1 Epoch 1/1
2 Epoch 1/1
3 Epoch 1/1
4 Epoch 1/1
5 Epoch 1/1
6 Epoch 1/1
7 Epoch 1/1
8 Epoch 1/1
9 Epoch 1/1
10 Epoch 1/1
11 Epoch 1/1
12 Epoch 1/1
13 Epoch 1/1
14 Epoch 1/1
15 Epoch 1/1
16 Epoch 1/1
17 Epoch 1/1
18 Epoch 1/1
19 Epoch 1/1
20 Epoch 1/1
3
w2v
mlp
1 Epoch 1/1
2 Epoch 1/1
3 Epoch 1/1
4 Epoch 1/1
5 Epoch 1/1
6 Epoch 1/1
7 Epoch 1/1
8 Epoch 1/1
9 Epoch 1/1
10 Epoch 1/1
11 Epoch 1/1
12 Epoch 1/1
13 Epoch 1/1
14 Epoch 1/1
15 Epoch 1/1
16 Epoch 1/1
17 Epoch

4
w2v
mlp
1 Epoch 1/1
2 Epoch 1/1
3 Epoch 1/1
4 Epoch 1/1
5 Epoch 1/1
6 Epoch 1/1
7 Epoch 1/1
8 Epoch 1/1
9 Epoch 1/1
10 Epoch 1/1
11 Epoch 1/1
12 Epoch 1/1
13 Epoch 1/1
14 Epoch 1/1
15 Epoch 1/1
16 Epoch 1/1
17 Epoch 1/1
18 Epoch 1/1
19 Epoch 1/1
20 Epoch 1/1


In [26]:
import matplotlib.pyplot as plt
plt.plot(test_accuracies[1:], 'r')
plt.plot(train_accuracies[1:], 'b')
plt.legend(["test","train"])
plt.show()

<Figure size 640x480 with 1 Axes>

In [27]:
np.mean(mlp_accuracy)

0.9327517095024758

## SVM

In [28]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3,
                                            max_iter=5, tol=None)),])

svm_accuracies = []

for i in range(5):
    x_train = [' '.join(x) for x in X_trains[i]]
    y_train = y_trains[i]
    x_test  = [' '.join(x) for x in X_tests[i]]
    y_test  = y_tests[i]
    text_clf.fit(x_train, y_train)
    predicted = text_clf.predict(x_test)
    svm_accuracies.append(np.mean(predicted == y_test)) 

In [29]:
svm_accuracies

[0.9236029238387173,
 0.932563074746522,
 0.9297335534072153,
 0.930440933742042,
 0.9316199009667532]

In [30]:
np.mean(svm_accuracies)

0.92959207734025