In [32]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

df_train = pd.read_csv('train.csv')
print(df_train.head(10))

print("\n\n\n")

df_test = pd.read_csv("test.csv")
print(df_test.head(10))


df_mixed_train = pd.read_csv("train_mixed.csv")
print(df_test.head(10))

                                            headline   category
0                      Myanmar'da çatışmalar sürüyor      dunya
1  Ermenistan'da cumhurbaşkanı adayına silahlı sa...      dunya
2                             Süper Lig'den transfer       spor
3           Erdoğan: İç hesaplaşma ihtimali yüksek    siyaset
4                      AB'den güvenli internet adımı  teknoloji
5                AB "1 trilyon euro" için toplanıyor    ekonomi
6                    İpek Şenoğlu Cup 11-24 Şubatta       spor
7        Kordsa 120 iş liderini İstanbulda ağırladı    ekonomi
8       Bursada 900 milyon ton mermer rezervi çıktı    ekonomi
9               'Başbakan tek otorite olmak istiyor'    siyaset




                                     headline category
0   İsrail'den Gazze Şeridi'ne hava saldırısı    dunya
1     İdam edecek ama organlara el koymayacak    dunya
2              4 milyon dolar nafaka ödeyecek    dunya
3   Mahmud Abbas'ın 'Bağımsız Filistin' umudu    dunya
4              Br

In [33]:
x_train = df_train.headline
y_train = df_train.category

x_test= df_test.headline
y_test = df_test.category

x_mixed_train = df_mixed_train.headline
y_mixed_train = df_mixed_train.category

x_mixed_test= df_test.headline
y_test = df_test.category


In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(x_train, y_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(x_test)

print("------Naive Bayes, base training------")
my_categories = ["dunya", "ekonomi","spor","siyaset","teknoloji","kultur","saglik"]
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

------Naive Bayes, base training------
accuracy 0.6610820780817707
              precision    recall  f1-score   support

       dunya       0.67      0.76      0.71      1489
     ekonomi       0.84      0.37      0.52       903
        spor       1.00      0.03      0.06       298
     siyaset       0.97      0.18      0.31       373
   teknoloji       0.93      0.19      0.32       498
      kultur       0.63      0.97      0.76      2729
      saglik       1.00      0.01      0.03       216

    accuracy                           0.66      6506
   macro avg       0.86      0.36      0.39      6506
weighted avg       0.74      0.66      0.60      6506



In [35]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(x_mixed_train, y_mixed_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(x_mixed_test)

print("------Naive Bayes, data augmented training------")

my_categories = ["dunya", "ekonomi","spor","siyaset","teknoloji","kultur","saglik"]
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))


------Naive Bayes, data augmented training------
accuracy 0.692130341223486
              precision    recall  f1-score   support

       dunya       0.74      0.71      0.72      1489
     ekonomi       0.59      0.78      0.67       903
        spor       1.00      0.02      0.04       298
     siyaset       0.98      0.13      0.23       373
   teknoloji       0.93      0.17      0.29       498
      kultur       0.70      0.96      0.81      2729
      saglik       1.00      0.01      0.02       216

    accuracy                           0.69      6506
   macro avg       0.85      0.40      0.40      6506
weighted avg       0.75      0.69      0.63      6506



In [36]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(x_train, y_train)


y_pred = sgd.predict(x_test)
print("------Linear Support Vector Machine, base training------")

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

------Linear Support Vector Machine, base training------
accuracy 0.6967414694128496
              precision    recall  f1-score   support

       dunya       0.76      0.66      0.71      1489
     ekonomi       0.74      0.40      0.52       903
        spor       0.84      0.22      0.35       298
     siyaset       0.79      0.47      0.59       373
   teknoloji       0.79      0.48      0.60       498
      kultur       0.65      0.97      0.78      2729
      saglik       0.76      0.25      0.37       216

    accuracy                           0.70      6506
   macro avg       0.76      0.49      0.56      6506
weighted avg       0.72      0.70      0.67      6506



In [37]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(x_mixed_train, y_mixed_train)


print("------Linear Support Vector Machine, data augmented training------")

y_pred = sgd.predict(x_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_categories))

------Linear Support Vector Machine, data augmented training------
accuracy 0.7230249000922225
              precision    recall  f1-score   support

       dunya       0.79      0.66      0.72      1489
     ekonomi       0.57      0.71      0.63       903
        spor       0.84      0.24      0.38       298
     siyaset       0.82      0.44      0.57       373
   teknoloji       0.82      0.48      0.61       498
      kultur       0.73      0.94      0.82      2729
      saglik       0.77      0.23      0.36       216

    accuracy                           0.72      6506
   macro avg       0.76      0.53      0.58      6506
weighted avg       0.74      0.72      0.70      6506

