In [1]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

df_train = pd.read_csv('train_tweets.csv')
print(df_train.head(10))

print("\n\n\n")

df_test = pd.read_csv("test_tweets.csv")
print(df_test.head(10))


df_mixed_train = pd.read_csv("train_tweets_mixed.csv")
print(df_test.head(10))

                                               tweet sentiment
0  trt1 e bak beni bloklamış benim vergimle beni ...   Negatif
1                      sabah sabah siktirin gidin ya   Negatif
2                        Bu adam senimi sikdi bummm    Negatif
3   Katliamdan kaçan Suriye liler gitsin Suriye d...   Negatif
4                    gercekten pisliginde bogul amk    Negatif
5                        Şık bir hareket olur Brate    Pozitif
6  Bilginliği ile hava atmaya çalışan gerzek bir ...   Negatif
7  Yine kazık gibi oldu var mı kucağa gelmek iste...   Negatif
8  nerde salak nerde gerizeka varsa elimle koymuş...   Negatif
9                                nerdesin len tirrek   Negatif




                                               tweet sentiment
0  Anne bir sanatçıdır, en güzel eseri de yavrusu...   Pozitif
1                Ben söğüşledim, birazda sen söğüşle   Negatif
2  Şerefsizlik, sözde sanatçıların vazgeçemediği ...   Negatif
3              Kendisi de bilmiyordur çünkü beyinsi

In [2]:
x_train = df_train.tweet
y_train = df_train.sentiment

x_test= df_test.tweet
y_test = df_test.sentiment

x_mixed_train = df_mixed_train.tweet
y_mixed_train = df_mixed_train.sentiment

x_mixed_test= df_test.tweet
y_test = df_test.sentiment


In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(x_train, y_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(x_test)

print("------Naive Bayes, base training------")
my_labels = ["Pozitif", "Negatif"]
print('accuracy %s' % accuracy_score(y_pred, y_test))
NB_base_report = classification_report(y_test, y_pred,target_names=my_labels)
print(NB_base_report)

------Naive Bayes, base training------
accuracy 0.8472824274758253
              precision    recall  f1-score   support

     Pozitif       0.86      0.78      0.82      1309
     Negatif       0.84      0.90      0.87      1690

    accuracy                           0.85      2999
   macro avg       0.85      0.84      0.84      2999
weighted avg       0.85      0.85      0.85      2999



In [4]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(x_mixed_train, y_mixed_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(x_mixed_test)

print("------Naive Bayes, data augmented training------")

print('accuracy %s' % accuracy_score(y_pred, y_test))
NB_augmented_report = classification_report(y_test, y_pred,target_names=my_labels)
print(NB_augmented_report)


------Naive Bayes, data augmented training------
accuracy 0.8109369789929977
              precision    recall  f1-score   support

     Pozitif       0.96      0.59      0.73      1309
     Negatif       0.76      0.98      0.85      1690

    accuracy                           0.81      2999
   macro avg       0.86      0.79      0.79      2999
weighted avg       0.85      0.81      0.80      2999



In [5]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(x_train, y_train)


y_pred = sgd.predict(x_test)
print("------Linear Support Vector Machine, base training------")

print('accuracy %s' % accuracy_score(y_pred, y_test))
LSVM_base_report = classification_report(y_test, y_pred,target_names=my_labels)
print(LSVM_base_report)

------Linear Support Vector Machine, base training------
accuracy 0.7669223074358119
              precision    recall  f1-score   support

     Pozitif       0.93      0.50      0.65      1309
     Negatif       0.72      0.97      0.82      1690

    accuracy                           0.77      2999
   macro avg       0.82      0.74      0.74      2999
weighted avg       0.81      0.77      0.75      2999



In [6]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(x_mixed_train, y_mixed_train)


print("------Linear Support Vector Machine, data augmented training------")

y_pred = sgd.predict(x_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
LSVM_augmented_report = classification_report(y_test, y_pred,target_names=my_labels)
print(LSVM_augmented_report)

------Linear Support Vector Machine, data augmented training------
accuracy 0.7092364121373791
              precision    recall  f1-score   support

     Pozitif       0.94      0.36      0.52      1309
     Negatif       0.66      0.98      0.79      1690

    accuracy                           0.71      2999
   macro avg       0.80      0.67      0.65      2999
weighted avg       0.78      0.71      0.67      2999



In [7]:
reports =  [NB_base_report,NB_augmented_report,LSVM_base_report,LSVM_augmented_report]


for report in reports:
  print(report)

              precision    recall  f1-score   support

     Pozitif       0.86      0.78      0.82      1309
     Negatif       0.84      0.90      0.87      1690

    accuracy                           0.85      2999
   macro avg       0.85      0.84      0.84      2999
weighted avg       0.85      0.85      0.85      2999

              precision    recall  f1-score   support

     Pozitif       0.96      0.59      0.73      1309
     Negatif       0.76      0.98      0.85      1690

    accuracy                           0.81      2999
   macro avg       0.86      0.79      0.79      2999
weighted avg       0.85      0.81      0.80      2999

              precision    recall  f1-score   support

     Pozitif       0.93      0.50      0.65      1309
     Negatif       0.72      0.97      0.82      1690

    accuracy                           0.77      2999
   macro avg       0.82      0.74      0.74      2999
weighted avg       0.81      0.77      0.75      2999

              preci