In [None]:
import pandas as pd 
import nltk
import re 
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
# Word2vec
import gensim

Ancak metin gösterimi adımına geçmeden önce, ham veriyi ön işlemeden geçirerek temizlenmiş bir veri kümesi elde etmeliyiz.

## Veri kümesini yükleme ve temizleme

Veri kümesi olarak, [Twitter US Airline Sentiment](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) veri kümesini kullanacağız.

In [None]:
# Base URL of the hosted course datasets; it ends with a slash because the
# file name is appended to it directly.
DATA_DIR = "https://media.githubusercontent.com/media/yapay-ogrenme/casgem-eu-project-training-on-data-mining/main/PART2/Day14-NLP/notebooks/datasets/"

# Full URL of the airline-tweets CSV.
DATASET_PATH = f"{DATA_DIR}twitter_airline_sentiment_tweets.csv"

In [None]:
# Read the tweets CSV located at DATASET_PATH into a DataFrame.
df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summarise columns, dtypes and non-null counts.
df.info()

In [None]:
# Class balance: number of tweets per sentiment value.
sns.countplot(data=df, x="airline_sentiment")

## Temel Ön İşlemler


In [None]:
# Keep only the sentiment label and the tweet text. The explicit .copy() makes
# `data` an independent frame, so the columns assigned to it in later cells are
# written to `data` itself and pandas does not raise SettingWithCopyWarning.
data = df[["airline_sentiment", "text"]].copy()
data

In [None]:
def sentiment(x):
    """Map a sentiment string to an integer label.

    Returns 1 for 'positive', -1 for 'negative' and 0 for any other value.
    The comparison is exact (case-sensitive).
    """
    if x == 'positive':
        return 1
    return -1 if x == 'negative' else 0

In [None]:
# Encode the textual sentiment as an integer label using sentiment().
data["airline_sentiment_label"] = data["airline_sentiment"].map(sentiment)
data

In [None]:
import nltk
from nltk.corpus import stopwords
# Download the tokenizer data and the stopword corpus; these calls must run
# before the corpus is read on the last line of this block.
nltk.download('punkt')
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to
# the value returned by .words('english'); clean_the_tweet reads this name.
stopwords = stopwords.words('english')

def clean_the_tweet(text):
  """Normalise a raw tweet into a space-separated string of lowercase tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  tokenize with NLTK, lowercase, drop English stopwords, discard the first
  two remaining tokens and join the rest with single spaces.

  Args:
    text: the raw tweet text (str).

  Returns:
    The cleaned tweet as a single string (possibly empty).
  """
  letters_only = re.sub("[^a-zA-Z]", " ", text)
  # Lowercase BEFORE the stopword test: the NLTK English stopword list is all
  # lowercase, so capitalised stopwords ("The", "I", "What") previously
  # slipped through the filter.
  tokens = [token.lower() for token in nltk.word_tokenize(letters_only)]
  tokens = [token for token in tokens if token not in stopwords]
  # NOTE(review): the first two tokens are dropped unconditionally. Presumably
  # this is meant to strip the leading "@airline" mention, but the regex above
  # already removes "@", so a real content word is discarded too — confirm
  # the intended slice.
  return ' '.join(tokens[2:])

In [None]:
# Show one tweet before and after cleaning (row label 11).
sample_tweet = data.text[11]
print(f"Original Text : {sample_tweet}")
print()
print(f"Preprocessed Text : {clean_the_tweet(sample_tweet)}")

In [None]:
# Run every raw tweet through clean_the_tweet and store the result next to
# the original text.
data["clean_text"] = data["text"].apply(clean_the_tweet)
data.head()

In [None]:
# Raw text of the row at position 10.
data["text"].iloc[10]

In [None]:
# Cleaned text of the same row, for comparison with the raw tweet.
data["clean_text"].iloc[10]

In [None]:
# Display the frame, which now includes the clean_text column.
data

## Word2Vec

In [None]:
%%time
# Whitespace-tokenize every cleaned tweet; Word2Vec expects a list of token lists.
documents = list(map(str.split, data.clean_text))

In [None]:
# WORD2VEC hyper-parameters
W2V_SIZE = 300       # dimensionality of each word vector
W2V_WINDOW = 7       # context window size
W2V_EPOCH = 16       # number of passes over the corpus (used by train())
W2V_MIN_COUNT = 10   # words rarer than this are left out of the vocabulary

# gensim 4.0 renamed the `size` keyword to `vector_size`. Pick whichever name
# the installed release accepts so the cell runs on both 3.x and 4.x.
_w2v_size_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

w2v_model = gensim.models.word2vec.Word2Vec(window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8,
                                            **{_w2v_size_kwarg: W2V_SIZE})

In [None]:
# Scan the tokenized tweets to build the model's vocabulary before training.
w2v_model.build_vocab(documents)

In [None]:
# gensim 4.0 removed `wv.vocab` in favour of `wv.key_to_index`; fall back to
# the old attribute only when the new one is absent (gensim 3.x).
_vocab = getattr(w2v_model.wv, "key_to_index", None)
if _vocab is None:
    _vocab = w2v_model.wv.vocab
vocab_len = len(_vocab)
print("Vocab size", vocab_len)

In [None]:
%%time
# Train on the same corpus the vocabulary was built from, for W2V_EPOCH passes.
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

In [None]:
# Sanity check: nearest neighbours of "airlines" in the learned embedding space.
w2v_model.wv.most_similar("airlines")

In [None]:
# Length of one embedding vector (second axis of the embedding matrix).
w2v_model.wv.vectors.shape[1]

In [None]:
# Shape of the embedding looked up for a single word.
w2v_model.wv["american"].shape

In [None]:
def get_mean_vector(sentence):
  """Return the average Word2Vec embedding of a sentence.

  The sentence is tokenized with NLTK, tokens unknown to `w2v_model` are
  skipped, and the remaining word vectors are averaged element-wise.

  Args:
    sentence: text to embed (str).

  Returns:
    A 1-D numpy array whose length is the model's vector size. It is all
    zeros when no token of the sentence is in the vocabulary.
  """
  keyed_vectors = w2v_model.wv
  tokens = nltk.word_tokenize(sentence)
  # remove out-of-vocabulary words
  words = [word for word in tokens if word in keyed_vectors]
  if not words:
      # Size the fallback from the model instead of hard-coding 300, so it
      # stays correct if the embedding dimension is changed.
      return np.zeros(keyed_vectors.vector_size)
  # Index through `.wv`: indexing the Word2Vec model object directly is
  # deprecated in gensim 3.x and was removed in gensim 4.0.
  return np.mean(keyed_vectors[words], axis=0)


In [None]:
# Try the averaging on a single cleaned tweet (row position 7).
content_vector = get_mean_vector(data["clean_text"].iloc[7])
content_vector

In [None]:
# Embed every cleaned tweet; each cell of the new column holds one mean vector.
data["content_vector"] = data["clean_text"].map(get_mean_vector)
data

In [None]:
# Cleaned text of the row at position 10.
data["clean_text"].iloc[10]

In [None]:
# Shape of the mean vector stored for that same row.
data["content_vector"].iloc[10].shape

In [None]:
# Stack the per-tweet vectors into one (n_tweets, vector_dim) feature matrix.
# The shape is derived from the data rather than hard-coded as (14640, 300),
# and a single vstack replaces the row-by-row iloc loop.
# The float64 cast keeps the dtype of the original np.zeros buffer.
X = np.vstack(data["content_vector"].tolist()).astype(np.float64)

X.shape

In [None]:
# Target vector: the integer sentiment labels as a numpy array.
y = data["airline_sentiment_label"].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Inspect the training labels.
y_train

In [None]:
from sklearn import svm

# Support-vector classifier with default settings apart from probability=True.
# NOTE(review): no visible cell calls predict_proba, so probability=True may
# only be adding training cost — confirm whether it is needed.
SVM = svm.SVC(probability=True)

# `fit` holds whatever SVM.fit returns; the prediction cell calls predict on it.
fit = SVM.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows.
y_pred = fit.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy itself is
# symmetric, but the conventional order keeps this call consistent with other
# metrics where the order matters.
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are true
# labels and columns are predictions, matching the axes drawn by
# plot_confusion_matrix. Passing them swapped transposed the matrix.
cm = confusion_matrix(y_test, y_pred)

# Names follow the sorted label values -1, 0, 1 produced by sentiment().
class_names = ['Negative', 'Neutral', 'Positive']
# One tick per class: three labels on range(2) raises a ValueError in matplotlib.
tick_positions = range(len(class_names))

# plot_confusion_matrix creates its own figure, so no separate plt.figure()
# call is needed (it only produced an extra empty figure).
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Reds)
plt.xticks(tick_positions, class_names, fontsize=16, color='black')
plt.yticks(tick_positions, class_names, fontsize=16)
plt.show()