In [None]:
import pandas as pd 
import nltk
import re 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import TfidfVectorizer

Ancak metin gösterimi adımına geçmeden önce, veriyi ön işlemeden geçirerek temizlenmiş bir veri kümesi elde etmeliyiz.

## Veri kümesini yükleme ve temizleme

Veri kümesi olarak, [Twitter US Airline Sentiment](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) veri kümesini kullanacağız.

In [None]:
# Remote folder that hosts the datasets used by this notebook.
DATA_DIR = "https://media.githubusercontent.com/media/yapay-ogrenme/casgem-eu-project-training-on-data-mining/main/PART2/Day14-NLP/notebooks/datasets/"

# Full URL of the airline-sentiment tweets CSV inside DATA_DIR.
DATASET_PATH = f"{DATA_DIR}twitter_airline_sentiment_tweets.csv"

In [None]:
# Read the CSV located at DATASET_PATH into a DataFrame.
df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the loaded frame.
df.info()

In [None]:
# Bar chart of the number of rows per distinct airline_sentiment value.
sns.countplot(x = "airline_sentiment", data = df)

## Temel Ön İşlemler


In [None]:
# Keep only the sentiment label and the tweet text.
# .copy() makes `data` an independent frame: later cells add columns to it,
# and doing that on a selection taken from `df` can raise pandas'
# SettingWithCopyWarning (in pandas versions without copy-on-write).
data = df[["airline_sentiment", "text"]].copy()
data

In [None]:
def sentiment(x):
    """Translate a sentiment name into its numeric label.

    Returns 1 for 'positive', -1 for 'negative', and 0 for any other value.
    The comparison is exact, so differently-cased strings map to 0.
    """
    if x == 'positive':
        return 1
    return -1 if x == 'negative' else 0

In [None]:
# Attach the numeric label produced by sentiment() as a new column.
# `assign` returns a new frame rather than writing into `data` in place, so
# the cell is safe to re-run and does not write into a selection taken from
# `df` (which can raise SettingWithCopyWarning).
data = data.assign(
    airline_sentiment_label=data["airline_sentiment"].apply(sentiment)
)
data

In [None]:
import nltk
from nltk.corpus import stopwords
# Tokenizer model and stop-word corpus used by clean_the_tweet below.
# NOTE(review): recent NLTK releases reportedly look up 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
# Kept under the original name (this rebinds `stopwords` from the corpus
# reader to a plain list of English stop words).
stopwords = stopwords.words('english')
# Set copy of the list above for constant-time membership tests.
_STOPWORD_SET = frozenset(stopwords)

def clean_the_tweet(text):
    """Return a lowercased, stop-word-free version of a tweet.

    Steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Tokenize with nltk.word_tokenize and lowercase each token.
      3. Drop tokens found in the English stop-word list.
      4. Discard the first two remaining tokens and join the rest with
         single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; empty if two or fewer tokens remain after step 3.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    # Lowercase BEFORE the stop-word test. NLTK's English stop-word list is
    # lowercase, so testing the original casing let capitalised stop words
    # such as "The" or "I" slip through the filter.
    tokens = [token.lower() for token in nltk.word_tokenize(letters_only)]
    tokens = [token for token in tokens if token not in _STOPWORD_SET]
    # NOTE(review): the first two tokens are dropped unconditionally —
    # presumably to strip the leading "@airline" mention; confirm this is
    # intended, since it also removes the first content word.
    return ' '.join(tokens[2:])

In [None]:
# Sanity check: show the tweet with index label 11 before and after cleaning.
sample_tweet = data.text[11]
print(f"Original Text : {sample_tweet}")
print()
print(f"Preprocessed Text : {clean_the_tweet(sample_tweet)}")

In [None]:
# Clean every tweet and store the result in a new `clean_text` column.
# `assign` returns a new frame rather than writing into `data` in place, so
# the cell is safe to re-run and does not write into a selection taken from
# `df` (which can raise SettingWithCopyWarning).
data = data.assign(clean_text=data["text"].map(clean_the_tweet))
data.head()

In [None]:
# Raw tweet text of the row at position 10.
data.iloc[10].text

In [None]:
# Cleaned text of the same row (position 10), for comparison with the raw tweet.
data.iloc[10].clean_text

In [None]:
# Display the frame, now including the label and clean_text columns.
data

## TF-IDF

In [None]:
# Build the TF-IDF document-term matrix from the cleaned tweets.
# NOTE(review): the vectorizer is fitted on the whole `clean_text` column,
# i.e. before the train/test split performed further down, so the vocabulary
# and IDF weights also reflect the rows that end up in the test split.
# Consider splitting first and calling fit_transform on the training texts only.
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True)
X_tf_idf = vectorizer.fit_transform(data.clean_text)

In [None]:
# Dimensions of the TF-IDF matrix returned by fit_transform.
X_tf_idf.shape

In [None]:
from sklearn.model_selection import train_test_split

# Split features and numeric labels into train and test parts.
# No test_size/train_size is passed, so scikit-learn's default proportions
# apply; random_state=42 makes the shuffle reproducible.
# NOTE(review): the split is not stratified — if the sentiment classes are
# imbalanced, consider passing stratify=data['airline_sentiment_label'].
x_train, x_test, y_train, y_test = train_test_split(X_tf_idf, data['airline_sentiment_label'], random_state=42)

## SVM

In [None]:
from sklearn import svm

# Support-vector classifier with probability estimates enabled.
# With probability=True scikit-learn shuffles the data internally to calibrate
# the probabilities; random_state pins that shuffle so predict_proba results
# are reproducible across runs (same seed as the train/test split).
SVM = svm.SVC(probability=True, random_state=42)

# `fit` holds the return value of SVM.fit(...) and is used for prediction below.
fit = SVM.fit(x_train, y_train)

In [None]:
# Predict labels for the test split; `fit` holds the return value of SVM.fit(...).
y_pred = fit.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test tweets whose predicted label equals the true label.
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy itself is
# symmetric, so the value is the same, but the call now matches the API.
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Numeric labels produced by sentiment() and their display names, in the
# same order (-1 negative, 0 neutral, 1 positive).
SENTIMENT_LABELS = [-1, 0, 1]
SENTIMENT_NAMES = ['Negative', 'Neutral', 'Positive']

# scikit-learn expects (y_true, y_pred): rows are true labels and columns are
# predictions. Passing them the other way round transposes the matrix.
# `labels` pins the row/column order so it always matches SENTIMENT_NAMES.
cm = confusion_matrix(y_test, y_pred, labels=SENTIMENT_LABELS)

# plot_confusion_matrix creates its own figure, so no separate plt.figure()
# call is needed (it would only leave an empty extra figure behind).
fig, ax = plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Reds)

# One tick per class: three labels need three tick positions.
tick_positions = range(len(SENTIMENT_NAMES))
ax.set_xticks(tick_positions)
ax.set_xticklabels(SENTIMENT_NAMES, fontsize=16, color='black')
ax.set_yticks(tick_positions)
ax.set_yticklabels(SENTIMENT_NAMES, fontsize=16)
plt.show()