In [71]:
import os 
import tweepy 
from dotenv import load_dotenv 
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
import pandas as pd 
import numpy as np 


In [72]:
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('turkish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/toygunozyurek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/toygunozyurek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [73]:
consumer_key = os.getenv('consumer_key')
consumer_key_secret = os.getenv('consumer_key_secret')
twitter_acces_token = os.getenv('twitter_acces_token')
twitter_acces_token_secret = os.getenv('twitter_acces_token_secret')


auth = (tweepy.OAuthHandler(consumer_key, consumer_key_secret, twitter_acces_token, twitter_acces_token_secret))

api = tweepy.API(auth)

In [74]:
load_dotenv()

True

In [75]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding = 'ISO-8859-1',header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   0       1600000 non-null  int64 
 1   1       1600000 non-null  int64 
 2   2       1600000 non-null  object
 3   3       1600000 non-null  object
 4   4       1600000 non-null  object
 5   5       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [77]:
df  = df[[0,5]] # 0 is the sentiment and 5 is the tweet 
df.columns = ['sentiment','tweet']

In [78]:
df['sentiment']  = df['sentiment'].map({0:0,4:1,2:2}) # Replace 4 with 1 and 0 with 0 and 2 with 2 

In [79]:
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    text = ' '.join(text)
    return text

In [80]:
df['cleaned_tweet'] = df['tweet'].apply(preprocess_text)

# Temizlenmiş tweetler ve etiketler
tweets = df['cleaned_tweet'].tolist()
labels = df['sentiment'].tolist()

In [81]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

In [82]:
X_train , X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state= 42)

In [83]:
tokenizer  = Tokenizer(num_words = 5000, split = ' ')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


In [84]:
max_len = max(max(len(x) for x in X_train), max(len(x) for x in X_test))
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [85]:
X_train = pad_sequences(X_train)
X_test = pad_sequences(X_test)

In [86]:
model = Sequential()
model.add(Embedding(input_dim = 5000,output_dim = 128, input_length = max_len))



In [87]:
model.add(SpatialDropout1D(0.2))

In [88]:
model.add(LSTM(150,dropout=0.2,recurrent_dropout = 0.2))

In [89]:
model.add(Dense(3, activation='softmax'))

In [90]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 40, 128)           640000    
                                                                 
 spatial_dropout1d_2 (Spati  (None, 40, 128)           0         
 alDropout1D)                                                    
                                                                 
 lstm_2 (LSTM)               (None, 150)               167400    
                                                                 
 dense_2 (Dense)             (None, 3)                 453       
                                                                 
Total params: 807853 (3.08 MB)
Trainable params: 807853 (3.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [91]:
print(type(X_train), type(y_train))
print(type(X_test), type(y_test))


<class 'numpy.ndarray'> <class 'list'>
<class 'numpy.ndarray'> <class 'list'>


In [92]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Hedef verilerin şekillerini kontrol edin ve tek boyutlu hale getirin
if y_train.ndim != 1:
    y_train = y_train.reshape(-1)
if y_test.ndim != 1:
    y_test = y_test.reshape(-1)

In [93]:
print(type(X_train), type(y_train))
print(type(X_test), type(y_test))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [94]:
print(X_train.shape)
print(X_test.shape)


(1280000, 40)
(320000, 40)


In [95]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)


Epoch 1/10
20000/20000 - 1916s - loss: 0.4301 - accuracy: 0.7992 - val_loss: 0.4056 - val_accuracy: 0.8138 - 1916s/epoch - 96ms/step
Epoch 2/10
20000/20000 - 1955s - loss: 0.3996 - accuracy: 0.8175 - val_loss: 0.3959 - val_accuracy: 0.8194 - 1955s/epoch - 98ms/step
Epoch 3/10
20000/20000 - 1907s - loss: 0.3872 - accuracy: 0.8246 - val_loss: 0.3913 - val_accuracy: 0.8222 - 1907s/epoch - 95ms/step
Epoch 4/10
20000/20000 - 1976s - loss: 0.3796 - accuracy: 0.8287 - val_loss: 0.3884 - val_accuracy: 0.8236 - 1976s/epoch - 99ms/step
Epoch 5/10
20000/20000 - 2007s - loss: 0.3739 - accuracy: 0.8318 - val_loss: 0.3886 - val_accuracy: 0.8242 - 2007s/epoch - 100ms/step
Epoch 6/10
20000/20000 - 2009s - loss: 0.3702 - accuracy: 0.8337 - val_loss: 0.3884 - val_accuracy: 0.8249 - 2009s/epoch - 100ms/step
Epoch 7/10
20000/20000 - 2017s - loss: 0.3674 - accuracy: 0.8355 - val_loss: 0.3894 - val_accuracy: 0.8243 - 2017s/epoch - 101ms/step
Epoch 8/10
20000/20000 - 1968s - loss: 0.3651 - accuracy: 0.8364 -

<keras.src.callbacks.History at 0x1791a7ed0>

In [96]:
new_tweets = []
print("Lütfen analiz etmek istediğiniz tweetleri girin (bitirmek için 'q' tuşuna basın):")
while True:
    tweet = input("Tweet: ")
    if tweet.lower() == 'q':
        break
    new_tweets.append(tweet)


Lütfen analiz etmek istediğiniz tweetleri girin (bitirmek için 'q' tuşuna basın):


In [97]:
preprocessed_new_tweets = [preprocess_text(tweet) for tweet in new_tweets]

# Yeni tweetleri sayısal değerlere dönüştürme
new_sequences = tokenizer.texts_to_sequences(preprocessed_new_tweets)

# Yeni tweetleri pad etme
new_padded = pad_sequences(new_sequences, maxlen=max_len)

# Tahmin yapma
predictions = model.predict(new_padded)



In [99]:
sentiment_labels = ['Negative', 'Neutral', 'Positive']
for tweet, prediction in zip(new_tweets, predictions):
    sentiment = sentiment_labels[prediction.argmax()]
    print(f"Tweet: {tweet}")
    print(f"Predicted Sentiment: {sentiment}")
    print("\n")

Tweet: bugün çok iyi hissediyorum.
Predicted Sentiment: Neutral


