In [None]:
import pandas as pd
import nltk
import json
import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
matplotlib.rcParams["figure.figsize"] = (20,10)
import seaborn as sns
from collections import Counter
import sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Fetch the NLTK resources this notebook asks for: the 'stopwords' corpus and
# the 'punkt_tab' tokenizer data (presumably for the nltk.word_tokenize call
# in the NLTK section below).
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
## loading datasets train and test
train_data = pd.read_csv('/content/sample_data/train.csv')
# The evaluation split must come from its own file: reading train.csv a second
# time would make every "test" metric a score on data the model was fitted on.
test_data = pd.read_csv('/content/sample_data/test.csv')

## loading stopwords dataset
with open('/content/sample_data/turkce_stopwords.json', 'r', encoding='utf-8') as file:
    stopwords_data = json.load(file)
# A set gives constant-time membership checks while filtering words.
turkish_stopwords = set(stopwords_data['stopwords'])


In [None]:
## preview the first rows of the training frame
print("Train Data Sample:", train_data.head(), sep="\n")

In [None]:
################################################# tensorflow ###########################################################

In [None]:
# Work on a reproducible random 50% of each frame (fixed random_state).
# NOTE: both names are rebound to their own sample, so re-running this cell
# halves the data again.
train_data = train_data.sample(frac=0.5, random_state=42)
test_data = test_data.sample(frac=0.5, random_state=42)

# Preprocessing text: Lowercase, remove stopwords, and remove punctuation
def preprocess_text(text, stop_words):
    """Normalize a raw Turkish sentence before tokenization.

    The text is lowercased using Turkish casing for the letters I/İ, stripped
    of every character that is neither alphanumeric nor whitespace, split on
    whitespace and filtered against ``stop_words``.

    Args:
        text: Raw input sentence. Non-string values (for example the float
            NaN pandas uses for missing cells) are treated as empty text.
        stop_words: Container of lowercase words to drop; membership is
            tested with ``in``.

    Returns:
        The remaining words joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ''
    # str.lower() is locale-independent: it turns "I" into "i" (Turkish needs
    # the dotless "ı") and "İ" into "i" followed by a combining dot. Map these
    # two letters explicitly before lowercasing the rest.
    lowered = text.replace('İ', 'i').replace('I', 'ı').lower()
    # Keep only letters, digits and whitespace (drops punctuation and symbols).
    cleaned = ''.join(ch for ch in lowered if ch.isalnum() or ch.isspace())
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

print("\nPreprocessing text data...")
for frame in (train_data, test_data):
    # Store the cleaned sentence next to the raw text of the same row.
    frame['processed_text'] = frame['text'].apply(preprocess_text, args=(turkish_stopwords,))
print("Datasets are ready!")

# Build the vocabulary from the training sentences only
max_features = 5000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_data['processed_text'])

# Turn each sentence into word indices and pad at the end so that every
# sequence has length 100
X_train = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_data['processed_text']),
    maxlen=100,
    padding='post',
)
X_test = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_data['processed_text']),
    maxlen=100,
    padding='post',
)

# Encode the string labels as integers, then as one-hot rows with 3 columns
label_map = {'Negative': 0, 'Notr': 1, 'Positive': 2}
y_train = tf.keras.utils.to_categorical(train_data['label'].map(label_map).values, num_classes=3)
y_test = tf.keras.utils.to_categorical(test_data['label'].map(label_map).values, num_classes=3)

# Define the neural network model
model = tf.keras.Sequential([
    # The vocabulary size follows the tokenizer's num_words instead of a
    # second hard-coded 5000. No input_length is passed: the previous value
    # (500) contradicted the 100-token padded sequences, the argument is
    # deprecated in Keras 3, and the pooling layer below accepts any length.
    tf.keras.layers.Embedding(input_dim=max_features, output_dim=16),
    # Average the token embeddings into one fixed-size vector per sentence.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    # One probability per sentiment class.
    tf.keras.layers.Dense(3, activation='softmax')
])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# categorical_crossentropy matches the one-hot encoded labels.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Fit for three epochs, reporting loss/accuracy on (X_test, y_test) after each
print("\nTraining the model...")
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=64,
)

# Collapse predicted probabilities and one-hot targets back to class indices
y_pred = model.predict(X_test)
y_test_classes = tf.argmax(y_test, axis=1)
y_pred_classes = tf.argmax(y_pred, axis=1)

accuracy = 100 * accuracy_score(y_test_classes, y_pred_classes)
print(f"\nAccuracy: {accuracy:.2f}%")
print(classification_report(y_test_classes, y_pred_classes))


# Function to predict sentiment of new text
def predict_sentiment(text):
    """Classify one sentence with the Keras model trained in this cell.

    The sentence goes through the same steps as the training data:
    ``preprocess_text``, the fitted ``tokenizer`` and post-padding to 100
    tokens. (A single definition replaces the two earlier ones: the first
    called ``.numpy().decode()`` on a plain ``str``, and the second fed
    TF-IDF features from a ``vectorizer`` into this neural network.)

    Args:
        text: Raw sentence to classify.

    Returns:
        A ``(sentiment, probabilities)`` tuple: the display name of the most
        likely class and the model's output row, indexed
        0 = Negative, 1 = Neutral, 2 = Positive.
    """
    processed_text = preprocess_text(text, turkish_stopwords)
    sequences = tokenizer.texts_to_sequences([processed_text])
    padded_text = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=100, padding='post')
    # predict() returns one row per input; only one sentence was supplied.
    probabilities = model.predict(padded_text)[0]
    sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    # int() turns the NumPy index into a plain dict key.
    return sentiment_map[int(np.argmax(probabilities))], probabilities


# Test the model with new input
print("\nTest with new input...")


user_input = input("Enter a Turkish sentence for sentiment analysis: ")
try:
    sentiment, probabilities = predict_sentiment(user_input)
    print(f"\nSentiment: {sentiment}")
    print("Probabilities:")
    # Print the class probabilities in index order 0, 1, 2.
    for position, name in enumerate(("Negative", "Neutral", "Positive")):
        print(f"  {name}: {probabilities[position]:.4f}")
except Exception as err:
    # Report the failure instead of stopping the notebook.
    print(f"Error processing input: {err}")



In [None]:
######################################### NLTK ######################################

In [None]:
# Keep a reproducible random 70% of the current train_data (frac=0.7).
# NOTE: train_data is rebound to its own sample, so this compounds with any
# earlier sampling of the same variable and shrinks again on every re-run.
train_data = train_data.sample(frac=0.7, random_state=42)

## lowercase the text, then remove stopwords and punctuation
def preprocess_text(text, stop_words):
    """Tokenize a Turkish sentence with NLTK and drop stopwords/punctuation.

    Args:
        text: Raw input sentence. Non-string values (for example the float
            NaN pandas uses for missing cells) are treated as empty text.
        stop_words: Container of lowercase words to drop; membership is
            tested with ``in``.

    Returns:
        The kept tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ''
    # str.lower() turns "İ" into "i" plus a combining dot, which makes the
    # whole token fail isalnum() below and silently disappear, and it turns
    # "I" into "i" where Turkish needs "ı". Map both letters explicitly first.
    lowered = text.replace('İ', 'i').replace('I', 'ı').lower()
    words = nltk.word_tokenize(lowered)
    # Purely alphanumeric tokens only: this removes punctuation tokens.
    return ' '.join(word for word in words if word.isalnum() and word not in stop_words)

In [None]:
## add a cleaned-text column to both frames
print("\nPreprocessing text data...")
train_data['processed_text'] = train_data['text'].apply(preprocess_text, stop_words=turkish_stopwords)
test_data['processed_text'] = test_data['text'].apply(preprocess_text, stop_words=turkish_stopwords)
print("datasets are ready")

In [None]:
## tf-idf vectorization
vectorizer = TfidfVectorizer(max_features=5000)
# Keep the TF-IDF matrices sparse. MultinomialNB accepts sparse input for both
# fit and predict, while a dense rows x 5000 float64 array costs about 40 KB
# per document and can exhaust memory on a large corpus.
X_train = vectorizer.fit_transform(train_data['processed_text'])
# The test texts reuse the vocabulary and IDF weights learned on train.
X_test = vectorizer.transform(test_data['processed_text'])
y_train = train_data['label']
y_test = test_data['label']

In [None]:
## fit a multinomial Naive Bayes classifier on the TF-IDF features
model = MultinomialNB().fit(X_train, y_train)
## score it on the test features
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}%")
## row-normalized confusion matrix as a heatmap
conf_matrix = confusion_matrix(y_test, y_pred)
# Divide every row by its total so each cell is a share of that true class.
conf_matrix_normalized = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
plt.figure(figsize=(8, 6))
class_names = ['Negative', 'Neutral', 'Positive']
ax = sns.heatmap(
    conf_matrix_normalized,
    annot=True,
    fmt='.2%',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title(f'Normalized Confusion Matrix (Accuracy: {accuracy:.2f}%)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
## classify a single new sentence with the Naive Bayes model
def predict_sentiment(text):
    """Return the display name of the predicted label and the class probabilities.

    The sentence is cleaned with ``preprocess_text``, vectorized with the
    fitted ``vectorizer`` and passed to ``model``. The second element of the
    result is the first row of ``model.predict_proba``.
    """
    display_names = {'Negative': 'Negative', 'Notr': 'Neutral', 'Positive': 'Positive'}
    features = vectorizer.transform([preprocess_text(text, turkish_stopwords)]).toarray()
    label = model.predict(features)[0]
    return display_names[label], model.predict_proba(features)[0]


In [None]:
## ask for a sentence and report the prediction
print("\nTest with new input...")
user_input = input("Enter a Turkish sentence for sentiment analysis: ")
sentiment, probabilities = predict_sentiment(user_input)
print(f"\nSentiment: {sentiment}", "Probabilities:", sep="\n")
# Print the probabilities in index order 0, 1, 2.
for position, name in enumerate(("Negative", "Neutral", "Positive")):
    print(f"  {name}: {probabilities[position]:.4f}")

In [None]:
#############################  data process ########################

In [None]:

file_path = "/content/turkce_stopwords.json"
with open(file_path, encoding="utf-8") as file:
    data = json.load(file)

# NOTE: this rebinds `stopwords`, shadowing the nltk.corpus.stopwords import
# at the top of the notebook.
stopwords = data["stopwords"]
stopwords_df = pd.DataFrame(stopwords, columns=["Stopwords"])

# Show the first ten entries.
print(stopwords_df.iloc[:10])

In [None]:
# Load test.csv for exploratory inspection and preview 10 random rows.
df1 = pd.read_csv(r'/content/test.csv')
df1.sample(10)

In [None]:
# (rows, columns) of the test frame.
df1.shape

In [None]:
# Column labels as a pandas Index.
df1.columns

In [None]:
# The same column labels as a plain list (iterating a DataFrame yields its column names).
list(df1)

In [None]:
# Distinct values found in the label column.
df1['label'].unique()

In [None]:
# Number of rows per label value.
df1['label'].value_counts()

In [None]:
# Copy of the test frame without the 'text' column, followed by its shape.
# NOTE(review): the feature-engineering cell for the train frame drops
# 'dataset' instead — confirm that 'text' is the column meant here.
df2 = df1.drop(['text'], axis='columns')
df2.shape

In [None]:
# Count of missing values in each column.
df2.isnull().sum()

In [None]:
# Drop every row that has a missing value, then recount the nulls per column
# to confirm none remain.
df3=df2.dropna()
df3.isnull().sum()

In [None]:
# Preview 5 random rows of the null-free frame.
df3.sample(5)

In [None]:
################## train ######################

In [None]:
# Load train.csv for exploratory inspection and preview 10 random rows.
df4 = pd.read_csv(r'/content/train.csv')
df4.sample(10)

In [None]:
# (rows, columns) of the train frame.
df4.shape

In [None]:
# Column labels as a pandas Index.
df4.columns

In [None]:
# Distinct values found in the text column.
df4['text'].unique()

In [None]:
# How often each distinct text occurs; counts above 1 are repeated texts.
df4['text'].value_counts()

In [None]:
# Distinct values found in the 'dataset' column.
# NOTE(review): presumably the source corpus of each row — verify against the data.
df4['dataset'].unique()

In [None]:
# Number of rows per 'dataset' value.
df4['dataset'].value_counts()

In [None]:
# Distinct values found in the label column.
df4['label'].unique()

In [None]:
# Number of rows per label value (the class balance of the train frame).
df4['label'].value_counts()

In [None]:
# Bar chart with the number of training rows per sentiment label.
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df4, x='label', palette="viridis")
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()

In [None]:
############################# Feature engineering #############################################

In [None]:
# Remove the 'dataset' column. errors='ignore' keeps this cell re-runnable:
# df4 is rebound to the result, so on a second run the column is already gone
# and a plain drop would raise KeyError.
df4 = df4.drop(columns=['dataset'], errors='ignore')
# Only a cell's last expression is displayed, so the shape is printed explicitly.
print(df4.shape)
df4.sample(5)

In [None]:
# One subset of the train frame per sentiment label.
df4_po, df4_ne, df4_not = (
    df4[df4['label'] == sentiment]
    for sentiment in ('Positive', 'Negative', 'Notr')
)

In [None]:
# Write each per-label subset to its own CSV file, without the index column.
for subset, filename in (
    (df4_po, "positive_sentiments.csv"),
    (df4_ne, "negative_sentiments.csv"),
    (df4_not, "notr_sentiments.csv"),
):
    subset.to_csv(filename, index=False)