In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud

In [2]:
# Ensure NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [3]:
# Read the JSON data using Pandas
data_1 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
data_2 = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)

In [4]:
# Combine the data into a single DataFrame
data = pd.concat([data_1, data_2])
data.head()
len(data)

55328

In [5]:
# Drop duplicates
data = data.drop_duplicates()
len(data)

28617

In [6]:
# Clean text function
def clean_text(text):
    text = text.lower()
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = pattern.sub('', text)
    text = " ".join(filter(lambda x: x[0] != '@', text.split()))
    emoji = re.compile("["
                       u"\U0001F600-\U0001FFFF"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)
    text = emoji.sub(r'', text)
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    return text

In [7]:
# Tokenize and clean text
def clean_tokenize(df):
    head_lines = []
    lines = df["headline"].values.tolist()
    stop_words = set(stopwords.words("english"))
    
    for line in lines:
        line = clean_text(line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha() and word not in stop_words]
        head_lines.append(words)
        
    return head_lines


In [8]:
# Tokenize and preprocess data
head_lines = clean_tokenize(data)
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(head_lines)
sequences = tokenizer_obj.texts_to_sequences(head_lines)
word_index = tokenizer_obj.word_index
vocab_size = len(word_index) + 1
max_length = 25

lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
sentiment = data['is_sarcastic'].values
print(sentiment)

[1 0 0 ... 0 1 1]


In [9]:
# Shuffle and split data
indices = np.arange(lines_pad.shape[0])
np.random.shuffle(indices)
lines_pad = lines_pad[indices]
sentiment = sentiment[indices]

validation_split = 0.3
num_validation_samples = int(validation_split * lines_pad.shape[0])
X_train_pad = lines_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_test_pad = lines_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

print(X_train_pad,y_train)
print(X_test_pad,y_test)


[[    2  4824  3346 ...     0     0     0]
 [  238   648  6049 ...     0     0     0]
 [ 9881   409   583 ...     0     0     0]
 ...
 [14506   792   304 ...     0     0     0]
 [ 1537  3526   270 ...     0     0     0]
 [   57  1413 16295 ...     0     0     0]] [1 1 0 ... 1 1 1]
[[ 6262    74     3 ...     0     0     0]
 [    2   719 12151 ...     0     0     0]
 [   35  4253   762 ...     0     0     0]
 ...
 [ 7457   647  2058 ...     0     0     0]
 [ 2787   587  2259 ...     0     0     0]
 [  144   391  1676 ...     0     0     0]] [1 0 0 ... 0 1 0]


In [10]:
# Load GloVe embeddings
embeddings_index = {}
embedding_dim = 100
GLOVE_DIR = "D:\ML\content"
with open(os.path.join(GLOVE_DIR, 'glove.twitter.27B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [11]:
# Prepare embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [12]:
# Create embedding layer
embedding_layer = Embedding(vocab_size,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)




In [13]:
# Build the model
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.25, input_shape=(max_length, embedding_dim)))
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(3, activation='softmax'))  # Output layer for 3 classes


  super().__init__(**kwargs)


In [14]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])


In [15]:
# Build the model by providing a sample input
model.build(input_shape=(None, max_length))


In [16]:
# Print model summary
print('Summary of the built model...')
print(model.summary())


Summary of the built model...


None


In [17]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - acc: 0.6620 - loss: 0.6106 - val_acc: 0.7486 - val_loss: 0.5160
Epoch 2/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - acc: 0.7428 - loss: 0.5278 - val_acc: 0.7800 - val_loss: 0.4722
Epoch 3/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - acc: 0.7663 - loss: 0.4908 - val_acc: 0.7877 - val_loss: 0.4576
Epoch 4/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - acc: 0.7804 - loss: 0.4654 - val_acc: 0.7958 - val_loss: 0.4382
Epoch 5/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - acc: 0.7920 - loss: 0.4494 - val_acc: 0.8093 - val_loss: 0.4159
Epoch 6/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - acc: 0.8013 - loss: 0.4283 - val_acc: 0.8112 - val_loss: 0.4117
Epoch 7/10
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step 

In [18]:
#model.save('sentiment_sarcasm_model.keras')


In [19]:
# Function to preprocess and clean the input text
def preprocess_input(text):
    text = clean_text(text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    words = [word for word in tokens if word.isalpha() and word not in stop_words]
    return words

In [20]:
# Function to predict sentiment and sarcasm
def predict_sentiment_and_sarcasm(text, model, tokenizer, max_length):
    words = preprocess_input(text)
    sequences = tokenizer.texts_to_sequences([words])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    prediction = model.predict(padded_sequences)
    sentiment = 'Positive' if prediction >= 0.5 else 'Negative'
    sarcasm = 'Sarcastic' if prediction >= 0.5 else 'Not Sarcastic'
    return sentiment, sarcasm


In [21]:
# Test the model with an example input
test_input = "richard branson's global-warming donation nearly as much as cost of failed balloon trips"
sentiment, sarcasm = predict_sentiment_and_sarcasm(test_input, model, tokenizer_obj, max_length)

print(f"Sentiment: {sentiment}")
print(f"Sarcasm: {sarcasm}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
Sentiment: Negative
Sarcasm: Not Sarcastic


In [22]:
# Test the model with an example input
test_input = "richard branson's global-warming donation nearly as much as cost of failed balloon trips😨😰😥😓"
sentiment, sarcasm = predict_sentiment_and_sarcasm(test_input, model, tokenizer_obj, max_length)

print(f"Sentiment: {sentiment}")
print(f"Sarcasm: {sarcasm}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Sentiment: Negative
Sarcasm: Not Sarcastic


In [23]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import emoji

# Load the saved model
model = load_model('sentiment_sarcasm_model.h5')

# Initialize tokenizer (ensure it matches the one used during training)
def initialize_tokenizer():
    # Example tokenization initialization. Replace with actual tokenizer if saved and loaded.
    return Tokenizer()

tokenizer_obj = initialize_tokenizer()

# Define emoji mappings
positive_emojis = ["😀", "😃", "😄", "😁", "😆", "😅", "😂", "🤣", "😊", "😇", "🙂", "😉", "😌", "😍", "🥰", "😘", "😗", "😙", "😚", "😋", "😛", "😝", "😜", "🤪", "😎", "🤩", "🥳", "😏", "😬", "🤗"]
negative_emojis = ["🥹", "🥲", "☺️", "😐", "😑", "😶", "🙃", "😶‍🌫", "🤔", "🫣", "🤭", "🫡", "🫢", "🫡", "🤫", "🫠", "🤥", "😶", "🫥", "😐", "🫤", "😑", "🫨", "🙄", "😯", "😦", "😧", "😮", "😲", "🥱", "😴", "🤤", "😪", "😵", "🤐", "🥴", "🤢", "🤧", "😷", "🤒", "🤕", "🤑", "🤠"]
neutral_emojis = ["😞", "😔", "😟", "😕", "🙁", "☹️", "😣", "😖", "😫", "😩", "🥺", "😢", "😭", "😤", "😠", "😡", "🤬", "🤯", "😳", "🥵", "🥶", "😱", "😨", "😰", "😥", "😓", "😞", "😧", "😦", "😈", "👿", "👹", "👺", "💩", "😵", "😿"]

# Define functions for sentiment analysis
def detect_emojis(text):
    return [char for char in text if emoji.is_emoji(char)]

def classify_emojis(emojis):
    positive_count = sum(1 for e in emojis if e in positive_emojis)
    negative_count = sum(1 for e in emojis if e in negative_emojis)
    neutral_count = sum(1 for e in emojis if e in neutral_emojis)
    
    return {
        "positive_count": positive_count,
        "negative_count": negative_count,
        "neutral_count": neutral_count
    }

def preprocess_input(text, tokenizer, max_length):
    tokens = tokenizer.texts_to_sequences([text])
    padded_input = pad_sequences(tokens, maxlen=max_length, padding='post')
    return padded_input

def predict_sarcasm(text, model, tokenizer, max_length):
    padded_input = preprocess_input(text, tokenizer, max_length)
    prediction = model.predict(padded_input)
    return 'Sarcastic' if prediction >= 0.5 else 'Not Sarcastic'

def predict_sentence_sentiment(text, model, tokenizer, max_length):
    padded_input = preprocess_input(text, tokenizer, max_length)
    prediction = model.predict(padded_input)
    return 'Positive' if prediction >= 0.5 else 'Negative'

def get_final_sentiment(text, model, tokenizer, max_length):
    detected_emojis = detect_emojis(text)
    emoji_counts = classify_emojis(detected_emojis)
    
    sarcasm = predict_sarcasm(text, model, tokenizer, max_length)
    sentence_sentiment = predict_sentence_sentiment(text, model, tokenizer, max_length)
    
    # Determine emoji sentiment
    if emoji_counts['positive_count'] > emoji_counts['negative_count']:
        emoji_sentiment = 'Positive'
    elif emoji_counts['negative_count'] > emoji_counts['positive_count']:
        emoji_sentiment = 'Negative'
    else:
        emoji_sentiment = 'Neutral'
    
    # Combine results
    if sarcasm == 'Sarcastic':
        final_sentiment = 'Sarcastic'
    elif emoji_sentiment == 'Positive':
        final_sentiment = 'Positive'
    elif emoji_sentiment == 'Negative':
        final_sentiment = 'Negative'
    else:
        final_sentiment = sentence_sentiment
    
    return {
        'sentence_sentiment': sentence_sentiment,
        'sarcasm': sarcasm,
        'emoji_sentiment': emoji_sentiment,
        'emoji_counts': emoji_counts,
        'final_sentiment': final_sentiment
    }

# Example usage
text = "richard branson's global-warming donation nearly as much as cost of failed balloon trips😨😰😥😓"
final_results = get_final_sentiment(text, model, tokenizer_obj, max_length)

# Print results
print(f"Sentence Sentiment: {final_results['sentence_sentiment']}")
print(f"Sarcasm: {final_results['sarcasm']}")
print(f"Emoji Sentiments: {final_results['emoji_sentiment']}")
print(f"Emoji Counts: {final_results['emoji_counts']}")
print(f"Final Sentiment: {final_results['final_sentiment']}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Sentence Sentiment: Negative
Sarcasm: Not Sarcastic
Emoji Sentiments: Neutral
Emoji Counts: {'positive_count': 0, 'negative_count': 0, 'neutral_count': 4}
Final Sentiment: Negative


In [24]:
text = "Oh, great. Another Monday. I was just hoping for a reason to hate my life more"
final_results = get_final_sentiment(text, model, tokenizer_obj, max_length)

# Print results
print(f"Sentence Sentiment: {final_results['sentence_sentiment']}")
print(f"Sarcasm: {final_results['sarcasm']}")
print(f"Emoji Sentiments: {final_results['emoji_sentiment']}")
print(f"Emoji Counts: {final_results['emoji_counts']}")
print(f"Final Sentiment: {final_results['final_sentiment']}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Sentence Sentiment: Negative
Sarcasm: Not Sarcastic
Emoji Sentiments: Neutral
Emoji Counts: {'positive_count': 0, 'negative_count': 0, 'neutral_count': 0}
Final Sentiment: Negative


In [25]:
text = "frustrated jesus christ forced to find 22nd vessel for reincarnation after death of charles manson"
final_results = get_final_sentiment(text, model, tokenizer_obj, max_length)

# Print results
print(f"Sentence Sentiment: {final_results['sentence_sentiment']}")
print(f"Sarcasm: {final_results['sarcasm']}")
print(f"Emoji Sentiments: {final_results['emoji_sentiment']}")
print(f"Emoji Counts: {final_results['emoji_counts']}")
print(f"Final Sentiment: {final_results['final_sentiment']}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Sentence Sentiment: Negative
Sarcasm: Not Sarcastic
Emoji Sentiments: Neutral
Emoji Counts: {'positive_count': 0, 'negative_count': 0, 'neutral_count': 0}
Final Sentiment: Negative


In [26]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import emoji

# Load the saved model
model = load_model('sentiment_sarcasm_model.h5')

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])


# Initialize tokenizer (ensure it matches the one used during training)
def initialize_tokenizer():
    # Load your tokenizer here
    tokenizer = Tokenizer()
    # Example of loading saved tokenizer
    # with open('tokenizer.pickle', 'rb') as handle:
    #     tokenizer = pickle.load(handle)
    return tokenizer

tokenizer_obj = initialize_tokenizer()

# Define emoji mappings
positive_emojis = ["😀", "😃", "😄", "😁", "😆", "😅", "😂", "🤣", "😊", "😇", "🙂", "😉", "😌", "😍", "🥰", "😘", "😗", "😙", "😚", "😋", "😛", "😝", "😜", "🤪", "😎", "🤩", "🥳", "😏", "😬", "🤗"]
negative_emojis = ["🥹", "🥲", "☺️", "😐", "😑", "😶", "🙃", "😶‍🌫", "🤔", "🫣", "🤭", "🫡", "🫢", "🫡", "🤫", "🫠", "🤥", "😶", "🫥", "😐", "🫤", "😑", "🫨", "🙄", "😯", "😦", "😧", "😮", "😲", "🥱", "😴", "🤤", "😪", "😵", "🤐", "🥴", "🤢", "🤧", "😷", "🤒", "🤕", "🤑", "🤠"]
neutral_emojis = ["😞", "😔", "😟", "😕", "🙁", "☹️", "😣", "😖", "😫", "😩", "🥺", "😢", "😭", "😤", "😠", "😡", "🤬", "🤯", "😳", "🥵", "🥶", "😱", "😨", "😰", "😥", "😓", "😞", "😧", "😦", "😈", "👿", "👹", "👺", "💩", "😵", "😿"]

# Define functions for sentiment analysis
def detect_emojis(text):
    return [char for char in text if emoji.is_emoji(char)]

def classify_emojis(emojis):
    positive_count = sum(1 for e in emojis if e in positive_emojis)
    negative_count = sum(1 for e in emojis if e in negative_emojis)
    neutral_count = sum(1 for e in emojis if e in neutral_emojis)
    
    return {
        "positive_count": positive_count,
        "negative_count": negative_count,
        "neutral_count": neutral_count
    }

def preprocess_input(text, tokenizer, max_length):
    tokens = tokenizer.texts_to_sequences([text])
    padded_input = pad_sequences(tokens, maxlen=max_length, padding='post')
    return padded_input

def predict_sarcasm(text, model, tokenizer, max_length):
    padded_input = preprocess_input(text, tokenizer, max_length)
    prediction = model.predict(padded_input)
    # Adjust logic according to the shape of prediction
    if len(prediction[0]) == 2:  # Binary classification
        return 'Sarcastic' if prediction[0][1] > prediction[0][0] else 'Not Sarcastic'
    else:
        return 'Not Sarcastic'  # Default case, if shape is not as expected

def predict_sentence_sentiment(text, model, tokenizer, max_length):
    padded_input = preprocess_input(text, tokenizer, max_length)
    prediction = model.predict(padded_input)
    # Adjust logic according to the shape of prediction
    if len(prediction[0]) == 3:  # Assuming model has 3 outputs for sentiment classification
        return 'Positive' if prediction[0][2] > prediction[0][1] and prediction[0][2] > prediction[0][0] else 'Negative'
    else:
        return 'Negative'  # Default case, if shape is not as expected

def get_final_sentiment(text, model, tokenizer, max_length):
    detected_emojis = detect_emojis(text)
    emoji_counts = classify_emojis(detected_emojis)
    
    sarcasm = predict_sarcasm(text, model, tokenizer, max_length)
    sentence_sentiment = predict_sentence_sentiment(text, model, tokenizer, max_length)
    
    # Determine emoji sentiment
    if emoji_counts['positive_count'] > emoji_counts['negative_count']:
        emoji_sentiment = 'Positive'
    elif emoji_counts['negative_count'] > emoji_counts['positive_count']:
        emoji_sentiment = 'Negative'
    else:
        emoji_sentiment = 'Neutral'
    
    # Combine results
    if sarcasm == 'Sarcastic':
        final_sentiment = 'Sarcastic'
    elif emoji_sentiment == 'Positive':
        final_sentiment = 'Positive'
    elif emoji_sentiment == 'Negative':
        final_sentiment = 'Negative'
    else:
        final_sentiment = sentence_sentiment
    
    return {
        'sentence_sentiment': sentence_sentiment,
        'sarcasm': sarcasm,
        'emoji_sentiment': emoji_sentiment,
        'emoji_counts': emoji_counts,
        'final_sentiment': final_sentiment
    }

# Example usage
text = "richard branson's global-warming donation nearly as much as cost of failed balloon trips😨😰😥😓"
max_length = 100  # Example max length, adjust to your model's requirements
final_results = get_final_sentiment(text, model, tokenizer_obj, max_length)

# Print results
print(f"Sentence Sentiment: {final_results['sentence_sentiment']}")
print(f"Sarcasm: {final_results['sarcasm']}")
print(f"Emoji Sentiments: {final_results['emoji_sentiment']}")
print(f"Emoji Counts: {final_results['emoji_counts']}")
print(f"Final Sentiment: {final_results['final_sentiment']}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Sentence Sentiment: Negative
Sarcasm: Not Sarcastic
Emoji Sentiments: Neutral
Emoji Counts: {'positive_count': 0, 'negative_count': 0, 'neutral_count': 4}
Final Sentiment: Negative


In [3]:
import os
import re
import string
import emoji
import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:

# Define emoji mappings
positive_emojis = ["😀", "😃", "😄", "😁", "😆", "😅", "😂", "🤣", "😊", "😇", "🙂", "😉", "😌", "😍", "🥰", "😘", "😗", "😙", "😚", "😋", "😛", "😝", "😜", "🤪", "😎", "🤩", "🥳", "😏", "😬", "🤗"]
negative_emojis = ["😞", "😔", "😟", "😕", "🙁", "☹️", "😣", "😖", "😫", "😩", "🥺", "😢", "😭", "😤", "😠", "😡", "🤬", "🤯", "😳", "🥵", "🥶", "😱", "😨", "😰", "😥", "😓", "😈", "👿", "👹", "👺", "💩", "😿"]
neutral_emojis = ["🥹", "🥲", "☺️", "😐", "😑", "😶", "🙃", "😶‍🌫", "🤔", "🫣", "🤭", "🫡", "🫢", "🫡", "🤫", "🫠", "🤥", "😶", "🫥", "😐", "🫤", "😑", "🫨", "🙄", "😯", "😦", "😧", "😮", "😲", "🥱", "😴", "🤤", "😪", "😵", "🤐", "🥴", "🤢", "🤧", "😷", "🤒", "🤕", "🤑", "🤠"]

# Function to clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Tokenize and clean text
def clean_tokenize(df):
    head_lines = []
    lines = df["headline"].values.tolist()
    stop_words = set(stopwords.words("english"))
    
    for line in lines:
        line = clean_text(line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha() and word not in stop_words]
        head_lines.append(words)
        
    return head_lines

# Define emoji and punctuation sentiment analysis function
def analyze_emoji_punctuation(text):
    emojis = ''.join(c for c in text if c in emoji.EMOJI_DATA)
    punctuations = ''.join(c for c in text if c in string.punctuation)
    
    emoji_counts = {
        'positive_count': sum(1 for e in emojis if e in positive_emojis),
        'negative_count': sum(1 for e in emojis if e in negative_emojis),
        'neutral_count': sum(1 for e in emojis if e in neutral_emojis)
    }
    
    # Determine emoji sentiment
    if emoji_counts['positive_count'] > emoji_counts['negative_count']:
        emoji_sentiment = 'Positive'
    elif emoji_counts['negative_count'] > emoji_counts['positive_count']:
        emoji_sentiment = 'Negative'
    else:
        emoji_sentiment = 'Neutral'
    
    if '!' in punctuations or '!!!' in punctuations:
        punctuation_sentiment = 'Positive'
    elif '...' in punctuations or '??' in punctuations:
        punctuation_sentiment = 'Negative'
    else:
        punctuation_sentiment = 'Neutral'
    
    return emoji_sentiment, punctuation_sentiment

# Define model prediction functions
def predict_sarcasm_sentiment(text, model, tokenizer, max_length):
    padded_input = preprocess_input(text, tokenizer, max_length)
    prediction = model.predict(padded_input)
    sentiment = 'Positive' if prediction >= 0.5 else 'Negative'
    sarcasm = 'Sarcastic' if prediction >= 0.5 else 'Not Sarcastic'
    return sentiment, sarcasm

def preprocess_input(text, tokenizer, max_length):
    tokens = tokenizer.texts_to_sequences([text])
    padded_input = pad_sequences(tokens, maxlen=max_length, padding='post')
    return padded_input

# Define final sentiment analysis function
def get_final_sentiment(text, model, tokenizer, max_length):
    # Detect emojis and classify sentiment using heuristics
    emoji_sentiment, punctuation_sentiment = analyze_emoji_punctuation(text)
    
    # Predict sarcasm and sentence sentiment using the model
    sentence_sentiment, sarcasm = predict_sarcasm_sentiment(text, model, tokenizer, max_length)
    
    # Combine results logically
    if sarcasm == 'Sarcastic':
        if emoji_sentiment == 'Positive' and punctuation_sentiment == 'Positive':
            final_sentiment = 'Sarcastic and Positive'
        elif emoji_sentiment == 'Negative' and punctuation_sentiment == 'Negative':
            final_sentiment = 'Sarcastic and Negative'
        elif emoji_sentiment == 'Positive' or punctuation_sentiment == 'Positive':
            final_sentiment = 'Sarcastic and Slightly Positive'
        elif emoji_sentiment == 'Negative' or punctuation_sentiment == 'Negative':
            final_sentiment = 'Sarcastic and Slightly Negative'
        else:
            final_sentiment = 'Sarcastic'
    else:
        if emoji_sentiment == 'Positive' and punctuation_sentiment == 'Positive':
            final_sentiment = 'Very Positive'
        elif emoji_sentiment == 'Negative' and punctuation_sentiment == 'Negative':
            final_sentiment = 'Very Negative'
        elif emoji_sentiment == 'Positive' or punctuation_sentiment == 'Positive':
            final_sentiment = 'Slightly Positive'
        elif emoji_sentiment == 'Negative' or punctuation_sentiment == 'Negative':
            final_sentiment = 'Slightly Negative'
        else:
            final_sentiment = sentence_sentiment
    
    return {
        'sentence_sentiment': sentence_sentiment,
        'sarcasm': sarcasm,
        'emoji_sentiment': emoji_sentiment,
        'punctuation_sentiment': punctuation_sentiment,
        'final_sentiment': final_sentiment
    }

# Load data (assuming you have a DataFrame 'data' with 'headline' and 'is_sarcastic' columns)
# data = pd.read_csv('your_data.csv')
# Read the JSON data using Pandas
data_1 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
data_2 = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)

data = pd.concat([data_1, data_2])
data = data.drop_duplicates()
#data.head()
#len(data)
# Tokenize and preprocess data
head_lines = clean_tokenize(data)
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(head_lines)
sequences = tokenizer_obj.texts_to_sequences(head_lines)
word_index = tokenizer_obj.word_index
vocab_size = len(word_index) + 1
max_length = 25

lines_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
sentiment = data['is_sarcastic'].values
print(sentiment)

# Example text
#text = "richard branson's global-warming donation nearly as much as cost of failed balloon trips😨😰😥😓😊❤️"
            #Sentence Sentiment: Positive
            #Sarcasm: Sarcastic
            #Emoji Sentiment: Negative
            #Punctuation Sentiment: Neutral
            #Final Sentiment: Sarcastic and Slightly Negative

text = "Oh, great. Another Monday. I was just hoping for a reason to hate my life more😣😖"

            # Sentence Sentiment: Negative
            # Sarcasm: Not Sarcastic
            # Emoji Sentiment: Negative
            # Punctuation Sentiment: Neutral
            # Final Sentiment: Slightly Negative

# Load the model
sarcasm_model_path = 'sentiment_sarcasm_model.h5'
sarcasm_model = load_model(sarcasm_model_path)

# Get final sentiment results
final_results = get_final_sentiment(text, sarcasm_model, tokenizer_obj, max_length)

# Print results
print(f"Sentence Sentiment: {final_results['sentence_sentiment']}")
print(f"Sarcasm: {final_results['sarcasm']}")
print(f"Emoji Sentiment: {final_results['emoji_sentiment']}")
print(f"Punctuation Sentiment: {final_results['punctuation_sentiment']}")
print(f"Final Sentiment: {final_results['final_sentiment']}")


[1 0 0 ... 0 1 1]




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
Sentence Sentiment: Negative
Sarcasm: Not Sarcastic
Emoji Sentiment: Negative
Punctuation Sentiment: Neutral
Final Sentiment: Slightly Negative


In [5]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

# After fitting tokenizer on texts
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(head_lines)  # Assuming head_lines is your preprocessed text data

# Save the tokenizer
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
