In [2]:
import pandas as pd
from snorkel.labeling import LabelingFunction
from snorkel.preprocess import preprocessor
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
df = pd.read_csv(r"C:\Users\chara\Downloads\abcnews-date-text.csv")

In [4]:
df = df.drop(['publish_date'], axis=1)
df = df.rename(columns={'headline_text': 'text'})
df['text'] = df['text'].astype(str)

# Check the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1244184 non-null  object
dtypes: object(1)
memory usage: 9.5+ MB


In [5]:
analyzer = SentimentIntensityAnalyzer()

In [6]:
POSITIVE = 1
NEGATIVE = 2
NEUTRAL = 0
ABSTAIN = -1

In [7]:
# Set up a preprocessor function to determine polarity using VADER
@preprocessor(memoize=True)
def vader_sentiment(x):
    scores = analyzer.polarity_scores(x.text)
    x.compound = scores['compound']
    return x

# Labeling function using VADER compound score
@labeling_function(pre=[vader_sentiment])
def vader_polarity(x):
    if x.compound >= 0.05:
        return POSITIVE
    elif x.compound <= -0.05:
        return NEGATIVE
    else:
        return NEUTRAL

# Additional keyword-based labeling functions
@labeling_function()
def keyword_positive(x):
    keywords = ["good", "great", "excellent", "positive", "fortunate", "correct", "superior"]
    return POSITIVE if any(word in x.text.lower() for word in keywords) else ABSTAIN

@labeling_function()
def keyword_negative(x):
    keywords = ["bad", "terrible", "awful", "negative", "unfortunate", "wrong", "inferior"]
    return NEGATIVE if any(word in x.text.lower() for word in keywords) else ABSTAIN


In [8]:
# Combine all the labeling functions
lfs = [vader_polarity, keyword_positive, keyword_negative]

# Apply the LFs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)

# Apply the label model
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_snorkel)

100%|███████████████████████████████████████████████████████████████████████| 1244184/1244184 [21:53<00:00, 947.00it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                                       | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.296]
  8%|██████▎                                                                        | 8/100 [00:00<00:01, 78.75epoch/s]INFO:root:[10 epochs]: TRAIN:[loss=0.162]
INFO:root:[20 epochs]: TRAIN:[loss=0.035]
INFO:root:[30 epochs]: TRAIN:[loss=0.001]
INFO:root:[40 epochs]: TRAIN:[loss=0.001]
 43%|█████████████████████████████████                                            | 43/100 [00:00<00:00, 234.33epoch/s]INFO:root:[50 epochs]: TRAIN:[loss=0.002]
INFO:root:[60 epochs]: TRAIN:[loss=0.001]
INFO:root:[70 epochs]: TRAIN:[loss=0.000]
 79%|████████████████████████████████████████████████████████████▊                | 79/100 [00:00<00:00, 289.61epoch/s]INFO:root:[80 ep

In [9]:
df["label"] = label_model.predict(L=L_snorkel)

In [10]:
df = df.loc[df.label.isin([0, 1, 2]), :]

In [11]:
df['label'].value_counts()

label
0    562599
2    425745
1    255840
Name: count, dtype: int64

In [12]:
# Store headlines and labels in respective lists
text = list(df['text'])
labels = list(df['label'])

In [22]:
# Split data into training and testing sets
training_text = text[:400000]
testing_text = text[400000:600000]
training_labels = labels[:400000]
testing_labels = labels[400000:600000]

In [23]:
# Preprocess text data
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(training_text)
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')


In [24]:
# Convert lists into numpy arrays to make it work with TensorFlow
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [25]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

In [26]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [27]:
# Model summary
model.summary()

In [28]:
# Train the model
num_epochs = 10
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), 
                    verbose=2)

Epoch 1/10
12500/12500 - 62s - 5ms/step - accuracy: 0.7523 - loss: 0.5889 - val_accuracy: 0.9267 - val_loss: 0.2871
Epoch 2/10
12500/12500 - 61s - 5ms/step - accuracy: 0.9304 - loss: 0.2414 - val_accuracy: 0.9303 - val_loss: 0.2519
Epoch 3/10
12500/12500 - 62s - 5ms/step - accuracy: 0.9445 - loss: 0.1978 - val_accuracy: 0.8679 - val_loss: 0.3589
Epoch 4/10
12500/12500 - 62s - 5ms/step - accuracy: 0.9497 - loss: 0.1784 - val_accuracy: 0.9506 - val_loss: 0.1809
Epoch 5/10
12500/12500 - 63s - 5ms/step - accuracy: 0.9555 - loss: 0.1567 - val_accuracy: 0.9548 - val_loss: 0.1630
Epoch 6/10
12500/12500 - 63s - 5ms/step - accuracy: 0.9585 - loss: 0.1447 - val_accuracy: 0.9583 - val_loss: 0.1549
Epoch 7/10
12500/12500 - 61s - 5ms/step - accuracy: 0.9603 - loss: 0.1376 - val_accuracy: 0.9490 - val_loss: 0.1712
Epoch 8/10
12500/12500 - 63s - 5ms/step - accuracy: 0.9617 - loss: 0.1322 - val_accuracy: 0.9536 - val_loss: 0.1732
Epoch 9/10
12500/12500 - 63s - 5ms/step - accuracy: 0.9638 - loss: 0.125

In [45]:
def predict_label(model, tokenizer, text, max_len=120):
    # Preprocess the input text
    sequences = tokenizer.texts_to_sequences([text])
    padded_seqs = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    
    # Predict the class probabilities
    prediction = model.predict(padded_seqs)
    
    # Convert probabilities to class label
    class_labels = {0: 'NEUTRAL', 1: 'POSITIVE', 2: 'NEGATIVE'}
    predicted_label = class_labels[np.argmax(prediction)]
    
    return predicted_label

# Example usage
new_text = "The US imposes sanctions on Russia because of the Ukrainian war"
predicted_label = predict_label(model, tokenizer, new_text)
print(f"Predicted label: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Predicted label: NEGATIVE


In [46]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [47]:
def calculate_error(model, tokenizer, texts, labels, max_len=120):
    # Convert texts to sequences and pad them
    sequences = tokenizer.texts_to_sequences(texts)
    padded_seqs = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    
    # Predict class probabilities
    predictions = model.predict(padded_seqs)
    
    # Convert probabilities to class labels
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, predicted_labels)
    
    # Calculate precision, recall, and F1-score
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predicted_labels, average='weighted')
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    
    return accuracy, precision, recall, f1

# Example usage
accuracy, precision, recall, f1 = calculate_error(model, tokenizer, testing_text, testing_labels)

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 3ms/step
Accuracy: 0.9565
Precision: 0.9567
Recall: 0.9565
F1 Score: 0.9563


In [50]:
model.save(r'C:\Users\chara\Documents\sentiment_model.keras')

In [51]:
import pickle

# Save the tokenizer to a file
with open(r'C:\Users\chara\Documents\tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [1]:
compiled_metrics = model.metrics_names
print("Compiled Metrics:", compiled_metrics)

NameError: name 'model' is not defined