In [8]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
import scipy.sparse

In [9]:
# Load and preprocess the data
df = pd.read_csv('Data.csv', encoding='ISO-8859-1')

In [10]:
# Split data into training and testing sets based on date
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']

In [11]:
# Prepare text data - concatenate headlines into a single string for each row
train_headlines = [' '.join(str(x) for x in train.iloc[row, 2:27]) for row in range(len(train))]
test_headlines = [' '.join(str(x) for x in test.iloc[row, 2:27]) for row in range(len(test))]

In [12]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.9)
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

In [13]:
# Calculate sentiment polarity using TextBlob for each headline and add as a feature
train_sentiment = [TextBlob(headline).sentiment.polarity for headline in train_headlines]
test_sentiment = [TextBlob(headline).sentiment.polarity for headline in test_headlines]

In [14]:
# Normalize sentiment polarity values to be non-negative
train_sentiment = [(sentiment + 1) / 2 for sentiment in train_sentiment]
test_sentiment = [(sentiment + 1) / 2 for sentiment in test_sentiment]

In [15]:
# Stack sentiment features with TF-IDF features
X_train = scipy.sparse.hstack((X_train, scipy.sparse.csr_matrix(train_sentiment).T))
X_test = scipy.sparse.hstack((X_test, scipy.sparse.csr_matrix(test_sentiment).T))

In [16]:
# Extract labels for training and testing
y_train = train['Label']
y_test = test['Label']


In [17]:
# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [18]:
# Make predictions on the test set
predictions = nb_model.predict(X_test)

In [19]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

In [20]:
# Display the results
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.6693121693121693
Confusion Matrix:
[[ 65 121]
 [  4 188]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.35      0.51       186
           1       0.61      0.98      0.75       192

    accuracy                           0.67       378
   macro avg       0.78      0.66      0.63       378
weighted avg       0.77      0.67      0.63       378



In [21]:
%pip install imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


In [22]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
import scipy.sparse
from imblearn.over_sampling import SMOTE

# Load and preprocess the data
df = pd.read_csv('Data.csv', encoding='ISO-8859-1')

# Split data into training and testing sets based on date
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']

# Prepare text data - concatenate headlines into a single string for each row
train_headlines = [' '.join(str(x) for x in train.iloc[row, 2:27]) for row in range(len(train))]
test_headlines = [' '.join(str(x) for x in test.iloc[row, 2:27]) for row in range(len(test))]

# Convert text data into TF-IDF features with increased max_features and ngram_range
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3), min_df=2, max_df=0.9)
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

# Calculate sentiment polarity using TextBlob for each headline and add as a feature
train_sentiment = [TextBlob(headline).sentiment.polarity for headline in train_headlines]
test_sentiment = [TextBlob(headline).sentiment.polarity for headline in test_headlines]

# Normalize sentiment polarity values to be non-negative
train_sentiment = [(sentiment + 1) / 2 for sentiment in train_sentiment]
test_sentiment = [(sentiment + 1) / 2 for sentiment in test_sentiment]

# Stack sentiment features with TF-IDF features
X_train = scipy.sparse.hstack((X_train, scipy.sparse.csr_matrix(train_sentiment).T))
X_test = scipy.sparse.hstack((X_test, scipy.sparse.csr_matrix(test_sentiment).T))

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, train['Label'])

# Initialize and train the Bernoulli Naive Bayes classifier
nb_model = BernoulliNB()
nb_model.fit(X_train_balanced, y_train_balanced)

# Make predictions on the test set
predictions = nb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(test['Label'], predictions)
conf_matrix = confusion_matrix(test['Label'], predictions)
class_report = classification_report(test['Label'], predictions)

# Display the results
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


Accuracy: 0.626984126984127
Confusion Matrix:
[[183   3]
 [138  54]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.98      0.72       186
           1       0.95      0.28      0.43       192

    accuracy                           0.63       378
   macro avg       0.76      0.63      0.58       378
weighted avg       0.76      0.63      0.58       378

