In [8]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
import scipy.sparse

In [9]:
# Load and preprocess the data
df = pd.read_csv('Data.csv', encoding='ISO-8859-1')

In [10]:
# Split data into training and testing sets based on date
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']

In [11]:
# Prepare text data - concatenate headlines into a single string for each row
train_headlines = [' '.join(str(x) for x in train.iloc[row, 2:27]) for row in range(len(train))]
test_headlines = [' '.join(str(x) for x in test.iloc[row, 2:27]) for row in range(len(test))]


In [12]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.9)
X_train = vectorizer.fit_transform(train_headlines)
X_test = vectorizer.transform(test_headlines)

In [13]:
# Calculate sentiment polarity using TextBlob for each headline and add as a feature
train_sentiment = [TextBlob(headline).sentiment.polarity for headline in train_headlines]
test_sentiment = [TextBlob(headline).sentiment.polarity for headline in test_headlines]


In [14]:
# Stack sentiment features with TF-IDF features
X_train = scipy.sparse.hstack((X_train, scipy.sparse.csr_matrix(train_sentiment).T))
X_test = scipy.sparse.hstack((X_test, scipy.sparse.csr_matrix(test_sentiment).T))

In [15]:
# Extract labels for training and testing
y_train = train['Label']
y_test = test['Label']

In [16]:
# Initialize and train the XGBoost classifier
xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)


In [17]:
# Make predictions on the test set
predictions = xgb_model.predict(X_test)

In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

# Display the results
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8412698412698413
Confusion Matrix:
[[152  34]
 [ 26 166]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       186
           1       0.83      0.86      0.85       192

    accuracy                           0.84       378
   macro avg       0.84      0.84      0.84       378
weighted avg       0.84      0.84      0.84       378

