In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.svm import SVC  # Support Vector Machine
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [26]:
import pandas as pd

# List of encodings to try
encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'windows-1252']  # Add other encodings if necessary

# Try reading the file using different encodings
for encoding in encodings:
    try:
        data = pd.read_csv("US-Economic-News.csv", encoding=encoding)
        print(f"File read successfully with encoding: {encoding}")
        break  # Stop trying different encodings once a successful read occurs
    except Exception as e:
        print(f"Error reading with encoding {encoding}: {e}")


Error reading with encoding utf-8: 'utf-8' codec can't decode byte 0x89 in position 63094: invalid start byte
File read successfully with encoding: latin1


In [27]:
data.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text
0,842613455,False,finalized,3,12/5/15 17:48,3.0,0.64,yes,0.64,wsj_398217788,8/14/91,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...
1,842613456,False,finalized,3,12/5/15 16:54,,,no,1.0,wsj_399019502,8/21/07,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal Online</br></br>The Mo...
2,842613457,False,finalized,3,12/5/15 1:59,,,no,1.0,wsj_398284048,11/14/91,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...
3,842613458,False,finalized,3,12/5/15 2:19,,0.0,no,0.675,wsj_397959018,6/16/86,Manager's Journal: Sniffing Out Drug Abusers I...,,,The statistics on the enormous costs of employ...
4,842613459,False,finalized,3,12/5/15 17:48,3.0,0.3257,yes,0.64,wsj_398838054,10/4/02,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollar's ton...


In [28]:
# Select relevant columns
data = data[['text', 'relevance']]

In [29]:
data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,yes
1,The Wall Street Journal Online</br></br>The Mo...,no
2,WASHINGTON -- In an effort to achieve banking ...,no
3,The statistics on the enormous costs of employ...,no
4,NEW YORK -- Indecision marked the dollar's ton...,yes


In [30]:
data['relevance'] = data['relevance'].map({'yes': 1, 'no': 0})

In [31]:
nan_count = data['relevance'].isna().sum()
print("Number of NaN values in 'relevance' column:", nan_count)


Number of NaN values in 'relevance' column: 9


In [32]:
# data = data.dropna(subset=['relevance'])
data = data.dropna()


In [33]:
data.isnull().sum()

text         0
relevance    0
dtype: int64

In [34]:
# Data Preprocessing - if necessary
# You might need to clean the text, tokenize it, and use TF-IDF vectorization
# For example:

# feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

tfidf = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)
X = tfidf.fit_transform(data['text'])

In [35]:
data.head()

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,1.0
1,The Wall Street Journal Online</br></br>The Mo...,0.0
2,WASHINGTON -- In an effort to achieve banking ...,0.0
3,The statistics on the enormous costs of employ...,0.0
4,NEW YORK -- Indecision marked the dollar's ton...,1.0


In [36]:

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, data['relevance'], test_size=0.2, random_state=42)


In [37]:
# Implement Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [38]:
# Make predictions
predictions = nb_classifier.predict(X_test)


In [39]:
# Evaluation
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions, zero_division=1)
conf_matrix = confusion_matrix(y_test, predictions)


In [40]:
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8136335209505942
Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      1.00      0.90      1302
         1.0       0.00      0.00      0.00       297

    accuracy                           0.81      1599
   macro avg       0.41      0.50      0.45      1599
weighted avg       0.66      0.81      0.73      1599

Confusion Matrix:
 [[1301    1]
 [ 297    0]]


In [41]:
t=['NEW YORK -- Yields on most certificates of de']

z = tfidf.transform(t)

In [42]:
nb_classifier.predict(z)

array([0.])

# SVM

In [43]:

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions, zero_division=1)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)

In [44]:
print("Support Vector Machine (SVM) Classifier:")
print("Accuracy:", svm_accuracy)
print("Classification Report:\n", svm_report)
print("Confusion Matrix:\n", svm_conf_matrix)
print("\n")

Support Vector Machine (SVM) Classifier:
Accuracy: 0.815509693558474
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.99      0.90      1302
         1.0       0.53      0.06      0.11       297

    accuracy                           0.82      1599
   macro avg       0.68      0.52      0.50      1599
weighted avg       0.77      0.82      0.75      1599

Confusion Matrix:
 [[1286   16]
 [ 279   18]]




# Decision Trees

In [45]:
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, y_train)
tree_predictions = tree_classifier.predict(X_test)
tree_accuracy = accuracy_score(y_test, tree_predictions)
tree_report = classification_report(y_test, tree_predictions, zero_division=1)
tree_conf_matrix = confusion_matrix(y_test, tree_predictions)

In [46]:
print("Decision Tree Classifier:")
print("Accuracy:", tree_accuracy)
print("Classification Report:\n", tree_report)
print("Confusion Matrix:\n", tree_conf_matrix)
print("\n")

Decision Tree Classifier:
Accuracy: 0.7554721701063164
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.87      0.85      1302
         1.0       0.31      0.26      0.29       297

    accuracy                           0.76      1599
   macro avg       0.57      0.57      0.57      1599
weighted avg       0.74      0.76      0.75      1599

Confusion Matrix:
 [[1130  172]
 [ 219   78]]




# Logistic Regression

In [47]:
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
lr_predictions = lr_classifier.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_report = classification_report(y_test, lr_predictions, zero_division=1)
lr_conf_matrix = confusion_matrix(y_test, lr_predictions)

In [48]:
print("Logistic Regression Classifier:")
print("Accuracy:", lr_accuracy)
print("Classification Report:\n", lr_report)
print("Confusion Matrix:\n", lr_conf_matrix)

Logistic Regression Classifier:
Accuracy: 0.815509693558474
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.99      0.90      1302
         1.0       0.53      0.06      0.11       297

    accuracy                           0.82      1599
   macro avg       0.68      0.52      0.50      1599
weighted avg       0.77      0.82      0.75      1599

Confusion Matrix:
 [[1286   16]
 [ 279   18]]
