In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the news sentiment dataset from a CSV given as a relative path.
file_path = 'news_sentiment_analysis.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to confirm the columns loaded as expected.
data.head(5)

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business
2,4-traders,,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,2024-07-12T22:52:55+00:00,positive,Business
3,4-traders,,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,2024-07-12T22:41:01+00:00,negative,Business
4,PLANET,,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,2024-07-12T22:28:19+00:00,positive,Business


In [5]:
# Print column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Source        3500 non-null   object
 1   Author        2512 non-null   object
 2   Title         3500 non-null   object
 3   Description   3500 non-null   object
 4   URL           3500 non-null   object
 5   Published At  3500 non-null   object
 6   Sentiment     3500 non-null   object
 7   Type          3500 non-null   object
dtypes: object(8)
memory usage: 218.9+ KB


In [6]:
# Count rows per distinct value of the 'Type' column.
data['Type'].value_counts()

Unnamed: 0_level_0,count
Type,Unnamed: 1_level_1
Business,500
Entertainment,500
General,500
Health,500
Science,500
Sports,500
Technology,500


In [7]:
# Force the description column to plain strings before vectorizing.
description_as_text = data['Description'].astype(str)
data['Description'] = description_as_text

# Model inputs are the descriptions; targets are the sentiment strings.
texts, labels = data['Description'], data['Sentiment']

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the sentiment classes once, then map every label to its integer code.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

In [9]:
# Display the integer-encoded sentiment labels produced by the label encoder.
encoded_labels

array([2, 1, 2, ..., 2, 1, 2])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on the encoded labels so both
# splits keep the same sentiment proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    stratify=encoded_labels,
    random_state=42,
)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 1000 terms, with English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

In [12]:
# Fit the TF-IDF vocabulary on the training descriptions only, then reuse the
# fitted vectorizer to transform the test descriptions.
# Both splits get the same string cast so train and test are preprocessed
# identically (the original cast only the test split).
X_train_tfidf = vectorizer.fit_transform(X_train.astype(str))
X_test_tfidf = vectorizer.transform(X_test.astype(str))

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 100 trees with a fixed seed for reproducible results.
model_forest = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [14]:
# Train the baseline forest on the TF-IDF training matrix.
model_forest.fit(X=X_train_tfidf, y=y_train)

In [15]:
# Predict encoded sentiment classes for the held-out test matrix.
y_pred_forest = model_forest.predict(X=X_test_tfidf)

In [16]:
from sklearn.metrics import classification_report, accuracy_score

In [17]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_Forest = accuracy_score(y_true=y_test, y_pred=y_pred_forest)

In [18]:
# Per-class precision/recall/F1 text report, labelled with the original
# sentiment names instead of the integer codes.
report_Forest = classification_report(
    y_true=y_test,
    y_pred=y_pred_forest,
    target_names=label_encoder.classes_,
)

In [19]:
# Show the baseline metrics: accuracy first, then the per-class report.
print(
    f'Accuracy: {accuracy_Forest}',
    'Classification Report:',
    report_Forest,
    sep='\n',
)

Accuracy: 0.8642857142857143
Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.61      0.71       115
     neutral       0.82      0.85      0.83       158
    positive       0.88      0.94      0.91       427

    accuracy                           0.86       700
   macro avg       0.85      0.80      0.82       700
weighted avg       0.86      0.86      0.86       700



In [20]:
# Candidate values for each random forest hyperparameter explored by the search.
n_estimators = list(range(100, 501, 100))        # 100, 200, ..., 500 trees
max_depth = [None] + list(range(10, 51, 10))     # None plus 10, 20, ..., 50
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
max_features = ['sqrt', 'log2']
bootstrap = [True, False]

# Map each estimator parameter name to its list of candidate values.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    bootstrap=bootstrap,
)

In [21]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyperparameter search over the grid defined for the forest.
search_settings = {
    'estimator': model_forest,
    'param_distributions': hyperparameter_grid,
    'n_iter': 50,                  # number of sampled parameter combinations
    'cv': 5,                       # 5-fold cross-validation per combination
    'scoring': 'accuracy',         # use accuracy for classification
    'n_jobs': -1,                  # use all available cores
    'verbose': 5,
    'return_train_score': True,
    'random_state': 42,            # fixed seed so the sampled candidates repeat
}
random_cv = RandomizedSearchCV(**search_settings)

In [22]:
# Run the randomized search on the TF-IDF training matrix.
random_cv.fit(X=X_train_tfidf, y=y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [23]:
# Pull the winning parameter combination and its mean cross-validated score.
best_params, best_score = random_cv.best_params_, random_cv.best_score_

In [24]:
# Report the selected hyperparameters and the score they achieved in the search.
print(
    f'Best parameters: {best_params}',
    f'Best score: {best_score}',
    sep='\n',
)

Best parameters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: 0.8596428571428572


In [25]:
# Take the estimator refit with the best parameters and score the held-out set.
best_model_forest = random_cv.best_estimator_
y_pred = best_model_forest.predict(X=X_test_tfidf)

In [26]:
# Evaluate the tuned forest on the test split with the same metrics as the baseline.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=label_encoder.classes_,
)

print(
    f'Accuracy on test set: {accuracy}',
    'Classification Report:',
    report,
    sep='\n',
)

Accuracy on test set: 0.87
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.63      0.74       115
     neutral       0.82      0.83      0.82       158
    positive       0.88      0.95      0.92       427

    accuracy                           0.87       700
   macro avg       0.87      0.80      0.83       700
weighted avg       0.87      0.87      0.87       700



In [27]:
import pickle

# Save the tuned model to a file using pickle.
# NOTE(review): the file is a pickle, not HDF5, despite the '.h5' extension; the
# name is kept because the loading cell reads this exact path.
# NOTE(review): only the classifier is saved. Predicting in a fresh session also
# needs the fitted `vectorizer` and `label_encoder` - consider persisting them too.
filename = 'sentiment_analysis_model.h5'
# The context manager closes the handle even if pickling raises, so the file is
# fully flushed before anything reads it back.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_forest, model_file)


In [28]:
# Reload the pickled model and classify one hand-written example.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files you created yourself or otherwise trust.
# The context manager guarantees the file handle is closed after loading.
with open('sentiment_analysis_model.h5', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

text_input = "This is a great product! I am very happy with it."
# Reuse the already-fitted vectorizer so the features match what the model was trained on.
text_input_tfidf = vectorizer.transform([text_input])
prediction = loaded_model.predict(text_input_tfidf)
# Map the predicted integer class back to its sentiment string.
predicted_sentiment = label_encoder.inverse_transform(prediction)[0]
print(f"Predicted sentiment: {predicted_sentiment}")


Predicted sentiment: positive


In [30]:
# Sanity check: classify a strongly negative business news sentence.
# NOTE(review): the recorded run labelled this example 'neutral', so the model
# appears to miss clearly negative text - worth investigating.
text_input = "The company's stock plummeted after a disastrous earnings report, leading to widespread layoffs and a loss of investor confidence."

# Vectorize the single sentence, predict its class, then decode the label.
text_input_tfidf = vectorizer.transform([text_input])
prediction = loaded_model.predict(text_input_tfidf)
decoded_predictions = label_encoder.inverse_transform(prediction)
predicted_sentiment = decoded_predictions[0]
print(f"Predicted sentiment: {predicted_sentiment}")


Predicted sentiment: neutral


In [32]:

# Write the tuned model to disk with pickle. This is the same path an earlier
# cell of the notebook already writes, so the file is overwritten.
# NOTE(review): the '.h5' extension is misleading; the content is a pickle.
filename = 'sentiment_analysis_model.h5'
# Use a context manager so the handle is closed and the data flushed.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model_forest, model_file)
