In [44]:
# Import necessary libraries
from google.colab import files
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import gensim.downloader as api
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
# Upload kaggle api from local machine to Google Colab
files.upload()

# Authentication credentials for accessing the Kaggle API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle (1).json


In [46]:
# Download the dataset
!kaggle datasets download -d ankurzing/sentiment-analysis-for-financial-news
!unzip /content/sentiment-analysis-for-financial-news.zip

sentiment-analysis-for-financial-news.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /content/sentiment-analysis-for-financial-news.zip
replace FinancialPhraseBank/License.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: FinancialPhraseBank/License.txt  
  inflating: FinancialPhraseBank/README.txt  
  inflating: FinancialPhraseBank/Sentences_50Agree.txt  
  inflating: FinancialPhraseBank/Sentences_66Agree.txt  
  inflating: FinancialPhraseBank/Sentences_75Agree.txt  
  inflating: FinancialPhraseBank/Sentences_AllAgree.txt  
  inflating: all-data.csv            


In [47]:
# Create Pandas Dataframe
df = pd.read_csv('/content/FinancialPhraseBank/Sentences_75Agree.txt', sep='@', encoding='latin', header=None)

In [48]:
## Text preprocessing

# Convert all text to lowercase
df[0] = df[0].str.lower()

# Remove punctuation marks
df[0] = df[0].apply(lambda x: re.sub(r'[^\w\s]','',x))

# Tokenization
df[0] = df[0].apply(lambda x: word_tokenize(x))

# Stop word removal
stop_words = set(stopwords.words('english'))
df[0] = df[0].apply(lambda x: [word for word in x if word not in stop_words])

# Stemming or Lemmatization
stemmer = PorterStemmer()
df[0] = df[0].apply(lambda x: [stemmer.stem(word) for word in x])

In [49]:
df.head()

Unnamed: 0,0,1
0,"[accord, gran, compani, plan, move, product, r...",neutral
1,"[new, product, plant, compani, would, increas,...",positive
2,"[last, quarter, 2010, componenta, net, sale, d...",positive
3,"[third, quarter, 2010, net, sale, increas, 52,...",positive
4,"[oper, profit, rose, eur, 131, mn, eur, 87, mn...",positive


In [50]:
# Load the GloVe 6B word embedding model
model = api.load("glove-wiki-gigaword-100")

In [51]:
# Define the size of the word embeddings
embedding_size = model.vector_size

# Create a matrix to hold the embeddings for each row
embedding_matrix = np.zeros((len(df), df[0].apply(lambda x: len(x)).max(), embedding_size))

# Iterate over each row in the DataFrame
for i, row in df.iterrows():
    # Iterate over each word in the row
    for j, word in enumerate(row[0]):
        try:
            # Look up the embedding for the word in the GloVe 6B model
            embedding = model[word]
            # Add the embedding to the matrix
            embedding_matrix[i, j, :] = embedding
        except:
            pass

In [52]:
# Define X and y
X = embedding_matrix
y = df[1]

# Reshape the input data
X = np.reshape(X, (X.shape[0], -1))
y = np.reshape(y, (-1,))

In [53]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

In [54]:
# Create a logistic regression with cross-entropy loss
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='auto')

# Train the model
logreg.fit(X_train, y_train)

In [55]:
# Make predictions on the testing data
y_pred = logreg.predict(X_test)

In [56]:
# Compute precision
precision = precision_score(y_test, y_pred, average='micro')

# Compute recall
recall = recall_score(y_test, y_pred, average='micro')

# Compute F1-score
f1 = f1_score(y_test, y_pred, average='micro')

# Print the results
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")

Precision: 0.71, Recall: 0.71, F1-score: 0.71


In [57]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=["positive", "neutral", "negative"])

# Print the confusion matrix
print(cm)

[[ 38  25  14]
 [ 30 196   8]
 [ 13   9  13]]


In [58]:
# Print details of each row and column in confusion matrix
for i, label in enumerate(["positive", "neutral", "negative"]):
    row = cm[i]
    print(f"\n{label} class:")
    print("True positives:", row[i])
    print("False positives:", sum(row) - row[i])
    print("False negatives:", sum(cm[:, i]) - row[i])
    print("True negatives:", sum(cm.flatten()) - sum(row) - sum(cm[:, i]) + row[i])


positive class:
True positives: 38
False positives: 39
False negatives: 43
True negatives: 226

neutral class:
True positives: 196
False positives: 38
False negatives: 34
True negatives: 78

negative class:
True positives: 13
False positives: 22
False negatives: 22
True negatives: 289


In [59]:
# print the count of rows for each class in the 'Class' column
print(y_test.value_counts())

neutral     234
positive     77
negative     35
Name: 1, dtype: int64


In [60]:
# Scale between 0 and 1
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [61]:
# Create a logistic regression with cross-entropy loss
gnb = MultinomialNB()

# Train the model
gnb.fit(X_train, y_train)

In [62]:
# Make predictions on the testing data
y_pred = gnb.predict(X_test)

In [63]:
# Compute precision
precision = precision_score(y_test, y_pred, average='micro')

# Compute recall
recall = recall_score(y_test, y_pred, average='micro')

# Compute F1-score
f1 = f1_score(y_test, y_pred, average='micro')

# Print the results
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")

Precision: 0.71, Recall: 0.71, F1-score: 0.71


In [64]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=["positive", "neutral", "negative"])

# Print the confusion matrix
print(cm)

[[ 27  47   3]
 [ 17 217   0]
 [ 11  22   2]]


In [65]:
# Print details of each row and column in confusion matrix
for i, label in enumerate(["positive", "neutral", "negative"]):
    row = cm[i]
    print(f"\n{label} class:")
    print("True positives:", row[i])
    print("False positives:", sum(row) - row[i])
    print("False negatives:", sum(cm[:, i]) - row[i])
    print("True negatives:", sum(cm.flatten()) - sum(row) - sum(cm[:, i]) + row[i])


positive class:
True positives: 27
False positives: 50
False negatives: 28
True negatives: 241

neutral class:
True positives: 217
False positives: 17
False negatives: 69
True negatives: 43

negative class:
True positives: 2
False positives: 33
False negatives: 3
True negatives: 308
