In [22]:
# importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk, re, string
from string import punctuation
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import itertools
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from textblob import TextBlob

In [2]:
# Load the raw sentence/sentiment CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
csv_path = "C:/Users/Adithya/OneDrive/Documents/Projects/Project-2/financial_sentiment_data.csv"
df = pd.read_csv(csv_path, index_col=False)

In [3]:
# Display the rows that exactly repeat an earlier row (the first occurrence is not flagged).
df.loc[df.duplicated()]

Unnamed: 0,Sentence,Sentiment
1825,Proha Plc ( Euronext :7327 ) announced today (...,neutral
1859,SSH Communications Security Corporation is hea...,neutral
2672,Ahlstrom 's share is quoted on the NASDAQ OMX ...,neutral
3272,The company serves customers in various indust...,neutral
3986,The issuer is solely responsible for the conte...,neutral
5342,The report profiles 614 companies including ma...,neutral


In [4]:
# Drop exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [5]:
# For every distinct sentence, count how many *different* sentiment labels it carries.
# The resulting frame has the columns "Sentence" and "Sentiment" (the distinct-label count).
df_grouped = df.groupby("Sentence")["Sentiment"].nunique().reset_index()

In [6]:
# Sentences that were labelled with more than one distinct sentiment.
df_duplicates = df_grouped.loc[df_grouped["Sentiment"].gt(1)]

In [7]:
# Inner-join the data with the conflicting sentences to pull out their rows.
# Both frames have a "Sentiment" column, so the merge emits "Sentiment_x" (label from df)
# and "Sentiment_y" (distinct-label count from df_duplicates).
df_repeated = df.merge(df_duplicates, on="Sentence", how="inner")

In [8]:
# Remove every row whose sentence occurs more than once; keep=False drops all copies,
# not just the later ones.
df.drop_duplicates(subset=["Sentence"], keep=False, inplace=True)

In [9]:
# Discard the conflicting original label column produced by the merge.
df_repeated.drop(columns="Sentiment_x", inplace=True)

In [10]:
# Discard the distinct-label count column produced by the merge.
df_repeated.drop(columns="Sentiment_y", inplace=True)

In [11]:
# With the label columns gone, collapse the repeated rows to one row per sentence.
df_repeated.drop_duplicates(keep="first", inplace=True)

In [12]:
# Summary of the conflicting sentences (count / unique / most frequent value).
df_repeated.describe()

Unnamed: 0,Sentence
count,514
unique,514
top,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...
freq,1


In [13]:
def text_normalization(text):
    """Normalise a raw sentence to lower-case ASCII words separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs (``http://``, ``https://`` and ``www.`` forms);
      3. replace every ASCII punctuation character with a space;
      4. delete everything that is neither an ASCII letter nor whitespace
         (digits and any remaining special characters);
      5. collapse whitespace runs (newlines included) to one space and trim the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing but removable characters was present.
    """
    text = text.lower()
    # URLs have to go first: once punctuation is replaced by spaces the
    # "://" and "." that the pattern relies on no longer exist.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Punctuation becomes a space (not '') so "up-to-date" does not fuse into one token.
    punc = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punc)
    # Digits and leftover non-ASCII symbols are deleted outright.
    text = re.sub(r'[^a-z\s]+', '', text)
    # Collapse whitespace last, so the deletions above cannot leave double or edge spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [14]:
# Clean every sentence of the uniquely-labelled data.
df['Sentence'] = df['Sentence'].map(text_normalization)

In [15]:
# Clean every sentence of the conflicting-label data the same way.
df_repeated['Sentence'] = df_repeated['Sentence'].map(text_normalization)

In [16]:
lemmatizer = WordNetLemmatizer()
# English stop words, with "not" taken out of the set so negations stay in the text.
Stopwords = set(nltk.corpus.stopwords.words("english"))
Stopwords.discard("not")

In [17]:
def preprocess_text(text):
    """Lower-case, tokenize, drop stop words and lemmatize; return the tokens joined by spaces.

    Uses the module-level ``Stopwords`` set and ``lemmatizer``.
    """
    kept = (
        lemmatizer.lemmatize(tok)
        for tok in nltk.word_tokenize(text.lower())
        if tok not in Stopwords
    )
    return " ".join(kept)

In [18]:
# Stop-word removal and lemmatization for the uniquely-labelled data.
df["Sentence"] = df['Sentence'].map(preprocess_text)

In [19]:
# Stop-word removal and lemmatization for the conflicting-label data.
df_repeated["Sentence"] = df_repeated['Sentence'].map(preprocess_text)

In [23]:
def getPolarity(Clean_reviews):
    """Return the TextBlob sentiment polarity of the given text."""
    blob = TextBlob(Clean_reviews)
    return blob.sentiment.polarity


# Score every conflicting sentence so it can be relabelled from its polarity.
df_repeated['Polarity'] = df_repeated['Sentence'].map(getPolarity)

In [24]:
# Polarity cut-offs used to turn a score into a sentiment label.
pos_threshold = 0.1
# Original author's note, kept verbatim (the metric is not stated):
# "while taking -0.1 we are getting 8.49%"
neg_threshold = -0.05


def get_sentiment(polarity):
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    Scores at or above ``pos_threshold`` are positive, scores at or below
    ``neg_threshold`` are negative, and everything in between is neutral.
    """
    if polarity >= pos_threshold:
        return 'positive'
    if polarity <= neg_threshold:
        return 'negative'
    return 'neutral'

In [25]:
# Give each conflicting sentence a single label derived from its polarity score.
df_repeated['Sentiment'] = df_repeated['Polarity'].map(get_sentiment)

In [26]:
# The raw score is no longer needed once the label has been assigned.
df_repeated.drop(columns='Polarity', inplace=True)

In [27]:
# Peek at the first five relabelled sentences.
df_repeated.head()

Unnamed: 0,Sentence,Sentiment
0,ssh communication security corp stock exchange...,positive
2,viking line canceled service,neutral
4,dolce gabbana asked european union declare mar...,neutral
6,diluted earnings per share eps fell eur eur,neutral
8,finnish geosentric net sale decreased eur janu...,negative


In [28]:
# Stack the uniquely-labelled rows (original labels) on top of the relabelled
# conflicting sentences (polarity-derived labels).
df3 = pd.concat([df, df_repeated], axis=0)

In [29]:
# Replace the index carried over from the two source frames with a fresh 0..n-1 index.
df3 = df3.reset_index(drop=True)

In [30]:
# One sub-frame per sentiment class.
df_pos, df_neu, df_neg = (
    df3[df3['Sentiment'] == label] for label in ('positive', 'neutral', 'negative')
)

In [31]:
# Flat list of every whitespace-separated token in the positive sentences.
positive_corpus = ' '.join(df_pos['Sentence'])
tokenized_text_pos = positive_corpus.split()

In [32]:
# NOTE(review): this import sits mid-notebook; the other imports are grouped in the first cell.
from collections import Counter
# Frequency of each token across the positive sentences.
word_counts_pos = Counter(tokenized_text_pos)

In [33]:
# Flat token list and token frequencies for the neutral sentences.
neutral_corpus = ' '.join(df_neu['Sentence'])
tokenized_text_neu = neutral_corpus.split()
word_counts_neu = Counter(tokenized_text_neu)

In [34]:
# Flat token list and token frequencies for the negative sentences.
negative_corpus = ' '.join(df_neg['Sentence'])
tokenized_text_neg = negative_corpus.split()
word_counts_neg = Counter(tokenized_text_neg)

In [35]:
# Flat token list and token frequencies over all sentences, regardless of class.
df_all = df3["Sentence"]
full_corpus = ' '.join(df_all)
tokenized_text_all = full_corpus.split()
word_counts_all = Counter(tokenized_text_all)

In [36]:
# Build, for each class, the list of distinct tokens in first-seen order, then
# find the tokens that occur in all three classes.
#
# dict.fromkeys de-duplicates while keeping insertion order, which replaces the
# earlier "if item not in list: append" loops (quadratic in the token count)
# with a single linear pass that yields the same lists.

# unique tokens of the positive sentences
unique_items_pos = list(dict.fromkeys(tokenized_text_pos))
print(unique_items_pos)
# unique tokens of the negative sentences
unique_items_neg = list(dict.fromkeys(tokenized_text_neg))
print(unique_items_neg)
# unique tokens of the neutral sentences
unique_items_neu = list(dict.fromkeys(tokenized_text_neu))
print(unique_items_neu)

# Tokens present in the positive, negative and neutral vocabularies alike,
# in the order they first appear in the positive sentences.
# Sets make each membership test constant-time.
negative_vocab = set(unique_items_neg)
neutral_vocab = set(unique_items_neu)
common_words = [
    word for word in unique_items_pos
    if word in negative_vocab and word in neutral_vocab
]





In [37]:
# NOTE(review): repeats the word_counts_all computation from cell In [35]; the result is the same.
word_counts_all = Counter(tokenized_text_all)

In [38]:
# Plain dict of token -> count over all sentences.
counts_dict_all = {word: count for word, count in word_counts_all.items()}

In [39]:
# Turn the token -> count dictionary into a frame: one row per token, with the
# token in a column named "index" and its frequency in "count of words".
df_1 = pd.DataFrame.from_dict(
    counts_dict_all, orient='index', columns=['count of words']
).reset_index()

In [40]:
# Give the token column a meaningful name.
df_1 = df_1.rename(columns={'index': 'words'})

In [41]:
# Most frequent tokens first.
df_1 = df_1.sort_values(by='count of words', ascending=False)

In [42]:
# Keep only the tokens that occur in all three sentiment classes.
filtered_df = df_1.loc[df_1["words"].isin(common_words)]

In [43]:
# The 75 most frequent of the shared tokens (the frame is already sorted by count).
df_2 = filtered_df.iloc[:75]

In [44]:
# Only the tokens themselves are needed from here on.
df_2 = df_2.drop(columns='count of words')

In [45]:
# The 75 shared high-frequency tokens as a plain Python list.
words_75 = list(df_2['words'])

In [46]:
# Remove the 75 shared high-frequency tokens from each token list.
#
# Each list used to be prefixed with "the words of words_75 that are NOT in the
# token list". That term could only ever add the frequent words back, which is
# the opposite of the stated intent. It is also always empty in this notebook,
# because words_75 is drawn from tokens present in every class. It is dropped.
# Membership is tested against a set rather than the list.
words_75_set = set(words_75)

# positive tokens without the 75 words
pos_words = [word for word in tokenized_text_pos if word not in words_75_set]
# negative tokens without the 75 words
neg_words = [word for word in tokenized_text_neg if word not in words_75_set]
# neutral tokens without the 75 words
neu_words = [word for word in tokenized_text_neu if word not in words_75_set]
# all tokens without the 75 words
all_words = [word for word in tokenized_text_all if word not in words_75_set]

In [47]:
# Re-tokenize each remaining positive token, keep pieces longer than two
# characters, then flatten.
# NOTE(review): last_token is reassigned by the negative and neutral cells too,
# so only the final assignment survives.
tokens_pos = list(map(word_tokenize, pos_words))
pos_tokens = [[piece for piece in pieces if len(piece) > 2] for pieces in tokens_pos]
last_token = list(itertools.chain.from_iterable(pos_tokens))

In [49]:
# Re-tokenize each remaining negative token, keep pieces longer than two
# characters, then flatten.
# NOTE(review): last_token is shared with the positive and neutral cells and is
# overwritten here.
tokens_neg = list(map(word_tokenize, neg_words))
neg_tokens = [[piece for piece in pieces if len(piece) > 2] for pieces in tokens_neg]
last_token = list(itertools.chain.from_iterable(neg_tokens))

In [50]:
# Re-tokenize each remaining neutral token, keep pieces longer than two
# characters, then flatten.
# NOTE(review): last_token is shared with the positive and negative cells and is
# overwritten here.
tokens_neu = list(map(word_tokenize, neu_words))
neu_tokens = [[piece for piece in pieces if len(piece) > 2] for pieces in tokens_neu]
last_token = list(itertools.chain.from_iterable(neu_tokens))

In [51]:
# Token frequencies of the positive class after the 75 shared words were removed.
wc_pos_words = Counter(pos_words)

In [52]:
# Token frequencies of the negative class after the 75 shared words were removed.
wc_neg_words = Counter(neg_words)

In [53]:
# Token frequencies of the neutral class after the 75 shared words were removed.
wc_neu_words = Counter(neu_words)

In [54]:
# Token frequencies over all sentences after the 75 shared words were removed.
wc_all_words = Counter(all_words)

In [55]:
# Keep only the words of a sentence that belong to an allowed vocabulary.
def remove_words_not_in_set(sentence, word_set):
    """Return `sentence` with every word that is not in `word_set` removed.

    Args:
        sentence: Text to filter; it is split on whitespace.
        word_set: Collection of allowed words. A list or tuple is converted to a
            set once per call, so membership tests do not rescan the sequence
            for every word of the sentence.

    Returns:
        The surviving words joined by single spaces, in their original order
        (an empty string if none survive).
    """
    if isinstance(word_set, (list, tuple)):
        try:
            word_set = set(word_set)
        except TypeError:
            # Unhashable entries: fall back to scanning the sequence as given.
            pass
    return ' '.join(word for word in sentence.split() if word in word_set)

In [56]:
# Filter every sentence against the vocabulary of its own sentiment class.
# The vocabularies are hashed once here; passing the raw token lists made each
# word lookup scan the whole list.
# NOTE(review): each class vocabulary appears to be that class's own tokens minus
# the 75 shared words, so every branch presumably just drops those 75 words.
# Confirm before simplifying.
vocab_by_sentiment = {
    'positive': set(pos_words),
    'negative': set(neg_words),
}
# Any label other than positive/negative uses the neutral vocabulary.
neutral_vocab = set(neu_words)

filtered_sentences = [
    remove_words_not_in_set(sentence, vocab_by_sentiment.get(sentiment, neutral_vocab))
    for sentence, sentiment in zip(df3['Sentence'], df3['Sentiment'])
]

In [57]:
# Attach the filtered text as a new column, aligned row-for-row with df3.
df3['Filtered_Sentence'] = filtered_sentences

In [58]:
# Keep only the model input text and its label.
df3 = df3[["Filtered_Sentence", "Sentiment"]]

In [59]:
# Encode the string labels as integers: negative -> 0, positive -> 1, neutral -> 2.
# NOTE(review): re-running this cell maps the already-encoded integers to NaN,
# so run it once per kernel session.
sentiment_codes = {"negative": 0, "positive": 1, "neutral": 2}
df3["Sentiment"] = df3["Sentiment"].map(sentiment_codes)

In [60]:
# Model input (filtered text) and target (encoded sentiment).
X, y = df3['Filtered_Sentence'], df3['Sentiment']

In [61]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_t_xg, X_test_t_xg, y_train_t_xg, y_test_t_xg = split_parts

In [62]:
from xgboost import XGBClassifier

In [66]:
shapes_t_xg = {}

# TF-IDF features over word 1- to 3-grams, capped at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
# The vocabulary is fitted on the training split only and reused for the test split.
X_train_transformed_t_xg = tfidf_vectorizer.fit_transform(X_train_t_xg)
X_test_transformed_t_xg = tfidf_vectorizer.transform(X_test_t_xg)

# Record the training-matrix shape, keyed by the n-gram range used.
shapes_t_xg[(1, 3)] = X_train_transformed_t_xg.shape

# Per-class sample weights: neutral (2) stays at 1.0, while positive (1) and
# negative (0) get total-rows / class-rows.
class_weights = {
    2: 1.0,
    1: len(y_train_t_xg) / (y_train_t_xg == 1).sum(),
    0: len(y_train_t_xg) / (y_train_t_xg == 0).sum(),
}

# Gradient-boosted tree classifier (XGBoost).
model = XGBClassifier(
    learning_rate=0.2,
    max_depth=30,
    n_estimators=1000,
    gamma=0.5,
    reg_alpha=0.5,
)

# Fit with one weight per training row, looked up from the row's class.
train_sample_weights = [class_weights[label] for label in y_train_t_xg]
model.fit(X_train_transformed_t_xg, y_train_t_xg, sample_weight=train_sample_weights)

# Predict the held-out rows.
y_pred_t_xg = model.predict(X_test_transformed_t_xg)
print(f"y_pred shape: {y_pred_t_xg.shape}")

# Overall accuracy plus support-weighted precision and recall.
accuracy = accuracy_score(y_test_t_xg, y_pred_t_xg)
precision = precision_score(y_test_t_xg, y_pred_t_xg, average='weighted')
recall = recall_score(y_test_t_xg, y_pred_t_xg, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
# Per-class breakdown.
print(metrics.classification_report(y_test_t_xg, y_pred_t_xg))
print('------------------------')

y_pred shape: (1065,)
Accuracy: 0.6938967136150235
Precision: 0.7095212937534215
Recall: 0.6938967136150235
              precision    recall  f1-score   support

           0       0.55      0.53      0.54        95
           1       0.58      0.70      0.64       372
           2       0.81      0.71      0.76       598

    accuracy                           0.69      1065
   macro avg       0.65      0.65      0.65      1065
weighted avg       0.71      0.69      0.70      1065

------------------------


In [67]:
import pickle

In [68]:
# Persist the trained classifier. The context manager closes the file even if
# pickle.dump raises, which the bare open()/close() pair did not guarantee.
with open("xgrab.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)

In [69]:
# Persist the fitted TF-IDF vectorizer next to the model. The context manager
# closes the file even if pickle.dump raises.
with open("termf.pkl", "wb") as pickle_out:
    pickle.dump(tfidf_vectorizer, pickle_out)