In [1]:
#Load packages
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.svm import SVC
import re

In [2]:
#Load dataset
# Raw string: "\G" is not a valid escape sequence, so the non-raw literal triggers an
# invalid-escape warning on recent Python versions. The path value itself is unchanged.
# NOTE(review): this is an absolute, machine-specific path - point DATA_PATH at your own copy.
DATA_PATH = r"V:\GOLD SENTIMENT ANALYSIS/gold headlines.csv"
df = pd.read_csv(DATA_PATH)

## The Price Sentiment column contains 4 classes (positive, neutral, negative and none)
* Positive denotes upward movement in price
* Negative denotes downward movement in price
* Neutral denotes sideways (steady) movement in price
* None denotes that no assessment about prices can be made from the news headline

### Let us ignore the news headlines that do not have any price movement information in them, i.e. drop rows whose "Price Sentiment" is 'none'

In [3]:
# Keep only the headlines that carry a price signal (rows labelled 'none' are dropped).
has_price_signal = df["Price Sentiment"] != 'none'
df = df[has_price_signal]

In [4]:
# Show the headline text next to its label.
headline_columns = ["News", "Price Sentiment"]
print("Commodity News Headlines")
display(df[headline_columns])

Commodity News Headlines


Unnamed: 0,News,Price Sentiment
0,"april gold down 20 cents to settle at $1,116.1...",negative
1,gold suffers third straight daily decline,negative
2,Gold futures edge up after two-session decline,positive
4,"Gold snaps three-day rally as Trump, lawmakers...",negative
5,"Dec. gold climbs $9.40, or 0.7%, to settle at ...",positive
...,...,...
10565,gold seen falling from 3-week high this week,negative
10566,dominic frisby : now looks like a good time to...,positive
10567,Gold heading for worst week since November on ...,negative
10568,august gold up $7.60 at $878.80 an ounce on nymex,positive


### The following piece of code is used to clean the headlines

In [5]:
def cleaner(impure_data):
    """Strip @mentions, http(s) links and ASCII punctuation from headlines.

    Parameters
    ----------
    impure_data : iterable of str, or str
        Raw headlines (e.g. a list or a pandas Series of strings). A single
        string is treated as one headline rather than being iterated
        character by character.

    Returns
    -------
    list of str
        One cleaned headline per input item, in the same order.

    Notes
    -----
    Matched text is deleted, not replaced by a space, so "two-session"
    becomes "twosession" and "$1,116.1" becomes "11161".
    """
    # A bare string would otherwise be looped over one character at a time.
    if isinstance(impure_data, str):
        impure_data = [impure_data]

    # Raw strings keep "\S" / "\s" / "\]" as intended instead of relying on
    # invalid string escapes; compiling once hoists the work out of the loop.
    mention_re = re.compile(r'@\S+')
    link_re = re.compile(r'http\S+\s*')
    #special characters, but not "emoji"
    punctuation_re = re.compile('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""))

    temp_list = []
    for item in impure_data:
        # Order matters: "@" and the link punctuation are also in the
        # punctuation set, so mentions and links must be removed first.
        item = mention_re.sub('', item)
        item = link_re.sub('', item)
        item = punctuation_re.sub('', item)
        temp_list.append(item)
    return temp_list

### Let us create a simple SVM model with tfidf vectorizer

In [6]:
def headline_sentiment(df, test_size=0.3, random_state=42, output_path="predicted.csv"):
    """Train and evaluate a linear SVM on tf-idf features of the headlines.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "News" column (headline text) and a
        "Price Sentiment" column (class label).
    test_size : float, default 0.3
        Fraction of the rows held out for testing.
    random_state : int or None, default 42
        Seed for the train/test split so that the reported accuracy and
        confusion matrix are reproducible. Pass None to get a different
        random split on every call.
    output_path : str, default "predicted.csv"
        Where the test-set predictions are written as CSV.

    Returns
    -------
    tuple
        (fitted TfidfVectorizer, fitted SVC), for predicting new headlines.

    Side effects
    ------------
    Prints the test accuracy, displays the confusion matrix and the
    per-headline predictions, and writes the predictions to output_path.
    """
    headlines = df["News"]
    polarity = df["Price Sentiment"].tolist()

    #cleaning headlines i.e. removing @mentions, http(s) links and special characters such as punctuations
    clean_headline = cleaner(headlines)

    #initializing tf-idf vectorizer
    tf_idfvectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)

    #splitting the data into train and test dataset (70 : 30 by default), seeded for reproducibility
    X_train, X_test, Y_train, Y_test = train_test_split(
        clean_headline, polarity, test_size=test_size, random_state=random_state
    )

    # The vocabulary and idf weights are learned from the training split only,
    # then reused unchanged on the test split.
    train_corpus_tf_idf = tf_idfvectorizer.fit_transform(X_train)
    test_corpus_tf_idf = tf_idfvectorizer.transform(X_test)

    #using SVC package to initialize a classifier with Linear kernel and other default parameters
    SVM_L = SVC(kernel='linear')

    #fitting the sparse matrix in the classifier with their respective sentiments
    SVM_L.fit(train_corpus_tf_idf, Y_train)

    #predicting the sentiments for the test dataset
    Y_pred = SVM_L.predict(test_corpus_tf_idf)

    #this prints accuracy score for the test dataset
    print("Testing Accuracy:", accuracy_score(Y_test, Y_pred))

    #this prints confusion matrix for the test dataset
    # Take the labels from both actual and predicted values, so that a class
    # which only shows up in the predictions still gets its own column.
    labels = np.unique(np.concatenate([Y_test, Y_pred]))
    m = confusion_matrix(Y_test, Y_pred, labels=labels)
    print("\nConfusion matrix on test data")
    cm = pd.DataFrame(m, index=labels, columns=labels)
    cm.index = "Actual: " + cm.index
    cm.columns = "Predicted: " + cm.columns
    display(cm)

    #saving the data into a csv file (in the current folder by default)
    temp_df = pd.DataFrame({
        "News": X_test,
        "Actual Price Sentiment": Y_test,
        "Predicted Sentiment": Y_pred,
    })
    temp_df.to_csv(output_path)

    print('Predictions on Test Data are as follows:')
    display(temp_df)

    return tf_idfvectorizer, SVM_L

In [7]:
# Train on the filtered headlines; keep the fitted vectorizer and classifier
# so the cells below can score new headlines.
vectorizer, model = headline_sentiment(df)

Testing Accuracy: 0.9186361875242154

Confusion matrix on test data


Unnamed: 0,Predicted: negative,Predicted: neutral,Predicted: positive
Actual: negative,1053,10,95
Actual: neutral,13,72,21
Actual: positive,61,10,1246


Predictions on Test Data are as follows:


Unnamed: 0,News,Actual Price Sentiment,Predicted Sentiment
0,dec gold closes at 410oz up 5 for the session,positive,positive
1,dec gold ends down 1550 or 21 at 73850 an ounce,negative,negative
2,buy gold on every dip kaushal jaini,negative,negative
3,Dec gold settles at 120980oz on Comex up 110 o...,positive,positive
4,Gold prices down further on unexpected drop in...,negative,negative
...,...,...,...
2576,gold futures end lower as oil tumbles dollar g...,negative,negative
2577,Gold futures pare losses slightly after durabl...,negative,negative
2578,gold settles 02 lower at 124810 an ounce,negative,negative
2579,Gold firms up by Rs 125 per 10 gm silver remai...,positive,neutral


### Important
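* Looking at the confusion matrix, it is clear that performance on the neutral class is poor: in the run shown above, only 72 of the 106 neutral test headlines (about 68%) were classified correctly.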
* Positive and negative headlines are likely to be identified correctly

In [8]:
#Trying sample headlines
# An empty headline contains no tokens, so this shows what the model predicts
# when it has no evidence at all.
# cleaner() is applied so the input gets the same preprocessing as the training headlines.
vector = vectorizer.transform(cleaner([""]))
sentiment = model.predict(vector)
print(sentiment)

['positive']


In [9]:
#Trying sample headlines
# cleaner() is applied so the input gets the same preprocessing as the training headlines.
vector = vectorizer.transform(cleaner(["The price of gold continues declining."]))
sentiment = model.predict(vector)
print(sentiment)

['negative']


In [10]:
#Trying sample headlines
# cleaner() is applied so the input gets the same preprocessing as the training headlines.
vector = vectorizer.transform(cleaner([" gold in good demand ahead of FOMC."]))
sentiment = model.predict(vector)
print(sentiment)

['positive']


In [11]:
#Trying sample headlines
# cleaner() is applied so the input gets the same preprocessing as the training headlines.
vector = vectorizer.transform(cleaner(["Gold price expected to remain steady."]))
sentiment = model.predict(vector)
print(sentiment)

['neutral']


### Try sentence transformers to get extraordinary improvement in results