In [1]:
# parts of this code has been referenced from https://blog.quantinsti.com/vader-sentiment/
#import relevant libraries

# nltk for baseline unsupervised sentiment detection using vader lexicon
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# pandas and numpy for dealing with csv and numerical operations
import pandas as pd
import numpy as np

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/geshwar/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train['diff'] = train['Open'] - train['Close']
test['diff'] = test['Open'] - test['Close']


In [3]:
from sklearn.metrics import roc_curve, roc_auc_score

def get_labels(val):
    if val > 0 :
        return 1
    else:
        return 0

def get_data(train_file, test_file, cols):
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    train['diff'] = train['Open'] - train['Close']
    test['diff'] = test['Open'] - test['Close']
    train['label'] = train['diff'].apply(get_labels)
    test['label'] = train['diff'].apply(get_labels)
    
    train = get_vader_score(train, cols)
    test = get_vader_score(test, cols)
    return train, test

def get_vader_score(df, cols):
    # vader sentiment detector
    analyzer = SentimentIntensityAnalyzer()
    scores = []
    
    df['text_final'] = df[cols].apply(lambda row: '. '.join(row.values.astype(str)), axis=1).tolist()
    for te in range(len(df)):
        scores.append(analyzer.polarity_scores(df['text_final'].iloc[te])['compound'])

    df['compound_vader_score'] = scores
    return df

from sklearn.metrics import accuracy_score
def generate_roc_score(df, df_col, prob_col):
    li = list(df[df_col])
    fpr, tpr, thresholds = roc_curve(li, df[prob_col])
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

def compute_accuracy(df, df_col, prob_col, threshold):
    li = list(df[df_col])
    preds = []
    for val in list(df[prob_col]):
        if val < threshold:
            preds.append(0)
        else:
            preds.append(1)
    return accuracy_score(li, preds)



    

In [4]:
#using only title of news
train, test = get_data('train.csv', 'test.csv', ['title'])
optimal_threshold = generate_roc_score(train, 'label', 'compound_vader_score')
print("Optimal threshold ", optimal_threshold)
train_accuracy = compute_accuracy(train, 'label', 'compound_vader_score', optimal_threshold)
test_accuracy = compute_accuracy(test, 'label', 'compound_vader_score', optimal_threshold)
print("Train and test accuracy", train_accuracy, test_accuracy)

Optimal threshold  0.0258
Train and test accuracy 0.5115446779034865 0.5269607843137255


In [5]:
#using only title and description of news
train, test = get_data('train.csv', 'test.csv', ['title', 'description'])
optimal_threshold = generate_roc_score(train, 'label', 'compound_vader_score')
print("Optimal threshold ", optimal_threshold)
train_accuracy = compute_accuracy(train, 'label', 'compound_vader_score', optimal_threshold)
test_accuracy = compute_accuracy(test, 'label', 'compound_vader_score', optimal_threshold)
print("Train and test accuracy", train_accuracy, test_accuracy)

Optimal threshold  0.5709
Train and test accuracy 0.5055414453936735 0.528921568627451
