In [None]:
import re
import pandas as pd
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from nltk.corpus import stopwords

In [None]:
def readCSV(filename):
    """
    reads a csv file and returns a clean pandas dataframe

    Only rows whose 'newsType' equals 'topStory' are kept. The 'date'
    column (stored as YYYYMMDD) is converted to datetime, and a 'text'
    column is added holding the headline and body joined by a space.

    filename: path to a latin-1 encoded csv with the columns
              'newsType', 'date', 'headline' and 'body'
    returns:  a new dataframe (the frame read from disk is not modified)
    """
    df = pd.read_csv(filename, encoding='latin-1')

    # Copy the filtered rows so the column assignments below write to an
    # independent frame instead of a slice of df.
    df2 = df.loc[df['newsType'] == 'topStory'].copy()

    # Parse YYYYMMDD directly with an explicit format rather than building
    # an ambiguous MM/DD/YYYY string first.
    df2['date'] = pd.to_datetime(df2['date'].astype(str), format='%Y%m%d')

    # A missing headline or body must not turn the whole text into NaN,
    # and the separator keeps the last headline word and the first body
    # word from fusing into a single token.
    headline = df2['headline'].fillna('').astype(str)
    body = df2['body'].fillna('').astype(str)
    df2['text'] = (headline + ' ' + body).str.strip()

    return df2

In [None]:
# Lazily-filled cache of the English stopword set (see _getStopwords).
_STOPWORDS_EN = None


def _getStopwords():
    """
    returns the English stopwords as a frozenset, loading them only once
    """
    global _STOPWORDS_EN
    if _STOPWORDS_EN is None:
        _STOPWORDS_EN = frozenset(stopwords.words('english'))
    return _STOPWORDS_EN


def cleanText(text):
    """
    removes punctuation, stopwords and returns lowercase text in a list of single words
    """
    # The stopword list is fetched once and held as a set; asking the
    # corpus reader for it again for every token made this function the
    # bottleneck of the whole pipeline.
    stop_words = _getStopwords()

    tokens = tokenizer.tokenize(text.lower())

    return [word for word in tokens if word not in stop_words]

In [None]:
def loadPositive(path='./preprocessing/LoughranMcDonald_Positive.txt'):
    """
    loading positive dictionary

    path:    text file with one word per line (defaults to the
             LoughranMcDonald positive word list)
    returns: list of the words, stripped of whitespace and lowercased;
             blank lines are skipped
    """
    # The with-block guarantees the file handle is closed again.
    with open(path, "r") as myfile:
        return [line.strip().lower() for line in myfile if line.strip()]

def loadNegative(path='./preprocessing/LoughranMcDonald_Negative.txt'):
    """
    loading negative dictionary

    path:    text file with one word per line (defaults to the
             LoughranMcDonald negative word list)
    returns: list of the words, stripped of whitespace and lowercased;
             blank lines are skipped
    """
    # The with-block guarantees the file handle is closed again.
    with open(path, "r") as myfile:
        return [line.strip().lower() for line in myfile if line.strip()]

In [None]:
def countNeg(cleantext, negative):
    """
    counts negative words in cleantext

    cleantext: iterable of single words
    negative:  collection of negative words (list, set, ...)
    returns:   number of entries of cleantext found in negative;
               repeated words are counted every time they occur
    """
    # Membership tests against a set are O(1); scanning a list for every
    # word made the count quadratic.
    if isinstance(negative, (set, frozenset)):
        lookup = negative
    else:
        lookup = set(negative)
    return sum(1 for word in cleantext if word in lookup)

def countPos(cleantext, positive):
    """
    counts positive words in cleantext

    cleantext: iterable of single words
    positive:  collection of positive words (list, set, ...)
    returns:   number of entries of cleantext found in positive;
               repeated words are counted every time they occur
    """
    # Membership tests against a set are O(1); scanning a list for every
    # word made the count quadratic.
    if isinstance(positive, (set, frozenset)):
        lookup = positive
    else:
        lookup = set(positive)
    return sum(1 for word in cleantext if word in lookup)

In [None]:
def getSentiment(cleantext, negative, positive):
    """
    counts negative and positive words in cleantext and returns a score category accordingly

    cleantext: list of single words
    negative:  collection of negative words, or None to load the default list
    positive:  collection of positive words, or None to load the default list

    The net score is (positive count - negative count) and maps to:
        4  net score above 3
        3  net score from 1 to 3
        0  net score of exactly 0
        2  net score of -1 or -2
        1  net score of -3 or below
    """
    # Use the word lists handed in by the caller. Re-reading both files on
    # every call ignored the arguments and cost two file reads per text.
    if positive is None:
        positive = loadPositive()
    if negative is None:
        negative = loadNegative()

    sentiment = countPos(cleantext, positive) - countNeg(cleantext, negative)

    if sentiment > 3:
        return 4
    if sentiment > 0:
        return 3
    if sentiment == 0:
        return 0
    # NOTE(review): the bands are not symmetric (+3 maps to 3 but -3 maps
    # to 1); kept as in the original, confirm this is intended.
    if sentiment > -3:
        return 2
    return 1

In [None]:
def updateSentimentDataFrame(df):
    """
    tokenizes the 'text' column of df and adds a 'score' column holding the sentiment category of every row

    df is changed in place and is also returned
    """
    positive = loadPositive()
    negative = loadNegative()

    def scoreTokens(tokens):
        # score one cleaned text against the word lists loaded above
        return getSentiment(tokens, negative, positive)

    cleaned = df['text'].apply(cleanText)
    df['text'] = cleaned
    df['score'] = df['text'].apply(scoreTokens)

    return df

In [None]:
# Read the Reuters top stories, score them and write the result to disk.
df = readCSV('./input/news_reuters.csv')
print('Performing Sentiment...')
scored = updateSentimentDataFrame(df)
scored.to_csv('./preprocessing/sentiments.csv')