In [54]:
# Imports + data path definitions

import os
import pandas as pd

# required packages: scikit-learn, nltk, pandas

# / root
#   / data
#     / stopwords.txt
#     / stock_corpus.csv
#     / stockreddit
#       / djia
#       / scraped
#     / stocknews
#     / stockprices
#       / ETFs
#       / Stocks
#     / stocktweets
#       / preprocessed
#       / processed
#   / notebook

# Root of the data tree, relative to the notebook's working directory.
DATA_PATH = r'../data'

# Show the working directory and the absolute location DATA_PATH resolves to.
print(os.getcwd(), os.path.realpath(DATA_PATH), sep='\n')

# One sub-directory of DATA_PATH per data source.
REDDIT, NEWS, PRICES, TWITTER = (
    os.path.join(DATA_PATH, folder)
    for folder in ('stockreddit', 'stocknews', 'stockprices', 'stocktweets')
)

/Users/calvin/git/cpsc571/notebook
/Users/calvin/git/cpsc571/data


In [60]:
# Combine all the daily tweets into one file per stock index

indices = ['AAPL', 'AMZN', 'FB']
output = os.path.join(TWITTER, 'processed')

# Create the destination up front so `to_json` does not fail when the
# 'processed' directory is missing.
os.makedirs(output, exist_ok=True)

for index in indices:
    path = os.path.join(TWITTER, 'preprocessed', index)

    # os.listdir returns entries in arbitrary order and also lists hidden
    # files (e.g. macOS '.DS_Store') and sub-directories. Sort the names so
    # the combined file is reproducible, and keep only regular, non-hidden
    # files so read_json is never handed something that is not a tweet file.
    daily_files = sorted(
        name for name in os.listdir(path)
        if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
    )

    # Each daily file is read as JSON Lines (one record per line).
    dfs = [
        pd.read_json(os.path.join(path, name), orient='records', lines=True)
        for name in daily_files
    ]

    # Stack all days row-wise and write one JSON Lines file per stock index.
    df = pd.concat(dfs, axis=0)
    df.to_json(os.path.join(output, f'{index}.json'), orient='records', lines=True)

In [129]:
# Read all different data sources + header standardization

# Every source ends up with 'text', 'date' and 'source' columns.

# Reddit: 'Title' -> text, 'Publish Date' -> date (parsed to datetime).
reddit = (
    pd.read_csv(os.path.join(REDDIT, 'scraped', 'amazon_reddit.csv'))
    .rename(columns={'Title': 'text', 'Publish Date': 'date'})
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        source='reddit',
    )
)

# News: 'newsHeadline' -> text, 'start_time_stamp' -> date (parsed to datetime).
news = (
    pd.read_csv(os.path.join(NEWS, 'Amazon.csv'))
    .rename(columns={'newsHeadline': 'text', 'start_time_stamp': 'date'})
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        source='news',
    )
)

# Twitter: 'created_at' -> date, 'user_id_str' -> user_id; each 'text' entry
# is joined into a single space-separated string.
tweets = (
    pd.read_json(os.path.join(TWITTER, 'processed', 'AMZN.json'), lines=True)
    .rename(columns={'created_at': 'date', 'user_id_str': 'user_id'})
    .assign(
        text=lambda frame: frame['text'].apply(' '.join),
        source='twitter',
    )
)

In [134]:
# Clean tweet text before joining with Reddit and News

import re

# Load the stopword file and split it on newlines into a list of entries.
# The context manager closes the file handle as soon as it has been read
# (the bare open(...).read() left it open until garbage collection).
# NOTE: a later cell runs `from nltk.corpus import stopwords`, which rebinds
# this name to the NLTK corpus reader.
with open(os.path.join(DATA_PATH, 'stopwords.txt'), 'r') as stopwords_file:
    stopwords = stopwords_file.read().split('\n')
 
def clean_tweets(tweet):
    """Strip Twitter-specific noise from a tweet string and return the cleaned string.

    Removes, in order:
      * a ``$`` followed by one whitespace character and a word (``$ ge``),
      * a leading lowercase ``rt`` retweet marker,
      * hyperlinks (only the link itself; text after it is kept),
      * ``#`` signs (the hashtag word is kept).

    NOTE: a later cell defines another ``clean_tweets`` that returns a list of
    stemmed tokens; once that cell has run, this name refers to that version.

    Args:
        tweet: the tweet text as a single string.

    Returns:
        The cleaned tweet text.
    """
    # remove stock market tickers; the pattern expects whitespace between '$'
    # and the symbol -- presumably because the tweet text here was built by
    # joining tokens with spaces (verify against the input data)
    tweet = re.sub(r'\$\s\w*', '', tweet)

    # remove old style retweet text "rt" (lowercase only, start of string only)
    tweet = re.sub(r'^rt[\s]+', '', tweet)

    # remove hyperlinks: match only the non-whitespace run of the URL (plus any
    # line breaks directly after it). The previous greedy `.*` also deleted
    # every word that followed the link on the same line.
    tweet = re.sub(r'https?://\S+[\r\n]*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    return tweet

# Run the cleaner over every tweet and overwrite the 'text' column with the result.
tweets['text'] = tweets['text'].map(clean_tweets)

In [147]:
# Final stock corpus

corpus_path = os.path.join(DATA_PATH, 'stock_corpus.csv')

# Stack the three sources row-wise, keeping only the columns they all share
# (join='inner'), then order by date and renumber the rows from zero.
corpus = (
    pd.concat([reddit, news, tweets], axis=0, join='inner')
    .sort_values(by='date')
    .reset_index(drop=True)
)

# Persist without the positional index column.
corpus.to_csv(corpus_path, index=False)

In [159]:
# http://blog.chapagain.com.np/python-nltk-twitter-sentiment-analysis-natural-language-processing-nlp/

from nltk.corpus import twitter_samples

# Load the tweet strings from the positive and negative files of NLTK's
# twitter_samples corpus and print how many each contains.
pos_tweets = twitter_samples.strings('positive_tweets.json')
print(len(pos_tweets))  # Output: 5000

neg_tweets = twitter_samples.strings('negative_tweets.json')
print(len(neg_tweets))  # Output: 5000

5000
5000


In [161]:
from nltk.tokenize import TweetTokenizer
# Module-level tokenizer configured with case preservation off, handle
# stripping on and length reduction on.
tweet_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

In [164]:

import string
import re
 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# English stopword list from the NLTK stopwords corpus.
stopwords_english = stopwords.words('english')

# Single Porter stemmer instance reused for every token.
stemmer = PorterStemmer()
 
# Emoticon tokens treated as "happy"
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticon tokens treated as "sad"
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon, happy or sad
emoticons = emoticons_happy | emoticons_sad
 
def clean_tweets(tweet):
    """Clean a raw tweet and return its stemmed tokens as a list.

    Steps, in order:
      1. regex clean-up of the raw string: ``$TICKER`` symbols, a leading
         ``RT`` retweet marker, hyperlinks, and ``#`` signs (the hashtag word
         is kept);
      2. tokenization with ``TweetTokenizer(preserve_case=False,
         strip_handles=True, reduce_len=True)``;
      3. dropping tokens found in ``stopwords_english``, in ``emoticons`` or
         in ``string.punctuation``;
      4. stemming the remaining tokens with ``stemmer``.

    NOTE: this redefines the ``clean_tweets`` from an earlier cell, which
    returned a cleaned string rather than a token list.

    Args:
        tweet: the raw tweet text as a single string.

    Returns:
        list of stemmed tokens, in their original order.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT" (start of string only)
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks: match only the non-whitespace run of the URL (plus any
    # line breaks directly after it). The previous greedy `.*` also deleted
    # every word that followed the link on the same line.
    tweet = re.sub(r'https?://\S+[\r\n]*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    # `string.punctuation` is a str, so the last test is a substring check:
    # it drops single punctuation characters (and any token that happens to be
    # a contiguous slice of that string).
    return [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english      # remove stopwords
        and word not in emoticons             # remove emoticons
        and word not in string.punctuation    # remove punctuation
    ]

In [166]:

# feature extractor function
def bag_of_words(tweet):
    """Build a presence-feature dict for a tweet.

    The tweet is passed through ``clean_tweets``; every item it yields becomes
    a key mapped to ``True``.
    """
    return {token: True for token in clean_tweets(tweet)}

# positive tweets feature set: (features, 'pos') pairs
pos_tweets_set = [(bag_of_words(tweet), 'pos') for tweet in pos_tweets]

# negative tweets feature set: (features, 'neg') pairs
neg_tweets_set = [(bag_of_words(tweet), 'neg') for tweet in neg_tweets]

print(len(pos_tweets_set), len(neg_tweets_set))  # Output: 5000 5000

5000 5000


In [168]:
from random import Random, shuffle

# Fixed seed so a Restart & Run All reproduces the same train/test split
# (the original unseeded shuffle gave a different split on every run).
SPLIT_SEED = 42
# Number of tweets per class held out for testing.
TEST_SIZE_PER_CLASS = 1000

# A dedicated generator keeps the split independent of the global `random` state.
# NOTE: the lists are shuffled in place, so re-running only this cell shuffles
# the already-shuffled lists again; re-run from the feature-set cell to get the
# canonical split.
split_rng = Random(SPLIT_SEED)
split_rng.shuffle(pos_tweets_set)
split_rng.shuffle(neg_tweets_set)

# First TEST_SIZE_PER_CLASS of each class go to the test set, the rest to training.
test_set = pos_tweets_set[:TEST_SIZE_PER_CLASS] + neg_tweets_set[:TEST_SIZE_PER_CLASS]
train_set = pos_tweets_set[TEST_SIZE_PER_CLASS:] + neg_tweets_set[TEST_SIZE_PER_CLASS:]

print(len(test_set), len(train_set))  # Output: 2000 8000

2000 8000
