In [1]:
import json
import pandas as pd
from datetime import datetime, timezone
import os
import pickle
import time

In [2]:
def _window_bound(iso_text):
    """Turn an ISO-8601 string into the float timestamp used for date filtering.

    The string is parsed, converted to UTC, stripped of its tzinfo and then
    passed through ``datetime.timestamp()``.
    """
    # NOTE(review): ``timestamp()`` on a naive datetime interprets it as local
    # time, so the result is offset from the true epoch value by the machine's
    # UTC offset. Any timestamp compared against these bounds must be built
    # with the same chain -- confirm before changing either side.
    as_utc = datetime.fromisoformat(iso_text).astimezone(tz=timezone.utc)
    return as_utc.replace(tzinfo=None).timestamp()


# Inclusive bounds of the analysis window (calendar year 2018).
start_date = _window_bound("2018-01-01 00:00")
end_date = _window_bound("2018-12-31 23:59")

In [3]:
# Load every article JSON under ``news_path`` (one sub-folder per batch) and
# keep its publication timestamp and lower-cased body text.
news_path = "/Users/k2rth1k/swm/News"
ignore_files = ['.DS_Store']
news_folders = os.listdir(news_path)
article_content = []
published_time = []
for sub_folder in news_folders:
    if sub_folder in ignore_files:
        continue
    folder_path = os.path.join(news_path, sub_folder)

    for article in os.listdir(folder_path):
        # Finder metadata files can also appear inside the sub-folders.
        if article in ignore_files:
            continue
        article_path = os.path.join(folder_path, article)
        # Context manager so the handle is closed after each article instead of
        # leaking one open file per article.
        with open(article_path, encoding="utf-8") as article_json:
            data = json.load(article_json)
        # Deliberately NOT named ``time``: that would shadow the ``time`` module
        # imported at the top of the notebook.
        # NOTE(review): same convert-to-UTC / strip-tzinfo / timestamp() chain
        # as the window bounds, so both sides of the comparison below are
        # expressed on the same scale.
        published_ts = (
            datetime.fromisoformat(data['published'])
            .astimezone(tz=timezone.utc)
            .replace(tzinfo=None)
            .timestamp()
        )
        published_time.append(published_ts)
        article_content.append(data['text'].lower())

raw_data = pd.DataFrame({
    "published_time": published_time,
    'article_content': article_content,
})
# Keep only articles inside the [start_date, end_date] window (both inclusive).
# Row labels are preserved, exactly as the previous in-place drop did.
raw_data = raw_data[raw_data['published_time'].between(start_date, end_date)].copy()

In [4]:
def _company_news(frame, pattern):
    """Select the articles in ``frame`` that mention a company.

    Parameters
    ----------
    frame : DataFrame with ``article_content`` (lower-cased text) and
        ``published_time`` columns.
    pattern : regular expression passed to ``str.contains``.

    Returns
    -------
    A new DataFrame holding the rows whose text matches ``pattern`` and is
    pure ASCII, with exact duplicate rows removed, ordered by ascending
    ``published_time``. ``frame`` itself is never modified, so the cell can be
    re-run safely and later column assignments do not hit a derived slice.
    """
    text = frame['article_content']
    keep = text.str.contains(pattern) & text.map(str.isascii)
    return (
        frame[keep]
        .drop_duplicates()
        .sort_values('published_time', ascending=True)
    )


apple_news_data = _company_news(raw_data, 'apple|aapl')
amazon_news_data = _company_news(raw_data, "amazon|amzn")

In [5]:
# Bare last expression: renders the filtered Amazon frame as the cell output.
amazon_news_data


Unnamed: 0,published_time,article_content
12886,1.514876e+09,here are some things going on today in your wo...
13571,1.514876e+09,shutterstock photo\nstocks indexes opened the ...
11837,1.514876e+09,by ryan vlastelica\nto simply match the market...
13804,1.514876e+09,what happened shares of many optical networkin...
15737,1.514876e+09,by nigam arora\nthe practical way to take adva...
...,...,...
1348,1.546281e+09,the biggest benefit from living are the lesson...
2553,1.546290e+09,apple stock price\napple (nasdaq: aapl ) was v...
849,1.546306e+09,none of us has a crystal ball that shows us cl...
56164,1.546321e+09,"let's talk about apple, inc. (nasdaq: aapl ). ..."


In [6]:
# Persist the filtered (not yet tokenised) frames so later runs can skip the
# slow JSON loading step.
# Create the output directory first; open(..., 'wb') does not create parents.
os.makedirs('pkls', exist_ok=True)

for pkl_path, news_frame in (
    ('pkls/amazon_news_data.pkl', amazon_news_data),
    ('pkls/apple_news_data.pkl', apple_news_data),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(news_frame, f)

In [7]:
# Re-imported here so that ``time`` is guaranteed to be the module in this cell.
import time
import nltk
from nltk import word_tokenize
from nltk.corpus import words

# Tokenizer models required by ``word_tokenize``.
nltk.download('punkt')

# Split every article body into word tokens, stored in a new ``word_tokens``
# column on each frame; the wall-clock time for both frames is reported.
begin = time.time()
for news_frame in (amazon_news_data, apple_news_data):
    news_frame['word_tokens'] = news_frame['article_content'].apply(word_tokenize)
end = time.time()
print("time taken: ", end - begin)

[nltk_data] Downloading package punkt to /Users/k2rth1k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


time taken:  150.05701279640198


In [8]:
# Bare last expression: renders the Apple frame, now including ``word_tokens``.
apple_news_data

Unnamed: 0,published_time,article_content,word_tokens
16174,1.514832e+09,facebook \nciti analysts say that there is a 4...,"[facebook, citi, analysts, say, that, there, i..."
15612,1.514855e+09,"quote: in this context avxl is a new apple, im...","[quote, :, in, this, context, avxl, is, a, new..."
12200,1.514862e+09,2 warren buffett stocks to consider buying now...,"[2, warren, buffett, stocks, to, consider, buy..."
15928,1.514862e+09,2 warren buffett stocks to consider buying now...,"[2, warren, buffett, stocks, to, consider, buy..."
14085,1.514862e+09,2 warren buffett stocks to consider buying now...,"[2, warren, buffett, stocks, to, consider, buy..."
...,...,...,...
56164,1.546321e+09,"let's talk about apple, inc. (nasdaq: aapl ). ...","[let, 's, talk, about, apple, ,, inc., (, nasd..."
61168,1.546322e+09,big trends: gartner iaas mq is down to 6 compa...,"[big, trends, :, gartner, iaas, mq, is, down, ..."
41,1.546326e+09,- fundamental analysis - technology is apple s...,"[-, fundamental, analysis, -, technology, is, ..."
60869,1.546332e+09,"new york, oct. 23, 2018 (globe newswire) -- in...","[new, york, ,, oct., 23, ,, 2018, (, globe, ne..."


In [9]:
import string

from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords, words

steps = 0

# Corpora backing the vocabulary and stop-word lookups below.
nltk.download('words')
nltk.download('stopwords')

# Sets give constant-time membership tests when filtering tokens.
lower_words = {entry.lower() for entry in words.words()}
stops = set(stopwords.words('english'))
l_stops = len(stops)

def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit.

    Characters are tested with ``str.isdigit``; an empty input yields False.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of once per token inside ``word_clean``.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))


def word_clean(x, steps=None):
    """Normalise a list of word tokens into a list of English stems.

    Each token goes through the following steps, in order; a token that fails
    any check is dropped:

    1. it must be non-empty and pure ASCII;
    2. punctuation characters are removed from it;
    3. what remains must not be empty, contain a digit, start with ``www`` or
       ``http``, or be numeric;
    4. it must not be in ``stops`` and must be in ``lower_words``;
    5. it is stemmed with the English Snowball stemmer.

    Parameters
    ----------
    x : iterable of str tokens.
    steps : unused; kept so existing calls such as ``word_clean(x, 0)`` work.

    Returns
    -------
    list of str: the stems of the surviving tokens, in their original order.
    Reads the module-level ``lower_words`` and ``stops`` sets.
    """
    stemmer = SnowballStemmer('english')
    cleaned = []
    for token in x:
        if not token or not token.isascii():
            continue
        word = token.translate(_PUNCTUATION_TABLE)
        # A token made only of punctuation collapses to '' here.
        if not word:
            continue
        if (has_numbers(word) or word.startswith(("www", "http"))
                or word.isnumeric()):
            continue
        if word in stops or word not in lower_words:
            continue
        cleaned.append(stemmer.stem(word))
    return cleaned

[nltk_data] Downloading package words to /Users/k2rth1k/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/k2rth1k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Spot-check: clean the tokens of one Amazon article and time it.
# The previous inline copy of the pipeline called ``w.alpanumeric()``, which is
# not a ``str`` method and raised AttributeError; calling ``word_clean`` keeps
# this check in sync with the real pipeline.
# ``.iloc[0]`` picks the first row by position rather than a hard-coded index
# label (12886 in the run shown above), so the check works on any data set.
res = amazon_news_data['word_tokens'].iloc[0]
print(len(res))
begin = time.time()
res = word_clean(res)
print(len(res))
end = time.time()
print("time taken to execute: ", end - begin)

In [10]:
# Run every Amazon article's token list through ``word_clean`` and time it.
# NOTE(review): the result overwrites ``word_tokens``, so re-running this cell
# feeds already-stemmed tokens back into ``word_clean``.
begin = time.time()
amazon_cleaned = amazon_news_data['word_tokens'].apply(word_clean, args=(0,))
amazon_news_data['word_tokens'] = amazon_cleaned
end = time.time()
print("time taken to execute: ", end - begin)

time taken to execute:  31.21256709098816


In [11]:
begin = time.time()
amazon_news_data.to_csv('pkls/amazon_tokens.csv')
end = time.time()
print("time taken to execute: ", end - begin)
#%

time taken to execute:  0.8714330196380615


In [12]:
# Run every Apple article's token list through ``word_clean`` and time it.
# NOTE(review): the result overwrites ``word_tokens``, so re-running this cell
# feeds already-stemmed tokens back into ``word_clean``.
begin = time.time()
apple_cleaned = apple_news_data['word_tokens'].apply(word_clean, args=(0,))
apple_news_data['word_tokens'] = apple_cleaned
end = time.time()
print("time taken to execute: ", end - begin)


time taken to execute:  74.62903618812561


In [13]:
# Write the Apple frame (including the cleaned tokens) to CSV and time it.
# This cell previously wrote ``amazon_news_data`` to the Apple file, so
# 'pkls/apple_tokens.csv' ended up holding the Amazon rows.
begin = time.time()
apple_news_data.to_csv('pkls/apple_tokens.csv')
end = time.time()
print("time taken to execute: ", end - begin)

time taken to execute:  0.8947939872741699
