In [1]:
import json
import pandas as pd
from datetime import datetime, timezone
import os
import pickle

In [2]:
# Inclusive bounds of the analysis window, expressed as float timestamps.
# Both bounds go through the same conversion chain that the article loader
# applies to each article's publish time, so the three values are comparable.
# NOTE(review): the ISO strings carry no offset, so astimezone() interprets
# them as local time; the tzinfo is then stripped and timestamp() interprets
# the UTC wall clock as local time again. On a non-UTC machine the results are
# therefore shifted from true epoch seconds - confirm this is intended before
# comparing against externally sourced timestamps.
start_date, end_date = (
    datetime.fromisoformat(bound)
    .astimezone(tz=timezone.utc)
    .replace(tzinfo=None)
    .timestamp()
    for bound in ("2018-01-01 00:00", "2018-12-31 23:59")
)

In [3]:
# Load every article JSON under news_path/<sub_folder>/ into two parallel
# lists (publish timestamp, lower-cased body text), then keep only the
# articles published inside the [start_date, end_date] window.
news_path = "/Users/k2rth1k/swm/News"
# Directory entries that are not article data (macOS Finder metadata).
ignore_files = ['.DS_Store']
news_folders = os.listdir(news_path)
article_content = []
published_time = []
for sub_folder in news_folders:
    if sub_folder in ignore_files:
        continue
    sub_folder_path = os.path.join(news_path, sub_folder)
    articles = os.listdir(sub_folder_path)

    for article in articles:
        # The ignore list also applies inside sub-folders; otherwise a stray
        # metadata file would be fed to json.load and abort the whole load.
        if article in ignore_files:
            continue
        article_path = os.path.join(sub_folder_path, article)
        # Context manager closes each handle promptly; the loop may visit
        # many files and must not leak descriptors.
        with open(article_path, encoding="utf-8") as article_json:
            data = json.load(article_json)
        time_of_publish = data['published']
        # Same conversion chain as start_date / end_date so the comparison
        # below is consistent with those bounds.
        time = datetime.fromisoformat(time_of_publish).astimezone(tz=timezone.utc).replace(tzinfo=None).timestamp()
        published_time.append(time)
        article_content.append(data['text'].lower())

raw_data = pd.DataFrame({
    "published_time": published_time,
    'article_content': article_content,
})
# Keep rows inside the window (both bounds inclusive); original index labels
# are preserved.
in_window = (raw_data['published_time'] >= start_date) & (raw_data['published_time'] <= end_date)
raw_data = raw_data[in_window]

In [4]:
def _select_company_news(news, pattern):
    """Return the articles in ``news`` whose text matches ``pattern``.

    Parameters
    ----------
    news : pandas.DataFrame
        Frame with an ``article_content`` (str) column and a
        ``published_time`` column.
    pattern : str
        Regular expression handed to ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        A new frame, independent of ``news``, restricted to matching articles
        whose text is pure ASCII, with exact duplicate rows removed and rows
        ordered by ascending ``published_time``. Index labels of ``news`` are
        preserved.
    """
    mentions = news[news['article_content'].str.contains(pattern)]
    ascii_only = mentions[mentions['article_content'].map(lambda text: text.isascii())]
    return (
        ascii_only
        .drop_duplicates()
        .sort_values('published_time', ascending=True)
        # Explicit copy: later cells add columns to the result, which must not
        # be treated as a slice of the source frame.
        .copy()
    )


apple_news_data = _select_company_news(raw_data, 'apple|aapl')
amazon_news_data = _select_company_news(raw_data, "amazon|amzn")

In [5]:
# Show the Apple subset (published_time, article_content) as a visual check.
apple_news_data

Unnamed: 0,published_time,article_content
16174,1.514832e+09,facebook \nciti analysts say that there is a 4...
15612,1.514855e+09,"quote: in this context avxl is a new apple, im..."
12200,1.514862e+09,2 warren buffett stocks to consider buying now...
15928,1.514862e+09,2 warren buffett stocks to consider buying now...
14085,1.514862e+09,2 warren buffett stocks to consider buying now...
...,...,...
56164,1.546321e+09,"let's talk about apple, inc. (nasdaq: aapl ). ..."
61168,1.546322e+09,big trends: gartner iaas mq is down to 6 compa...
41,1.546326e+09,- fundamental analysis - technology is apple s...
60869,1.546332e+09,"new york, oct. 23, 2018 (globe newswire) -- in..."


In [6]:
# Number of rows in the Amazon subset.
print(len(amazon_news_data))

5677


In [7]:
# Persist both company subsets so later runs can reload them without
# re-reading the article files.
for pkl_path, frame in (
    ('pkls/amazon_news_data.pkl', amazon_news_data),
    ('pkls/apple_news_data.pkl', apple_news_data),
):
    # Create the output directory on first use; without this a fresh checkout
    # fails with FileNotFoundError.
    os.makedirs(os.path.dirname(pkl_path), exist_ok=True)
    with open(pkl_path, 'wb') as f:
        pickle.dump(frame, f)

In [8]:
import nltk
from nltk import word_tokenize
from nltk.corpus import words

# Fetch the 'punkt' package before tokenising (a no-op when it is already
# up to date).
nltk.download('punkt')

# Split every article body into a list of tokens, stored next to the text.
amazon_news_data['word_tokens'] = amazon_news_data['article_content'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /Users/k2rth1k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
import re
import string

from nltk import PorterStemmer
from nltk.corpus import stopwords, words
from nltk.stem import SnowballStemmer

nltk.download('words')
nltk.download('stopwords')

# Module-level counter left in place for any cell that reads it; word_clean
# itself only uses its own ``steps`` argument.
steps = 0

# Built once per cell execution instead of once per token: reloading the
# stop-word list for every token dominated the run time of word_clean.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def word_clean(x, steps=None):
    """Normalise one article's token list for downstream text analysis.

    For each token, in order: drop it if it is non-ASCII or empty, strip every
    ``string.punctuation`` character, drop it if the remainder is empty or an
    NLTK English stop word, and Porter-stem what is left.

    Parameters
    ----------
    x : iterable of str
        Tokens of a single article.
    steps : int, optional
        Progress value. When supplied, ``steps + 1`` is printed after the
        article is processed. Integers are passed by value, so the caller's
        own counter is not advanced. When omitted, nothing is printed.

    Returns
    -------
    list of str
        The stemmed tokens that survived filtering, in input order.
    """
    cleaned = []
    for token in x:
        if not token.isascii() or not token:
            continue
        stripped = token.translate(_PUNCTUATION_TABLE)
        # Punctuation-only tokens collapse to '' and are discarded here.
        if not stripped or stripped in _ENGLISH_STOPWORDS:
            continue
        cleaned.append(_STEMMER.stem(stripped))
    # Only count when a starting value was given; incrementing the None
    # default would raise TypeError.
    if steps is not None:
        steps += 1
        print(steps)
    return cleaned

[nltk_data] Downloading package words to /Users/k2rth1k/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/k2rth1k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Show the Amazon frame, which now also carries the word_tokens column.
amazon_news_data

Unnamed: 0,published_time,article_content,word_tokens
12886,1.514876e+09,here are some things going on today in your wo...,"[here, are, some, things, going, on, today, in..."
13571,1.514876e+09,shutterstock photo\nstocks indexes opened the ...,"[shutterstock, photo, stocks, indexes, opened,..."
11837,1.514876e+09,by ryan vlastelica\nto simply match the market...,"[by, ryan, vlastelica, to, simply, match, the,..."
13804,1.514876e+09,what happened shares of many optical networkin...,"[what, happened, shares, of, many, optical, ne..."
15737,1.514876e+09,by nigam arora\nthe practical way to take adva...,"[by, nigam, arora, the, practical, way, to, ta..."
...,...,...,...
1348,1.546281e+09,the biggest benefit from living are the lesson...,"[the, biggest, benefit, from, living, are, the..."
2553,1.546290e+09,apple stock price\napple (nasdaq: aapl ) was v...,"[apple, stock, price, apple, (, nasdaq, :, aap..."
849,1.546306e+09,none of us has a crystal ball that shows us cl...,"[none, of, us, has, a, crystal, ball, that, sh..."
56164,1.546321e+09,"let's talk about apple, inc. (nasdaq: aapl ). ...","[let, 's, talk, about, apple, ,, inc., (, nasd..."


In [None]:
# Run word_clean over every article's token list, passing 0 as its `steps`
# argument. The result overwrites the column it was read from, so running
# this cell a second time cleans the already-cleaned tokens again.
raw_tokens = amazon_news_data['word_tokens']
amazon_news_data['word_tokens'] = raw_tokens.apply(word_clean, args=(0,))

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


KeyboardInterrupt: 