In [93]:
import json
import pandas as pd
from datetime import datetime, timezone
import os
import pickle
import time

In [94]:
import json

def _to_epoch_seconds(iso_text):
    """Convert an ISO-8601 string to the float timestamp used throughout this notebook.

    The value is shifted to UTC, stripped of its tzinfo, and then passed to
    ``datetime.timestamp()``.

    NOTE(review): ``timestamp()`` interprets naive datetimes as *local* time, so
    the result differs from true Unix time by the machine's UTC offset. The
    expression is intentionally left unchanged because the article timestamps
    are computed with the same expression and must stay comparable with
    ``start_date`` / ``end_date``.
    """
    return datetime.fromisoformat(iso_text).astimezone(tz=timezone.utc).replace(tzinfo=None).timestamp()


# Read the shared configuration; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("../configure.json") as config_file:
    config = json.load(config_file)

# Directory prefix for pickled outputs (concatenated directly with file names below).
pkls_dir = config['pkls_dir']
# Inclusive bounds of the analysis window, as float timestamps.
start_date = _to_epoch_seconds(config['start_time'])
end_date = _to_epoch_seconds(config['end_time'])


In [95]:
# Walk <news_dir>/<sub_folder>/<article>.json and collect the lower-cased text
# and publication timestamp of every article.
news_path = config['news_dir']
ignore_files = ['.DS_Store']
news_folders = os.listdir(news_path)
article_content = []
published_time = []
for sub_folder in news_folders:
    if sub_folder in ignore_files:
        continue
    folder_path = os.path.join(news_path, sub_folder)

    for article in os.listdir(folder_path):
        # Finder metadata can also appear inside the sub-folders and is not JSON.
        if article in ignore_files:
            continue
        article_path = os.path.join(folder_path, article)
        # Close each file as soon as it is parsed; the loop touches many files.
        with open(article_path) as article_json:
            data = json.load(article_json)

        # Same conversion as start_date / end_date so the values stay comparable.
        # NOTE(review): timestamp() on a naive datetime assumes local time, so
        # this is offset from true Unix time by the machine's UTC offset.
        # The result is deliberately not named `time`, which would shadow the
        # imported `time` module.
        published_ts = (
            datetime.fromisoformat(data['published'])
            .astimezone(tz=timezone.utc)
            .replace(tzinfo=None)
            .timestamp()
        )
        published_time.append(published_ts)
        article_content.append(data['text'].lower())

raw_data = pd.DataFrame({
    "published_time": published_time,
    'article_content': article_content,
})
# Keep only articles inside the configured window (both bounds inclusive).
# The original row index is preserved on purpose.
raw_data = raw_data[raw_data['published_time'].between(start_date, end_date)]

In [96]:
def _select_company_news(news, pattern):
    """Return the articles in ``news`` whose text matches ``pattern``.

    The result keeps only pure-ASCII articles, has exact duplicate rows
    removed, and is sorted by ``published_time`` ascending. A new frame is
    returned (nothing is modified in place), so later column assignments on it
    do not raise SettingWithCopyWarning.

    ``pattern`` is treated as a regular expression and matched as a substring.
    NOTE(review): substring matching means e.g. 'apple' also matches
    'pineapple' -- confirm this is acceptable for the analysis.
    """
    mentions_company = news[news['article_content'].str.contains(pattern)]
    ascii_only = mentions_company[mentions_company['article_content'].map(lambda text: text.isascii())]
    return ascii_only.drop_duplicates().sort_values('published_time', ascending=True)


apple_news_data = _select_company_news(raw_data, 'apple|aapl')
amazon_news_data = _select_company_news(raw_data, "amazon|amzn")

In [97]:
import time
import nltk
from nltk import word_tokenize
from nltk.corpus import words

# Tokenizer models required by word_tokenize.
nltk.download('punkt')

# Split every article into word tokens; one token list per row.
amazon_news_data['word_tokens'] = amazon_news_data['article_content'].map(word_tokenize)
apple_news_data['word_tokens'] = apple_news_data['article_content'].map(word_tokenize)

[nltk_data] Downloading package punkt to /Users/k2rth1k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
def has_numbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit.

    Digits are detected with ``str.isdigit``; an empty input yields False.
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False

In [74]:
from nltk import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords, words
import string

steps = 0

# Corpora read below (`nltk` itself is imported by an earlier cell).
nltk.download('words')
nltk.download('stopwords')

# Lower-cased entries of the NLTK word list, used as a whitelist by word_clean.
lower_words = {entry.lower() for entry in words.words()}
# English stop words, removed by word_clean before stemming.
stops = set(stopwords.words('english'))
l_stops = len(stops)


# Translation table that deletes every ASCII punctuation character. Built once
# here instead of once per token inside word_clean.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))
# Single stemmer instance reused by every word_clean call.
_STEMMER = SnowballStemmer('english')


def word_clean(x, steps=None):
    """Clean and stem a list of word tokens.

    For every token, in order:
      1. drop it if it is empty or contains non-ASCII characters;
      2. strip all ASCII punctuation from it;
      3. drop it if it contains a digit, starts with "www" or "http", or is an
         English stop word (``stops``);
      4. keep it only if it appears in ``lower_words``;
      5. stem it with the English Snowball stemmer.

    Args:
        x: iterable of string tokens.
        steps: unused; kept so existing calls that pass a second argument
            continue to work.

    Returns:
        list of stemmed tokens, in their original order.
    """
    cleaned = []
    for token in x:
        if not token or not token.isascii():
            continue
        word = token.translate(_PUNCTUATION_TABLE)
        if not word:
            # Token consisted solely of punctuation.
            continue
        # An explicit isnumeric() check is unnecessary: for an ASCII string it
        # can only be true when has_numbers() already is.
        if has_numbers(word) or word.startswith(("www", "http")):
            continue
        if word in stops or word not in lower_words:
            continue
        cleaned.append(_STEMMER.stem(word))
    return cleaned

[nltk_data] Downloading package words to /Users/k2rth1k/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/k2rth1k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
# Replace the raw token lists with their cleaned, stemmed counterparts.
# Note: this overwrites the column, so re-running the cell cleans already-stemmed tokens.
amazon_news_data['word_tokens'] = amazon_news_data['word_tokens'].map(lambda tokens: word_clean(tokens, 0))

In [76]:
# Display the first five rows (head() default) to inspect the cleaned tokens.
amazon_news_data.head()

Unnamed: 0,published_time,article_content,word_tokens
12869,1514876000.0,by nigam arora\nthe practical way to take adva...,"[practic, way, take, advantag, januari, effect..."
12886,1514876000.0,here are some things going on today in your wo...,"[go, today, world, tech, bellweth, tech, march..."
13804,1514876000.0,what happened shares of many optical networkin...,"[mani, optic, decemb, accord, data, p, global,..."
11106,1514876000.0,"by ciara linnane and tomi kilgore, marketwatch...","[gain, vener, blue, chip, ge, stock, appl, par..."
11837,1514876000.0,by ryan vlastelica\nto simply match the market...,"[simpli, match, market, may, need, take, lot, ..."


In [77]:
# One space-joined document string per article, in frame order, as input for the vectorizer.
corpus = [' '.join(tokens) for tokens in amazon_news_data['word_tokens']]

Completed 12869 rows for amazon
Completed 12886 rows for amazon
Completed 13804 rows for amazon
Completed 11106 rows for amazon
Completed 11837 rows for amazon
Completed 15737 rows for amazon
Completed 15841 rows for amazon
Completed 15252 rows for amazon
Completed 13571 rows for amazon
Completed 13476 rows for amazon
Completed 12853 rows for amazon
Completed 16334 rows for amazon
Completed 12597 rows for amazon
Completed 16481 rows for amazon
Completed 11922 rows for amazon
Completed 15040 rows for amazon
Completed 13149 rows for amazon
Completed 13064 rows for amazon
Completed 14666 rows for amazon
Completed 11511 rows for amazon
Completed 15262 rows for amazon
Completed 14153 rows for amazon
Completed 11368 rows for amazon
Completed 14848 rows for amazon
Completed 11742 rows for amazon
Completed 16537 rows for amazon
Completed 14681 rows for amazon
Completed 14179 rows for amazon
Completed 13097 rows for amazon
Completed 15209 rows for amazon
Completed 16351 rows for amazon
Complete

In [78]:
import pickle
import nltk
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK "popular" collection before reading the stop-word corpus.
nltk.download("popular")
# English stop words, de-duplicated into a set.
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/k2rth1k/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/k2rth1k/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/k2rth1k/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/k2rth1k/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/k2rth1k/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/k2rth1k/nltk_data...
[nltk_data]    |   Package movie_reviews is a

In [79]:
# Count-based vectorizer over word bigrams. Despite the `one_gram` / `1gm`
# names, ngram_range=(2, 2) extracts bigrams only; the names are kept because
# other cells reference them.
#   max_df=0.9  -> ignore bigrams present in more than 90% of the documents
#   min_df=0.01 -> ignore bigrams present in fewer than 1% of the documents
# stop_words must be the string 'english', a list, or None: recent scikit-learn
# releases validate the parameter and reject a set, so it is converted here.
one_gram_vectorizer = CountVectorizer(
    max_df=0.9,
    min_df=0.01,
    stop_words=sorted(stop_words),
    ngram_range=(2, 2),
)
# Learn the vocabulary from the current corpus and build its sparse
# document-term count matrix.
amzn_1gm_features = one_gram_vectorizer.fit_transform(corpus)


In [80]:
# Densify the sparse count matrix so that its rows can be stored in the DataFrame.
amzn_1gm_features_array = amzn_1gm_features.toarray()
# Number of rows, i.e. one per document in the corpus.
amzn_1gm_features_array.shape[0]

1115

In [81]:
# Store each row of the dense count matrix as a plain Python list in a new
# 'features' column (rows are assigned positionally, in frame order).
amazon_news_data['features'] = amzn_1gm_features_array.tolist()

In [82]:
# Display the first five rows again, now including the 'features' column.
amazon_news_data.head()

Unnamed: 0,published_time,article_content,word_tokens,features
12869,1514876000.0,by nigam arora\nthe practical way to take adva...,"[practic, way, take, advantag, januari, effect...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12886,1514876000.0,here are some things going on today in your wo...,"[go, today, world, tech, bellweth, tech, march...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13804,1514876000.0,what happened shares of many optical networkin...,"[mani, optic, decemb, accord, data, p, global,...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11106,1514876000.0,"by ciara linnane and tomi kilgore, marketwatch...","[gain, vener, blue, chip, ge, stock, appl, par...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11837,1514876000.0,by ryan vlastelica\nto simply match the market...,"[simpli, match, market, may, need, take, lot, ...","[0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."


In [83]:
# Persist the Amazon frame (text, tokens and bigram features).
# pkls_dir is concatenated as-is with the file name.
with open(pkls_dir + 'amazon_news_2gram.pkl', mode='wb') as pickle_file:
    pickle.dump(amazon_news_data, pickle_file)

In [84]:

# Replace the raw token lists with their cleaned, stemmed counterparts.
# Note: this overwrites the column, so re-running the cell cleans already-stemmed tokens.
apple_news_data['word_tokens'] = apple_news_data['word_tokens'].map(lambda tokens: word_clean(tokens, 0))

In [85]:
# Display the first five rows (head() default) to inspect the cleaned tokens.
apple_news_data.head()

Unnamed: 0,published_time,article_content,word_tokens
16174,1514832000.0,facebook \nciti analysts say that there is a 4...,"[say, likelihood, appl, appl, abl, repatri, bi..."
15612,1514855000.0,"quote: in this context avxl is a new apple, im...","[quot, context, new, appl, continu, analog, pe..."
15928,1514862000.0,2 warren buffett stocks to consider buying now...,"[warren, stock, consid, john, motley, fool, tw..."
14085,1514862000.0,2 warren buffett stocks to consider buying now...,"[warren, stock, consid, john, save, invest, be..."
12200,1514862000.0,2 warren buffett stocks to consider buying now...,"[warren, stock, consid, januari, motley, fool,..."


In [86]:
# Rebuild `corpus` for Apple: one space-joined document string per article, in frame order.
corpus = [' '.join(tokens) for tokens in apple_news_data['word_tokens']]

In [87]:
# Refit the shared vectorizer on the current `corpus`. fit_transform relearns
# the vocabulary, so afterwards `one_gram_vectorizer` no longer corresponds to
# the feature columns produced by any earlier fit.
# NOTE(review): relies on `corpus` having been rebuilt from apple_news_data
# immediately before this cell runs -- confirm execution order.
aapl_1gm_features = one_gram_vectorizer.fit_transform(corpus)
# Densify the sparse count matrix so that its rows can be stored in the DataFrame.
aapl_1gm_features_array = aapl_1gm_features.toarray()
# One plain Python list of counts per article, assigned positionally.
apple_news_data['features'] = aapl_1gm_features_array.tolist()

In [88]:
# Persist the Apple frame (text, tokens and bigram features).
# The frame written here must be apple_news_data: dumping amazon_news_data
# would make apple_news_2gram.pkl a copy of the Amazon data.
# pkls_dir is concatenated as-is with the file name.
with open(pkls_dir + 'apple_news_2gram.pkl', 'wb') as f:
    pickle.dump(apple_news_data, f)