In [11]:
import json
import pandas as pd
from datetime import datetime, timezone
import os
import pickle
import time

In [12]:
import json

# Read the shared notebook configuration. The context manager closes the file
# handle as soon as parsing finishes instead of leaking it for the kernel's lifetime.
with open("../configure.json") as config_file:
    config = json.load(config_file)
pkls_dir = config['pkls_dir']
# Analysis window bounds as float timestamps.
# NOTE(review): the value is converted to UTC, stripped of tzinfo and then passed
# to .timestamp(), which treats a naive datetime as *local* time. The result is
# therefore shifted from the true epoch by the machine's UTC offset. It is kept
# as-is because published_time is computed with the identical expression and the
# two are only ever compared with each other.
start_date = datetime.fromisoformat(config['start_time']).astimezone(tz=timezone.utc).replace(tzinfo=None).timestamp()
end_date = datetime.fromisoformat(config['end_time']).astimezone(tz=timezone.utc).replace(tzinfo=None).timestamp()
news_data_pkl_path = config["news_data_pkls"]

In [13]:
# Walk <news_dir>/<sub_folder>/<article>.json and collect, per article, its
# publish timestamp, a formatted publish time and the lower-cased text.
news_path = config['news_dir']
ignore_files = ['.DS_Store']
news_folders = os.listdir(news_path)
article_content = []
published_time = []
time_frames = []
form = "%d-%m-%Y %H:%M"
for sub_folder in news_folders:
    folder_path = os.path.join(news_path, sub_folder)
    # Skip OS metadata entries and any stray regular file at the top level;
    # os.listdir() on a non-directory would raise.
    if sub_folder in ignore_files or not os.path.isdir(folder_path):
        continue
    articles = os.listdir(folder_path)

    for article in articles:
        # Metadata files can also appear inside the sub-folders and are not JSON.
        if article in ignore_files:
            continue
        article_path = os.path.join(folder_path, article)
        # Close each article file right after parsing; the loop may touch
        # many files and must not leak one handle per article.
        with open(article_path, encoding="utf-8") as article_json:
            data = json.load(article_json)
        time_of_publish = data['published']
        # Parse the ISO timestamp once and reuse it for both representations.
        published_at = datetime.fromisoformat(time_of_publish)
        # Same conversion as start_date/end_date so the values are comparable.
        # Named published_ts (not `time`) so the imported `time` module is not shadowed.
        published_ts = published_at.astimezone(tz=timezone.utc).replace(tzinfo=None).timestamp()
        published_time.append(published_ts)
        article_content.append(data['text'].lower())
        time_frames.append(published_at.strftime(form))

raw_data = pd.DataFrame({
    "published_time": published_time,
    "time_frames": time_frames,
    'article_content': article_content,
})
# Keep only articles published inside [start_date, end_date] (both ends inclusive).
# A boolean mask replaces the in-place drop so re-running the cell is side-effect free.
in_window = (raw_data['published_time'] >= start_date) & (raw_data['published_time'] <= end_date)
raw_data = raw_data[in_window]

In [14]:
def _select_company_news(news, pattern):
    """Return the articles in ``news`` that mention a company, oldest first.

    Parameters
    ----------
    news : pandas.DataFrame
        Frame with at least the ``article_content`` and ``published_time`` columns.
    pattern : str
        Regular expression searched for in ``article_content``.

    Returns
    -------
    pandas.DataFrame
        Matching rows whose text is pure ASCII, with duplicate rows removed,
        sorted by ``published_time`` ascending. The original index labels are kept.
    """
    mentions = news[news['article_content'].str.contains(pattern)]
    ascii_only = mentions[mentions['article_content'].map(lambda text: text.isascii())]
    # Non-inplace calls return fresh frames, so columns added later are not
    # written into a filtered slice of the input frame.
    return ascii_only.drop_duplicates().sort_values('published_time', ascending=True)


# One shared pipeline for both companies instead of two copy-pasted ones.
apple_news_data = _select_company_news(raw_data, 'apple|aapl')
amazon_news_data = _select_company_news(raw_data, "amazon|amzn")

In [15]:
import time
import nltk
from nltk import word_tokenize
from nltk.corpus import words

# Make sure the Punkt tokenizer data used by word_tokenize is available locally.
nltk.download('punkt')

# Add a 'word_tokens' column holding the tokenised article text to both frames.
for company_news in (amazon_news_data, apple_news_data):
    company_news['word_tokens'] = company_news['article_content'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /Users/k2rth1k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
import string

from nltk import SnowballStemmer
from nltk.corpus import stopwords, words

steps = 0

# Make sure the corpora read below are available locally.
for corpus_name in ('words', 'stopwords'):
    nltk.download(corpus_name)

# Lower-cased vocabulary and English stop words, kept as sets for fast membership tests.
lower_words = {entry.lower() for entry in words.words()}
stops = set(stopwords.words('english'))
l_stops = len(stops)


def has_numbers(inputString):
    """Return True if at least one element of ``inputString`` is a digit, else False."""
    for char in inputString:
        if char.isdigit():
            return True
    return False


def word_clean(x, steps=None):
    """Filter and stem a list of word tokens.

    A token is kept only if it is non-empty ASCII and, after every
    ``string.punctuation`` character has been removed from it, it

    * contains no digit,
    * does not start with ``"www"`` or ``"http"``,
    * is not in the module-level ``stops`` set, and
    * is in the module-level ``lower_words`` vocabulary.

    Kept tokens are stemmed with the English Snowball stemmer.

    Parameters
    ----------
    x : iterable of str
        Tokens to clean.
    steps : optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list of str
        Stems of the surviving tokens, in their original order.
    """
    # Build the punctuation-stripping table once per call; the original rebuilt
    # it for every single token.
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    stemmer = SnowballStemmer('english')

    cleaned = []
    for token in x:
        if not token.isascii() or not token:
            continue
        word = token.translate(strip_punctuation)
        # No separate isnumeric() test: an ASCII numeric string always contains
        # a digit, so has_numbers() already rejects it.
        if has_numbers(word) or word.startswith(("www", "http")) or word in stops:
            continue
        # Tokens that consisted only of punctuation are empty at this point.
        if not word or word not in lower_words:
            continue
        cleaned.append(stemmer.stem(word))
    return cleaned

[nltk_data] Downloading package words to /Users/k2rth1k/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/k2rth1k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Replace the raw tokens of both frames with their cleaned, stemmed form.
# NOTE: this overwrites 'word_tokens', so running the cell a second time
# cleans tokens that have already been cleaned.
for company_news in (amazon_news_data, apple_news_data):
    company_news['word_tokens'] = company_news['word_tokens'].apply(word_clean, args=(0,))

In [18]:
# Persist both frames as pickles. The file name is appended directly to the
# configured prefix, so the prefix must already end with a path separator.
for pkl_name, company_news in (
    ('amazon_news_data.pkl', amazon_news_data),
    ('apple_news_data.pkl', apple_news_data),
):
    with open(news_data_pkl_path + pkl_name, 'wb') as f:
        pickle.dump(company_news, f)