In [20]:
import re
import gensim
import pandas as pd
import pickle
import numpy as np
import nltk
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from nltk.corpus import stopwords

In [2]:
# Notebook-wide configuration.
# `file` / `glove_vector_name` are not referenced by any visible cell; presumably
# they are the input/output paths for transfer_model and the input for
# load_glove_model — TODO confirm against the cells that call those helpers.
file = "glove.6B.50d.txt"
glove_vector_name = "gensim_glove_vectors.txt"
# CSV paths read by read_news(): news headlines and price data respectively.
news_file = "News.csv"
price_file = "DowJones.csv"
# Size of the vector drawn for out-of-vocabulary words in dailynews_to_vector.
# NOTE(review): the "50d" in the GloVe file name suggests this must match the
# embedding width — confirm if a different GloVe file is used.
embedding_dim = 50

In [5]:
# Lookup table from lower-case English contractions to their expanded forms.
# clean_text() consults it once per whitespace-separated token (after
# lower-casing), so a token only matches when it equals a key exactly.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

In [6]:
def transfer_model(input,output):
    """
    Convert a GloVe text file into word2vec text format using gensim's
    ``glove2word2vec`` script.

    :param input: path handed to ``glove2word2vec`` as ``glove_input_file``
    :param output: path handed to ``glove2word2vec`` as ``word2vec_output_file``
    :return: None
    """
    glove2word2vec(glove_input_file=input, word2vec_output_file=output)


In [7]:
def load_glove_model(input):
    """
    Load word vectors from a word2vec-format text file.

    :param input: path passed to ``KeyedVectors.load_word2vec_format``
    :return: the object returned by ``KeyedVectors.load_word2vec_format``
             (loaded with ``binary=False``, i.e. as a text file)
    """
    return KeyedVectors.load_word2vec_format(input, binary=False)


In [8]:
def read_news():
    """
    Load the two CSV inputs named by the module-level path constants.

    :return: tuple ``(prices, articles)`` — the DataFrame read from
             ``price_file`` followed by the DataFrame read from ``news_file``
    """
    prices, articles = pd.read_csv(price_file), pd.read_csv(news_file)
    return prices, articles


In [24]:
def clean_text(text, remove_stopwords=True):
    '''Normalise a headline so that more of its tokens hit the embedding vocabulary.

    Steps, in order: lower-case, expand contractions found in the module-level
    ``contractions`` table, strip punctuation, expand a few spaced-out
    abbreviations, and optionally drop NLTK English stop words.

    :param text: raw headline string
    :param remove_stopwords: when True, tokens present in
        ``stopwords.words("english")`` are removed and whitespace is collapsed
    :return: the cleaned string
    '''

    # Convert words to lower case
    text = text.lower()

    # Replace contractions with their longer forms. Matching is per
    # whitespace-separated token, so "don't," (with punctuation attached)
    # is left for the punctuation pass below.
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'0,0', '00', text)
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    # The abbreviation patterns are anchored with \b so they only fire on a
    # stand-alone letter. Without the anchor, "peru s " (from "peru's") was
    # rewritten to "per united states ".
    text = re.sub(r'\bu s ', ' united states ', text)
    text = re.sub(r'\bu n ', ' united nations ', text)
    text = re.sub(r'\bu k ', ' united kingdom ', text)
    text = re.sub(r'\bj k ', ' jk ', text)
    # Drop the orphaned possessive "s" left behind by apostrophe removal.
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    text = re.sub(r'0km ', '0 km ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        # Requires the NLTK "stopwords" corpus to be available locally
        # (nltk.download('stopwords')).
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [28]:
def get_data_from_csv():
    """
    Build the training labels and the raw headlines that go with them.

    Reads the price and news frames via ``read_news()``, keeps only news whose
    ``Date`` appears in the price data, and turns each price row into the
    difference from the previous row (in file order). The first row has no
    predecessor and is dropped.

    Prints the number of unique dates in both frames before and after the
    news filter as a sanity check.

    :return: ``(prices_trend, headlines)`` where ``prices_trend[i]`` is ``1``
             when the ``Open`` difference is >= 0 and ``-1`` otherwise, and
             ``headlines[i]`` is the list of ``News`` strings for the same date
    """
    dj, news = read_news()

    # Compare the number of unique dates. We want matching values.
    print(len(set(dj.Date)))
    print(len(set(news.Date)))

    # Remove the extra dates that are in news
    news = news[news.Date.isin(dj.Date)]

    # Make sure they are now equal
    print(len(set(dj.Date)))
    print(len(set(news.Date)))

    # Index by date so diff() only touches the price columns, then take the
    # row-to-row difference.
    dj = dj.set_index('Date').diff(periods=1)

    # Add the date back as a regular column
    dj['Date'] = dj.index
    dj = dj.reset_index(drop=True)

    # Remove unneeded features. `columns=` is used because pandas 2.0 no
    # longer accepts `axis` as a positional argument to drop().
    dj = dj.drop(columns=['High', 'Low', 'Close', 'Volume', 'Adj Close'])

    # Remove the first row (its diff is NaN)
    dj = dj[dj.Open.notnull()]

    # Group the headlines by date once instead of re-filtering the whole news
    # frame for every price row. Order within a date follows the news file.
    headlines_by_date = {}
    for news_date, headline in zip(news['Date'], news['News']):
        headlines_by_date.setdefault(news_date, []).append(headline)

    # Two parallel lists: the trend (label) and the news (input strings)
    prices_trend = []
    headlines = []
    for date, open_diff in zip(dj['Date'], dj['Open']):
        prices_trend.append(1 if open_diff >= 0 else -1)
        # Copy so callers mutating one day's list cannot affect another.
        headlines.append(list(headlines_by_date.get(date, [])))

    return prices_trend, headlines

In [32]:
def clean_new(headlines):
    """
    Run ``clean_text`` over every headline while keeping the per-day grouping.

    :param headlines: iterable of days, each an iterable of headline strings
    :return: list of lists with the same nesting, holding the cleaned strings
    """
    return [
        [clean_text(item) for item in day]
        for day in headlines
    ]

In [15]:
def dailynews_to_vector(news_corpos):
    """
    Embed each headline of one day as the mean of its word vectors.

    Relies on two module-level names: ``embedding_dim`` and ``model``.
    NOTE(review): ``model`` is not assigned in any visible cell; presumably it
    is the result of ``load_glove_model`` — confirm it is defined before use.

    Words for which ``model[word]`` raises ``KeyError`` contribute a random
    vector drawn uniformly from [-1, 1), so results are not deterministic
    when such words occur.

    :param news_corpos: iterable of headline strings for a single day
    :return: numpy array of shape ``(number of headlines, embedding_dim)``;
             a headline with no tokens yields a zero vector
    """
    # Collect in a list: ndarrays have no append(), which made the original
    # np.empty(...).append(...) fail with AttributeError.
    news_vectors = []
    for news in news_corpos:
        words = news.split()
        single_news_vec = np.zeros(embedding_dim)
        for word in words:
            try:
                single_news_vec += np.asarray(model[word])
            except KeyError:
                single_news_vec += np.random.uniform(-1.0, 1.0, embedding_dim)

        # Guard against 0/0 (NaN) for an empty headline.
        if words:
            single_news_vec /= len(words)
        news_vectors.append(single_news_vec)

    if not news_vectors:
        # Keep the second dimension stable even when there are no headlines.
        return np.empty((0, embedding_dim))
    return np.array(news_vectors)
    

In [None]:
def mean_pooling(dailynews):
    """
    Collapse one day's headline vectors into a single vector by averaging.

    NOTE(review): the original definition had no body; averaging over the
    headline axis is inferred from the function name — confirm the intent.

    :param dailynews: array-like of shape ``(number of headlines, dim)``
    :return: numpy array of shape ``(dim,)`` holding the element-wise mean;
             all zeros when there are no headlines
    """
    vectors = np.asarray(dailynews, dtype=float)
    if vectors.size == 0:
        # np.mean of an empty array would return NaN with a RuntimeWarning.
        return np.zeros(vectors.shape[1:])
    return vectors.mean(axis=0)

In [None]:
def news_embedding(headlines):
    """
    Turn every day's headlines into one fixed-size vector.

    Each day is embedded with ``dailynews_to_vector`` and the resulting
    per-headline vectors are averaged element-wise.
    NOTE(review): the original called ``append()`` with no argument, so the
    pooling step is inferred (mean over headlines) — confirm the intent.

    :param headlines: iterable of days, each an iterable of headline strings
    :return: list with one numpy vector per day; a day without headlines
             yields a zero vector of length ``embedding_dim``
    """
    headlines_vector = []
    for dailynews in headlines:
        day_news = np.asarray(dailynews_to_vector(dailynews))
        if day_news.size == 0:
            # Avoid NaN from averaging an empty array.
            day_vector = np.zeros(embedding_dim)
        else:
            day_vector = day_news.mean(axis=0)
        headlines_vector.append(day_vector)

    return headlines_vector

In [None]:
def creat_dataset(prices_trend,headlines,time_step):
    """
    Slice the series into sliding windows of ``time_step`` consecutive days.

    For window start ``i`` the sample is ``headlines[i:i+time_step]``, the
    target is ``prices_trend[i+time_step]`` and the label history is
    ``prices_trend[i:i+time_step]``.

    :param prices_trend: sequence of per-day labels
    :param headlines: sequence of per-day inputs, aligned with ``prices_trend``
    :param time_step: window length in days
    :return: ``(x, y, y_seq)`` as numpy arrays
    """
    x,y,y_seq = [],[],[]
    # NOTE(review): the trailing "- 1" leaves the final possible window unused
    # (start index len - time_step - 1 would still be in range). Kept as-is to
    # preserve the dataset size — confirm whether this is intentional.
    for i in range(len(prices_trend) - time_step - 1):
        last = i + time_step
        x.append(headlines[i:last])
        y.append(prices_trend[last])
        y_seq.append(prices_trend[i:last])
    # Fixed: the original returned np.array(y.seq), an AttributeError on a list.
    return np.array(x),np.array(y),np.array(y_seq)
    

In [36]:
# Build the labels and the raw per-date headlines, then clean every headline.
prices_trend,headlines = get_data_from_csv()
# print(headlines[0])
clean_headlines = clean_new(headlines)

# print(clean_headlines[0])
# NOTE(review): this prints the full label list; consider printing a slice.
print(prices_trend)


# clean_headlines holds one list of cleaned headlines per price row (date).
# Disabled scratch code: pickle round-trip of the first day's cleaned headlines.
# print(clean_headlines[0])
# news1 = clean_headlines[0]
# f = open('news1.txt','wb')
# pickle.dump(news1,f)

# f = open('news1.txt','rb')
# anews1 = pickle.load(f)

# print(anews1)

1989
2943
1989
1989
[-1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, 1, 1, -1,