In [1]:
import pandas as pd
import os
from datetime import datetime
import json
import re

In [2]:
def remove_urls (vTEXT):
    """Remove every ``http://`` URL from ``vTEXT`` and return the cleaned text.

    A URL is taken to be ``http://`` followed by any run of letters, digits
    and the URL punctuation ``. ? / & = : - _ % # ~ +``.  The run may be
    empty, so a bare ``http://`` is removed as well.

    Parameters
    ----------
    vTEXT : str
        Text to clean.

    Returns
    -------
    str
        ``vTEXT`` with all matching URLs deleted (nothing is inserted in
        their place, so surrounding whitespace is left as-is).
    """
    # The character class includes '-', '_', '~', '%', '#' and '+' so that
    # URLs such as http://host/foo-bar_baz are removed whole instead of
    # leaving a "-bar_baz" fragment behind.
    return re.sub(r'http://[a-zA-Z0-9.?/&=:\-_%#~+]*', "", vTEXT)

def remove_urls_https (vTEXT):
    """Remove every ``https://`` URL from ``vTEXT`` and return the cleaned text.

    A URL is taken to be ``https://`` followed by any run of letters, digits
    and the URL punctuation ``. ? / & = : - _ % # ~ +``.  The run may be
    empty, so a bare ``https://`` is removed as well.

    Parameters
    ----------
    vTEXT : str
        Text to clean.

    Returns
    -------
    str
        ``vTEXT`` with all matching URLs deleted (nothing is inserted in
        their place, so surrounding whitespace is left as-is).
    """
    # The character class includes '-', '_', '~', '%', '#' and '+' so that
    # URLs such as https://host/foo-bar_baz are removed whole instead of
    # leaving a "-bar_baz" fragment behind.
    return re.sub(r'https://[a-zA-Z0-9.?/&=:\-_%#~+]*', "", vTEXT)

def remove_incomplete_urls (vTEXT):
    """Delete each literal ``http://…`` stub (scheme followed directly by the
    ellipsis character) from ``vTEXT`` and return the result."""
    truncated_url = re.compile(r'http://…', re.MULTILINE)
    return truncated_url.sub('', vTEXT)

def remove_double_line (vTEXT):
    """Delete every ``--`` pair from ``vTEXT`` and return the result.

    Pairs are consumed left to right, so an odd-length run of dashes leaves
    a single ``-`` behind.
    """
    double_dash = re.compile(r'--', re.MULTILINE)
    return double_dash.sub('', vTEXT)

def remove_RT_at (vTEXT):
    """Remove retweet prefixes of the form ``RT @<anything>:`` from ``vTEXT``.

    Parameters
    ----------
    vTEXT : str
        Text to clean.

    Returns
    -------
    str
        ``vTEXT`` with every ``RT @...:`` span deleted.  Text without a
        colon after ``RT @`` is returned unchanged.
    """
    # Non-greedy '.*?' stops at the FIRST colon after "RT @".  A greedy '.*'
    # would run to the last colon on the line and swallow the tweet body
    # (e.g. "RT @user: check this: wow" would lose "check this").
    return re.sub(r'RT @.*?:', "", vTEXT, flags=re.MULTILINE)

def remove_via_at (vTEXT):
    """Remove attributions of the form ``via @<anything>.`` from ``vTEXT``.

    Parameters
    ----------
    vTEXT : str
        Text to clean.

    Returns
    -------
    str
        ``vTEXT`` with every ``via @....`` span deleted.  Text without a
        period after ``via @`` is returned unchanged.
    """
    # Non-greedy '.*?' stops at the FIRST period after "via @".  A greedy
    # '.*' would run to the last period on the line and delete any sentences
    # that follow the attribution.
    return re.sub(r'via @.*?\.', "", vTEXT, flags=re.MULTILINE)

def remove_Ellipsis (vTEXT):
    """Delete each occurrence of two spaces followed by the ellipsis
    character from ``vTEXT`` and return the result."""
    spaced_ellipsis = re.compile(r'  …', re.MULTILINE)
    return spaced_ellipsis.sub("", vTEXT)

In [3]:
# Directory layout: every input/output folder lives under DATA_PATH.
DATA_PATH = "../data"
PRICE_PATH = f"{DATA_PATH}/price/"
TWEET_PATH = f"{DATA_PATH}/tweet/"
TSV_PATH = f"{DATA_PATH}/tsv/"

In [4]:
# input
# The stock name is spliced directly into file paths, so drop any stray
# leading/trailing whitespace typed at the prompt.
stock_name = input("Input the stock name of data: ").strip()

# Number of market days between a tweet and the price move used to label it.
delay = int(input("Input delay (number of day): "))
if delay < 0:
    # A negative delay would make market_days[i + delay] wrap around to the
    # end of the list and silently produce wrong labels.
    raise ValueError("delay must be a non-negative number of days, got %d" % delay)

Input the stock name of data: V
Input delay (number of day): 1


In [5]:
# load the daily price table for the selected stock from its csv file
price_csv_path = "".join([PRICE_PATH, stock_name, ".csv"])
price_df = pd.read_csv(price_csv_path)
price_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2012-09-04,32.09,32.287498,31.905001,32.130001,29.126745,10200800
1,2012-09-05,32.215,32.25,31.737499,31.9025,28.920511,12698800
2,2012-09-06,32.044998,32.375,31.950001,32.375,29.34885,15698400
3,2012-09-07,32.377499,32.490002,32.1675,32.427502,29.396441,10789200
4,2012-09-10,32.105,32.470001,32.105,32.169998,29.163008,11633600


In [6]:
# add label column to price_df: 1 when the day closed at or above its open, else 0
# (vectorised comparison instead of a Python-level apply over every row)
price_df['Label'] = (price_df['Close'] - price_df['Open'] >= 0).astype(int)

# Parse the date strings once, then use the result both as the 'Date' column
# and as the index.  The column is assigned first, while it still shares the
# frame's current index, so the values line up row for row.
trade_dates = pd.to_datetime(price_df['Date'], format = '%Y-%m-%d')
price_df['Date'] = trade_dates
price_df.index = pd.DatetimeIndex(trade_dates, name='Date')
price_df.head()

Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-09-04,2012-09-04,32.09,32.287498,31.905001,32.130001,29.126745,10200800,1
2012-09-05,2012-09-05,32.215,32.25,31.737499,31.9025,28.920511,12698800,0
2012-09-06,2012-09-06,32.044998,32.375,31.950001,32.375,29.34885,15698400,1
2012-09-07,2012-09-07,32.377499,32.490002,32.1675,32.427502,29.396441,10789200,1
2012-09-10,2012-09-10,32.105,32.470001,32.105,32.169998,29.163008,11633600,1


In [7]:
# Stage 1: read each tweet file and stamp its tweets with the correct stock date
# (tweets from an off-market day are mapped forward to the next market day)

# path name
dir_str = TWEET_PATH + stock_name

# Cleaning steps applied to every tweet text, in this order.
text_cleaners = (
    remove_urls,
    remove_urls_https,
    remove_incomplete_urls,
    remove_RT_at,
    remove_via_at,
    remove_double_line,
    remove_Ellipsis,
)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame one row at a time gets slower with every row.
tweet_records = []
current_date = None

# os.listdir() returns names in arbitrary order; sort for a deterministic run.
for i, filename in enumerate(sorted(os.listdir(dir_str))):
    try:
        date = datetime.strptime(filename, '%Y-%m-%d')
    except ValueError:
        # Not a YYYY-MM-DD tweet file (e.g. a hidden/system file): skip it.
        continue

    # Earliest market day on or after this file's date.  The lookup depends
    # only on this file's date, never on the previously processed file.
    later_days = price_df.index[price_df.index >= date]
    if len(later_days) == 0:
        # No price data on or after this date, so the tweets cannot be dated.
        continue
    current_date = later_days.min()

    # One JSON document per line; read as UTF-8 regardless of the OS locale.
    with open(dir_str + "/" + filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines
            tweet = json.loads(line)
            text = tweet['text']
            for clean in text_cleaners:
                text = clean(text)
            # 'id' and 'label' are placeholders that later cells fill in.
            tweet_records.append(
                {'date': current_date, 'id': 0, 'label': 0, 'alpha': 'a', 'text': text}
            )

    if i % 100 == 0:
        print("Finished processing %d file" % i)

# dataframe (default 0..n-1 index, in the order the tweets were read)
tweet_df = pd.DataFrame(tweet_records, columns=['date', 'id', 'label', 'alpha', 'text'])

Finished processing 0 file
Finished processing 100 file
Finished processing 200 file
Finished processing 300 file


KeyboardInterrupt: 

In [None]:
# overwrite the placeholder id column with each row's index value
tweet_df = tweet_df.assign(id=tweet_df.index)
tweet_df.head(10)

In [None]:
# Stage 2: add positive / negative label to tweet_df from the price labels
# Only days present in both the price table and the tweet table are used.
price_days = set(price_df['Date'].tolist())
tweet_days = set(tweet_df['date'].tolist())
market_days = sorted(price_days & tweet_days)

day_count = len(market_days)
for pos in range(day_count):
    target_pos = pos + delay

    # stop once the delayed day would fall past the last shared day
    if target_pos >= day_count:
        break

    tweet_day = market_days[pos]
    delayed_day = market_days[target_pos]

    # label of the delayed day, copied onto every tweet of tweet_day
    on_delayed_day = price_df.index == delayed_day
    day_label = price_df.loc[on_delayed_day, 'Label'].values[0]
    tweet_df.loc[tweet_df['date'] == tweet_day, 'label'] = day_label

In [None]:
# preview the first rows that received a positive (1) label
is_positive = tweet_df['label'] == 1
tweet_df.loc[is_positive].head()

In [None]:
# Output the labelled tweets, one file per stock: "<stock name, lower-cased>.tsv".
# The name is derived from stock_name so that processing a different stock no
# longer overwrites the same hard-coded file.
# NOTE(review): the file has a .tsv extension but is written comma-separated;
# the separator is left unchanged because downstream readers may rely on it.
tweet_df.to_csv(TSV_PATH + stock_name.lower() + ".tsv", sep=',', index=False)