# Stockwatch

In [136]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import gc
import re

In [137]:
df_md = pd.read_csv('for_cloud.csv')
df_stockwatch = pd.read_csv('merged_data_stockwatch.csv')

In [138]:
df_md['shifted_close'] = df_md.groupby('ticker')['Close'].shift(-1)
df_md['tgt'] = (df_md.shifted_close - df_md.Close).div(df_md.shifted_close) * 100
df_md['target'] = 0
df_md.loc[(df_md['tgt'] >= 0), 'target'] = 1
df_md.loc[(df_md['tgt'] < 0), 'target'] = 0
df_md.drop([ 'Sector', 'shifted_close', 'tgt'], axis = 1 , inplace = True)

In [139]:
df_stockwatch.head()

Unnamed: 0,Date,ticker,title
0,2018-12-19 06:30,MMM,3M to Acquire M*Modal ’s Technology Business
1,2018-12-05 02:00,MMM,3M Announces Formal Requirement at COP24 for A...
2,2018-11-20 13:32,MMM,3M Announces Upcoming Investor Events
3,2018-11-15 13:56,MMM,3M Responding to California Wildfires with N95...
4,2018-11-15 06:30,MMM,"3M CEO Introduces Priorities for the Future, S..."


In [140]:
df_stockwatch['Date'] = pd.to_datetime(df_stockwatch['Date'])
df_stockwatch['Date'] = df_stockwatch['Date'].dt.date
df_md['Date'] = pd.to_datetime(df_md['Date'])
df_md['Date'] = df_md['Date'].dt.date

In [141]:
res = pd.merge(df_md, df_stockwatch, on = ['ticker', 'Date'], how = 'left')

In [142]:
res['title'].isna().sum()

1047086

In [143]:
res.head()

Unnamed: 0,ticker,Date,Adj Close,Close,High,Low,Open,Volume,Name,target,title
0,AAL,2008-12-31,7.405228,7.73,7.87,7.48,7.48,4194100.0,American Airlines Group,1,
1,AAL,2009-01-02,8.037499,8.39,8.48,7.67,7.73,5167000.0,American Airlines Group,0,
2,AAL,2009-01-05,7.980019,8.33,8.39,7.96,8.38,3457100.0,American Airlines Group,1,
3,AAL,2009-01-06,8.679349,9.06,9.21,8.13,8.15,5731000.0,American Airlines Group,1,
4,AAL,2009-01-07,8.698509,9.08,9.47,8.66,8.66,5468900.0,American Airlines Group,1,


In [144]:
stockwatch_not_null = res[res['title'].notnull()]
fcls = ['index', 'label','alpha','title']

In [145]:
stockwatch_not_null = stockwatch_not_null.loc[:, ['title', 'target']]
stockwatch_not_null['index'] = stockwatch_not_null.index
stockwatch_not_null['alpha'] = 'a'
stockwatch_not_null.rename(index = str, columns = {'target': 'label'}, inplace = True)
stockwatch_not_null = stockwatch_not_null[fcls]

In [146]:
stockwatch_not_null.shape

(259928, 4)

In [147]:
stockwatch_not_null.shape[0] * 0.05

12996.400000000001

In [148]:
stockwatch_not_null.head()

Unnamed: 0,index,label,alpha,title
119,119,0,a,Anglo American's Coal Division in South Africa...
137,137,0,a,Anglo American Introduces Sir John Parker as C...
146,146,0,a,Anglo American CEO Interviewed on 2009 Interim...
285,285,1,a,Anglo American Announces Operating Profit Of $...
397,397,0,a,Anglo American CEO Cynthia Carroll Interviewed...


In [149]:
X_train, X_test, y_train, y_test = train_test_split(stockwatch_not_null, stockwatch_not_null['label'], test_size = 0.99, random_state= 1337, stratify = stockwatch_not_null['label']  )
X_train, X_test, y_train, y_test = train_test_split(X_test, X_test['label'], test_size = 0.1, random_state= 1337, stratify = X_test['label'])

#here I need final X_train, X_test(val)

In [152]:
stockwatch_not_null.drop(['label', 'alpha'], axis = 1, inplace = True)

In [153]:
X_train.to_csv('data_stockwatch_final_preds/train.tsv', sep='\t', index=False, header=False)
X_test.to_csv('data_stockwatch_final_preds/dev.tsv', sep='\t', index=False, header=False)
stockwatch_not_null.to_csv('data_stockwatch_final_preds/test.tsv', sep='\t', index=False, header=True)

# Reuters

In [15]:
import pandas as pd
df_md = pd.read_csv('for_cloud.csv')
#df_reut = pd.read_csv('full_sentiment_news-1201.csv')
df_reut = pd.read_csv('own_data_test_sentiments.csv')

tickers = pd.read_csv('../constituents_csv.csv')
tickers = tickers.rename(index = str, columns = {'Symbol': 'ticker'})

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
df_md['shifted_close'] = df_md.groupby('ticker')['Close'].shift(-1)
df_md['tgt'] = (df_md.shifted_close - df_md.Close).div(df_md.shifted_close) * 100
df_md['target'] = 0
df_md.loc[(df_md['tgt'] >= 0), 'target'] = 1
df_md.loc[(df_md['tgt'] < 0), 'target'] = 0
df_md.drop([ 'Sector', 'shifted_close', 'tgt'], axis = 1 , inplace = True)

In [17]:
df_reut.head()

Unnamed: 0,Date,Sname,title,Name
0,2009-01-05,American Airlines,American Airlines Reports December Traffic,American Airlines Group
1,2009-01-06,American Airlines,Allied Pilots Association Critical of American...,American Airlines Group
2,2009-01-12,American Airlines,American Airlines Starts the New Year Right Wi...,American Airlines Group
3,2009-01-15,American Airlines,"American Airlines Launches Newly Redesigned, M...",American Airlines Group
4,2009-01-16,American Airlines,American Airlines and Association of Professio...,American Airlines Group


In [18]:
data = pd.merge(df_reut, tickers, how = 'left', on = 'Name')

In [19]:
df_reut['Date'] = pd.to_datetime(df_reut['Date'])
df_reut['Date'] = df_reut['Date'].dt.date
df_md['Date'] = pd.to_datetime(df_md['Date'])
df_md['Date'] = df_md['Date'].dt.date

In [20]:
res1 = pd.merge(df_md, df_reut, on = ['Name', 'Date'], how = 'left')

In [21]:
res1['title'].isna().sum()

994813

In [22]:
print(df_reut.Date.min(), df_reut.Date.max())

2008-12-31 2018-10-09


In [23]:
print(df_md.Date.min(), df_md.Date.max())

2008-12-31 2018-11-09


In [24]:
res1.head()

Unnamed: 0,ticker,Date,Adj Close,Close,High,Low,Open,Volume,Name,target,Sname,title
0,AAL,2008-12-31,7.405228,7.73,7.87,7.48,7.48,4194100.0,American Airlines Group,1,,
1,AAL,2009-01-02,8.037499,8.39,8.48,7.67,7.73,5167000.0,American Airlines Group,0,,
2,AAL,2009-01-05,7.980019,8.33,8.39,7.96,8.38,3457100.0,American Airlines Group,1,American Airlines,American Airlines Reports December Traffic
3,AAL,2009-01-06,8.679349,9.06,9.21,8.13,8.15,5731000.0,American Airlines Group,1,American Airlines,Allied Pilots Association Critical of American...
4,AAL,2009-01-07,8.698509,9.08,9.47,8.66,8.66,5468900.0,American Airlines Group,1,,


In [25]:
#rnt.columns

In [26]:
rnt = res1[res1['title'].notnull()]

rnt_test = pd.DataFrame(rnt.groupby(['ticker', 'Date'])['title'].agg(lambda col: ' '.join(col)))
rnt_test = rnt_test.reset_index()

In [27]:
to_join = pd.DataFrame(rnt.groupby(['ticker', 'Date'])['target'].max())
to_join = to_join.reset_index()

In [28]:
merged = pd.merge(rnt_test, to_join, on = ['ticker', 'Date'], how = 'left')

In [29]:
merged = merged.reset_index()

In [30]:
merged['alpha'] = 'a'

In [31]:
fcls = ['index', 'label','alpha','title']

In [32]:
merged = merged.loc[:, ['title', 'target']]
merged['index'] = merged.index
merged['alpha'] = 'a'
merged.rename(index = str, columns = {'target': 'label'}, inplace = True)
merged = merged[fcls]

In [33]:
merged.head()

Unnamed: 0,index,label,alpha,title
0,0,1,a,Agilent Technologies Announces Upcoming Webcas...
1,1,0,a,TRADE NEWS: Agilent Technologies Adds Custom m...
2,2,1,a,TRADE NEWS: Agilent Technologies` New Solder P...
3,3,0,a,Hydrogen Adds Agilent Technologies` Electronic...
4,4,0,a,TRADE NEWS: Agilent Technologies Delivers Indu...


In [34]:
merged['label'].describe()

count    259607.000000
mean          0.499840
std           0.500001
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: label, dtype: float64

In [132]:
merged.shape[0]*0.004 * 0.3

311.52840000000003

In [133]:
X_train, X_test, y_train, y_test = train_test_split(merged, merged['label'], test_size = 0.004, random_state= 1337, stratify = merged['label']  )
X_train, X_test, y_train, y_test = train_test_split(X_test, X_test['label'], test_size = 0.3, random_state= 1337, stratify = X_test['label'])

#here I need final X_train, X_test(val)

In [134]:
merged.drop(['label', 'alpha'], axis = 1, inplace = True)

In [135]:
merged.shape

(259607, 2)

In [136]:
first_sub, second_sub, y_train, y_test = train_test_split(merged, merged['title'], test_size = 0.25, random_state= 1337)


In [137]:
def cleanText(text):
        
    text = text.replace('\\n','')
    text = text.replace('\\','')
    #text = re.sub('\[(.*?)\]','',text) #removes [this one]
    text = re.sub('(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?\s',
                ' __url__ ',text) #remove urls
    #text = re.sub('\'','',text)
    #text = re.sub(r'\d+', ' __number__ ', text) #replaces numbers
    #text = re.sub('\W', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.replace('\t', '')
    text = text.replace('\n', '')
    return text

In [138]:
X_train['title'] = X_train['title'].apply(cleanText)
X_test['title'] = X_test['title'].apply(cleanText)
merged['title'] = merged['title'].apply(cleanText)

first_sub['title'] = first_sub['title'].apply(cleanText)
second_sub['title'] = second_sub['title'].apply(cleanText)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [139]:
X_train.to_csv('data_reuters_final_preds/train.tsv', sep='\t', index=False, header=False)
X_test.to_csv('data_reuters_final_preds/dev.tsv', sep='\t', index=False, header=False)
merged.to_csv('data_reuters_final_preds/test_full.tsv', sep='\t', index=False, header=True)
first_sub.to_csv('data_reuters_final_preds/first_test.tsv', sep='\t', index=False, header=True)
second_sub.to_csv('data_reuters_final_preds/second_test.tsv', sep='\t', index=False, header=True)

In [34]:
df_results = pd.read_csv("data_reuters_final_preds/first_test.tsv",sep="\t",header=0)
df_results1 = pd.read_csv("data_reuters_final_preds/second_test.tsv",sep="\t",header=0)

In [43]:
df_results['length'] = df_results['title'].str.len()

In [45]:
df_results['length'].max()

34888

# Reuters + Stockwatch. Sentiments

In [36]:
import pandas as pd
import numpy as np

In [37]:
df_stockwatch = pd.read_csv('merged_data_stockwatch.csv')
df_reut = pd.read_csv('own_data_test_sentiments.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
df_stockwatch.head()

Unnamed: 0,Date,ticker,title
0,2018-12-19 06:30,MMM,3M to Acquire M*Modal ’s Technology Business
1,2018-12-05 02:00,MMM,3M Announces Formal Requirement at COP24 for A...
2,2018-11-20 13:32,MMM,3M Announces Upcoming Investor Events
3,2018-11-15 13:56,MMM,3M Responding to California Wildfires with N95...
4,2018-11-15 06:30,MMM,"3M CEO Introduces Priorities for the Future, S..."


In [39]:
df_reut.head()

Unnamed: 0,Date,Sname,title,Name
0,2009-01-05,American Airlines,American Airlines Reports December Traffic,American Airlines Group
1,2009-01-06,American Airlines,Allied Pilots Association Critical of American...,American Airlines Group
2,2009-01-12,American Airlines,American Airlines Starts the New Year Right Wi...,American Airlines Group
3,2009-01-15,American Airlines,"American Airlines Launches Newly Redesigned, M...",American Airlines Group
4,2009-01-16,American Airlines,American Airlines and Association of Professio...,American Airlines Group


In [40]:
df_stockwatch.shape[0]

279876

In [41]:
df_reut.shape[0]

887405

In [42]:
df_stockwatch.drop(['Date', 'ticker'], axis = 1, inplace = True)
df_reut.drop(['Date', 'Sname', 'Name'], axis = 1, inplace = True)

df_general = pd.concat([df_reut, df_stockwatch], axis = 0) 
df_general.reset_index(inplace = True) #old indices: 887405, 279876
df_general['index'] = df_general.index

In [43]:
df_general.shape[0]* 0.003

3501.843

In [148]:
df_mod_test = df_general.sample(500)

In [149]:
df_mod_test.to_csv('data_sentiments/test_model.tsv', sep='\t', index=False, header=True)

In [150]:
df_general.to_csv('data_sentiments/test.tsv', sep='\t', index=False, header=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167281 entries, 0 to 1167280
Data columns (total 2 columns):
index    1167281 non-null int64
title    1167281 non-null object
dtypes: int64(1), object(1)
memory usage: 17.8+ MB


# Preparing train from kaggle data

In [151]:
df = pd.read_csv('kaggle_train_sentiments.csv')
gc.collect()

41

In [152]:
df.head()

Unnamed: 0,headline,sentimentClass
0,China's Daqing pumps 43.41 mln tonnes of oil i...,-1
1,"FEATURE-In kidnapping, finesse works best",-1
2,PRESS DIGEST - Wall Street Journal - Jan 1,-1
3,PRESS DIGEST - New York Times - Jan 1,-1
4,PRESS DIGEST - New York Times - Jan 1,-1


In [153]:
lst_digest = []
for j in range(df.shape[0]):
    if 'DIGEST' not in str(df.iloc[j, 0]) and 'BUZZ-U.S.' not in str(df.iloc[j, 0]) and 'FACTBOX-U.S.' not in str(df.iloc[j, 0]):
        lst_digest.append(j)

In [154]:
len(lst_digest)

9177473

In [155]:
df = df.iloc[lst_digest, :]
df['index'] = df.index
df['alpha'] = 'a'

df.rename(index = str, columns = {'headline': 'title', 'sentimentClass': 'label'}, inplace= True)
df.columns
fcls = ['index', 'label','alpha','title']

df = df[fcls]

In [156]:
df.head()

Unnamed: 0,index,label,alpha,title
0,0,-1,a,China's Daqing pumps 43.41 mln tonnes of oil i...
1,1,-1,a,"FEATURE-In kidnapping, finesse works best"
9,9,1,a,Tenet Completes Sale of Alvarado Hospital Medi...
10,10,0,a,RPT-Wall St Week Ahead: Mild jobs may lift sto...
11,11,0,a,RPT-Wall St Week Ahead: Mild jobs may lift sto...


In [157]:
df['label'] = df['label'] + 1

In [158]:
df.shape[0]*0.03 * 0.07 *0.25

4818.173325000001

In [159]:
X_train, X_test, y_train, y_test = train_test_split(df, df['label'], test_size = 0.03*0.07, random_state= 1337, stratify = df['label'])
X_train, X_test, y_train, y_test = train_test_split(X_test, X_test['label'], test_size = 0.25, random_state= 1337, stratify = X_test['label'])

X_train.dropna(axis = 0, inplace = True)
X_test.dropna(axis = 0, inplace = True)

In [160]:
X_train.to_csv('data_sentiments/train.tsv', sep='\t', index=False, header=False) #~15000
X_test.to_csv('data_sentiments/dev.tsv', sep='\t', index=False, header=False)  #4818

In [161]:
gc.collect()

137

In [179]:
X_test['title_len'] = X_test['title'].str.len()

(14454, 4)

# Check

In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import gc
import re
from sklearn.metrics import accuracy_score

In [131]:
label_stockwatch = stockwatch_not_null['label']
#label_reuters = merged['label']

In [132]:
pred_stockwatch = pd.read_csv('outputs/tr0705_stockwatch.tsv' ,sep="\t",header=None)
pred_stockwatch = pd.DataFrame({'index':pred_stockwatch.index,
                               'label':pred_stockwatch.idxmax(axis=1)})

# pred_reuters = pd.read_csv('outputs/test_results_reuters.tsv', sep = "\t", header= None)
# pred_reuters = pd.DataFrame({'index':pred_reuters.index,
#                                'label':pred_reuters.idxmax(axis=1)})

In [133]:
accuracy_score(label_stockwatch, pred_stockwatch['label'])

0.4772475454741313

Unnamed: 0,index,label
count,259928.0,259928.0
mean,129963.5,0.032009
std,75034.894722,0.176024
min,0.0,0.0
25%,64981.75,0.0
50%,129963.5,0.0
75%,194945.25,0.0
max,259927.0,1.0
