In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the entries found in the input directory.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In [None]:
from kaggle.competitions import twosigmanews
# Create the competition environment object; the training data, the prediction
# days and the submission file are all obtained through `env` further down.
env = twosigmanews.make_env()

In [None]:
# The environment returns a pair: the market table first, the news table second.
market_train_data, news_train_data = env.get_training_data()

In [None]:
# Preview the first five rows of the news table.
news_train_data.head(5)

In [None]:
# Number of distinct values in the headlineTag column.
len(news_train_data['headlineTag'].unique())

In [None]:
# Keep only rows whose `time` is strictly later than 2012-01-01 22:00:00+0000,
# applying the same cutoff to the market table and to the news table.
market_train_data = market_train_data.loc[
    lambda frame: frame['time'] > '2012-01-01 22:00:00+0000'
]
news_train_data = news_train_data.loc[
    lambda frame: frame['time'] > '2012-01-01 22:00:00+0000'
]

In [None]:
# For market data, drop the erroneous rows: a close/open ratio of 2 or more, or of
# 0.5 or less, is treated as a data error.
#
# DataFrame.drop returns a NEW frame, so calling it row by row without keeping the
# result removes nothing. Filter once with a boolean mask and rebind the name instead.
# Rows whose ratio is NaN fail both comparisons and are therefore kept.
close_open_ratio = market_train_data['close'] / market_train_data['open']
is_erroneous = (close_open_ratio >= 2) | (close_open_ratio <= 0.5)
market_train_data = market_train_data.loc[~is_erroneous]
# The helper Series are only needed for the filter; do not leave them in the kernel.
del close_open_ratio, is_erroneous

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain

%matplotlib inline

In [None]:
# News columns to aggregate, in the order their aggregates should appear.
_NEWS_COLUMNS_TO_AGGREGATE = [
    'urgency',
    'bodySize',
    'marketCommentary',
    'wordCount',
    'sentenceCount',
    'companyCount',
    'relevance',
    'sentimentNegative',
    'sentimentNeutral',
    'sentimentPositive',
    'sentimentWordCount',
    'noveltyCount12H',
    'noveltyCount24H',
    'noveltyCount3D',
    'noveltyCount5D',
    'noveltyCount7D',
    'volumeCounts12H',
    'volumeCounts24H',
    'volumeCounts3D',
    'volumeCounts5D',
    'volumeCounts7D',
]

# Maps each news column to the statistics computed for it. Every column gets
# max/min/std except marketCommentary, for which only max/min are requested.
# Each value is its own list object.
news_cols_agg = {
    column: ['max', 'min'] if column == 'marketCommentary' else ['max', 'min', 'std']
    for column in _NEWS_COLUMNS_TO_AGGREGATE
}

In [None]:
def combine_market_news(market_train_df, news_train_df):
    """Join per-(time, assetName) news aggregates onto the market rows.

    The news columns named in the module-level ``news_cols_agg`` are grouped by
    ``time`` and ``assetName`` and summarised with the statistics listed there.
    The aggregate columns are renamed ``<column>_<statistic>`` and joined onto
    ``market_train_df`` through its ``time`` and ``assetName`` columns. The join
    uses pandas' default (left) behaviour, so market rows without a matching
    news group get NaN in the aggregate columns.

    Returns the joined frame; neither input frame is modified by this function.
    """
    group_keys = ['time', 'assetName']

    # Only the grouping keys and the columns that are aggregated are needed.
    news_subset = news_train_df[group_keys + list(news_cols_agg.keys())]

    # One row per (time, assetName); columns are (news column, statistic) pairs.
    news_stats = news_subset.groupby(group_keys).agg(news_cols_agg)
    del news_subset  # release the intermediate selection

    # Flatten the two-level column labels into "<column>_<statistic>" names.
    news_stats.columns = [
        '_'.join(label).strip() for label in news_stats.columns.values
    ]

    return market_train_df.join(news_stats, on=group_keys)

In [None]:
def get_xy(market_train_df, news_train_df):
    """Return ``(x, y)``: the feature frame and the clipped return target.

    ``x`` is whatever ``get_x`` builds from the two tables. ``y`` is the
    ``returnsOpenNextMktres10`` column of ``market_train_df`` clipped to
    [-1, 1] and returned as a NumPy array.
    """
    features = get_x(market_train_df, news_train_df)
    target = market_train_df['returnsOpenNextMktres10'].clip(-1, 1).values
    return features, target

def get_x(market_train_df, news_train_df):
    """Build the model feature frame from one market table and one news table.

    Side effects: the ``time`` column of BOTH input frames is overwritten in
    place (steps 1 and 2). Calling this twice on the same news frame therefore
    shifts its timestamps a second time.

    Steps:
      1. News timestamps are moved back 22 hours and rounded up to midnight, so
         news stamped after 22:00 on day D-1 and up to 22:00 on day D is dated
         day D at 00:00.
      2. Market timestamps are rounded down to 00:00 of their own day.
      3. News aggregates are joined on through ``combine_market_news``.
      4. ``returnsOpenNextMktres10`` and ``universe`` are removed when present;
         inputs that lack them are accepted.
      5. ``dayofweek`` and ``month`` are derived from ``time``, which is then
         dropped, and the two marketCommentary aggregates are cast to float.

    Returns the resulting frame. All other columns of the joined table are
    passed through unchanged.
    """
    # Split news at 22h (the time used in the train data), see step 1 above.
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22, 'h')).dt.ceil('1D')

    # Round market time down to 0h of the current day so both tables share a key.
    market_train_df['time'] = market_train_df['time'].dt.floor('1D')

    # Join market and news.
    x = combine_market_news(market_train_df, news_train_df)

    # Remove the target and the universe flag when they exist. errors='ignore'
    # tolerates their absence without a bare `except`, which would also hide
    # unrelated failures (and KeyboardInterrupt).
    x = x.drop(columns=['returnsOpenNextMktres10', 'universe'], errors='ignore')

    # Calendar features, then drop the raw timestamp.
    x['dayofweek'] = x['time'].dt.dayofweek
    x['month'] = x['time'].dt.month
    x = x.drop(columns='time')

    x['marketCommentary_max'] = x['marketCommentary_max'].astype(float)
    x['marketCommentary_min'] = x['marketCommentary_min'].astype(float)
    return x

In [None]:
# Build the feature frame together with the clipped returnsOpenNextMktres10 values,
# keep those values under their own name, and derive the boolean label
# (True when the clipped value is >= 0).
X, return_Next10 = get_xy(market_train_data, news_train_data)
y = return_Next10 >= 0

In [None]:
# Keep the two market columns that are still needed, then release both raw tables.
universe, time = market_train_data['universe'], market_train_data['time']
del market_train_data
del news_train_data

In [None]:
def feature_normalize(train_data):
    """Standardise ``train_data`` column-wise: subtract the mean, divide by the std.

    Mean and standard deviation are computed along axis 0 with ``np.mean`` and
    ``np.std``. The means are printed before the result is returned. No guard
    is applied for a zero standard deviation.
    """
    column_means = np.mean(train_data, axis=0)
    column_stds = np.std(train_data, axis=0)
    print(column_means)
    centered = train_data - column_means
    return centered / column_stds

In [None]:
# Training data: drop the identifier columns, standardise, take the first 80% of rows.
X.drop(columns=['assetCode', 'assetName'], inplace=True)
X = feature_normalize(X)
train_num = int(len(X) * 0.8)
X_train = X[:train_num]
y_train = y[:train_num]

# Validation data: the remaining rows, restricted to those whose universe value is > 0.
universe_valid = universe[train_num:] > 0
X_valid = X[train_num:][universe_valid]
y_valid = y[train_num:][universe_valid]
return_Next10 = return_Next10[train_num:][universe_valid]
time_valid = time[train_num:][universe_valid]

# The mask and the full-length helper columns are not needed afterwards.
del universe_valid
del universe
del time

In [None]:
# Replace every missing feature value with 0.
X = X.fillna(value=0)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
# Logistic regression with C=1, fitted on the complete feature frame X and label y.
model = LogisticRegression(C=1)
model.fit(X, y)

In [None]:
def make_predictions(predictions_template_df, market_obs_df, news_obs_df):
    """Fill ``predictions_template_df['confidenceValue']`` for one batch.

    The features are built exactly like the training features (``get_x``,
    identifier columns dropped, ``feature_normalize``, NaN -> 0). The module-level
    ``model`` then provides class probabilities and the confidence is
    ``2 * P(label is True) - 1``, i.e. a value in [-1, 1].

    The template is modified in place; nothing is returned. ``get_x`` also
    overwrites the ``time`` column of both observation frames.
    """
    features = get_x(market_obs_df, news_obs_df)
    features = features.drop(columns=['assetCode', 'assetName'])
    # NOTE(review): this standardises with the statistics of the current batch,
    # whereas training used the statistics of the training frame -- confirm this
    # is intended.
    features = feature_normalize(features)
    features = features.fillna(0)

    # predict_proba returns one column per class (shape: n_samples x n_classes).
    # Assigning the whole 2-D array to a single column does not work, so pick
    # the column that belongs to the True label.
    class_probabilities = model.predict_proba(features)
    true_column = list(model.classes_).index(True)

    # NOTE(review): assumes the template rows are in the same order as the rows
    # of market_obs_df -- verify against the competition API.
    predictions_template_df['confidenceValue'] = 2 * class_probabilities[:, true_column] - 1

In [None]:
# Iterable of prediction days; the loop below unpacks each item into
# (market observations, news observations, predictions template).
days = env.get_prediction_days()

In [None]:
# For every prediction day: fill in that day's template, then hand the filled
# template to the environment before moving on to the next day.
for market_obs_df, news_obs_df, predictions_template_df in days:
    make_predictions(predictions_template_df, market_obs_df, news_obs_df)
    env.predict(predictions_template_df)
print('Done!')

In [None]:
# Have the competition environment write out the submission file
# (presumably from the predictions passed to env.predict above -- confirm
# against the Kaggle API).
env.write_submission_file()