In [1]:
# import standard libraries
from datetime import timedelta
import numpy as np
# import third-party libraries
import gensim
from gensim import corpora
import pandas as pd
# import local libraries

# Feature engineering

In [2]:
# load patent data
patents = pd.read_csv('data/patents/clean/patents.csv.gz', compression='gzip')
patents['text'] = patents['text'].apply(lambda x: x.split(' '))

In [3]:
# load returns
stocks = pd.read_csv('data/returns/clean/adj_stock_returns.csv', index_col=0)
stocks.index = pd.to_datetime(stocks.index)

In [4]:
# load lda model
num_topics = 100
ldamodel = gensim.models.ldamodel.LdaModel.load(f'data/nlp/model{num_topics}.gensim')

In [5]:
patents.columns

Index(['app_date', 'app_number', 'assignee_organization',
       'cited_patent_number', 'citedby_patent_number', 'cpc_group_title',
       'cpc_subgroup_title', 'cpc_subsection_title', 'num_inventor',
       'nber_subcategory_title', 'patent_abstract', 'patent_date',
       'patent_num_claims', 'patent_number', 'patent_processing_time',
       'patent_title', 'uspc_mainclass_title', 'uspc_subclass_title',
       'wipo_field_title', 'wipo_sector_title', 'ticker', 'text'],
      dtype='object')

In [6]:
# remove columns from patents
keep_col = ['app_date', 'patent_date', 'app_number', 'cited_patent_number',
           'num_inventor', 'patent_num_claims', 'ticker', 'text']

In [7]:
X = patents[keep_col]

In [8]:
X.loc[:, 'app_date'] = pd.to_datetime(X.loc[:, 'app_date'])
X.loc[:, 'patent_date'] = pd.to_datetime(X.loc[:, 'patent_date'])
X.sort_values(by='app_date', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
# create feature with number of patent applications filed in the last 60 days.
delay = 60 # days
X['num_app_prior'] = np.nan
for ticker, x in X.groupby('ticker'):
    for i, patent in x.iterrows():
        app_date_delayed = patent['app_date'] - timedelta(days=delay)
        num_app_prior = len(x[(x['app_date'] >= app_date_delayed) & (x['app_date'] < patent['app_date'])])
        X.at[i, 'num_app_prior'] = num_app_prior

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# create feature with topic extraction from lda modela
X[[f'topic{t}' for t in range(num_topics)]] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[k] = np.nan


In [11]:
# create corpora and dictionary
dictionary = corpora.Dictionary(list(patents['text'].values))

In [12]:
for i, x in X.iterrows():
    new_doc_bow = dictionary.doc2bow(x['text'])
    t_prob = ldamodel.get_document_topics(new_doc_bow, minimum_probability=0.0)
    for t in t_prob:
        X.at[i, 'topic'+str(t[0])] = t[1]

In [13]:
# drop columns that are not needed
X = X.drop(columns=['text'])

In [14]:
# create to X matrices, one with application dates and another with granted dates
X_app = X.drop(columns=['patent_date']).rename(columns={'app_date': 'date'})
X_patent = X.drop(columns=['app_date']).rename(columns={'patent_date': 'date'})

# Generate labels y

In [15]:
# generate labels y
days = 7

In [16]:
# remove all dates that are on 2010-01-04 (starting day of stocks) since we need to get 1 day of returns
# prior to date
X_app = X_app[X_app['date'] > '2010-01-04']
X_patent = X_patent[X_patent['date'] > '2010-01-04']

In [17]:
# define y label inside X (in order to keep track of indices and other metadata)
X_app['y_ret'] = np.nan
X_patent['y_ret'] = np.nan
X_app['y_bin'] = np.nan
X_patent['y_bin'] = np.nan

In [18]:
for i, x in X_app.iterrows():
    # get returns one day before date
    sub = stocks[x['ticker']]
    ret_before = sub[sub.index < x['date']].iloc[-1:]
    ret_after = sub[sub.index >= x['date']].iloc[:days-1]
    ret = pd.concat([ret_before, ret_after], axis=0)
    ret = (ret + 1).cumprod(axis=0).iloc[-1]-1
    X_app.at[i, 'y_ret'] = ret
    X_app.at[i, 'y_bin'] = 0 if ret < 0 else 1

In [19]:
for i, x in X_patent.iterrows():
    # get returns one day before date
    sub = stocks[x['ticker']]
    ret_before = sub[sub.index < x['date']].iloc[-1:]
    ret_after = sub[sub.index >= x['date']].iloc[:days-1]
    ret = pd.concat([ret_before, ret_after], axis=0)
    ret = (ret + 1).cumprod(axis=0).iloc[-1]-1
    X_patent.at[i, 'y_ret'] = ret
    X_patent.at[i, 'y_bin'] = 0 if ret < 0 else 1

In [20]:
X_app

Unnamed: 0,date,app_number,cited_patent_number,num_inventor,patent_num_claims,ticker,num_app_prior,topic0,topic1,topic2,...,topic92,topic93,topic94,topic95,topic96,topic97,topic98,topic99,y_ret,y_bin
8624,2010-01-05,12652152,8.0,4.0,7.0,VRTX,1.0,0.000175,0.000175,0.000175,...,0.076860,0.000175,0.000175,0.038320,0.000175,0.000175,0.000175,0.000175,-0.042213,0.0
1472,2010-01-06,12652837,37.0,5.0,3.0,LLY,0.0,0.000250,0.067045,0.036304,...,0.000250,0.000250,0.000250,0.000250,0.000250,0.000250,0.000250,0.000250,-0.004620,0.0
3903,2010-01-06,12683317,1.0,7.0,5.0,NVS,0.0,0.274955,0.000667,0.133980,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.117122,0.001442,1.0
3924,2010-01-06,12683191,103.0,4.0,54.0,NVS,0.0,0.000238,0.000238,0.000238,...,0.000238,0.000238,0.000238,0.000238,0.058978,0.000238,0.000238,0.000238,0.001442,1.0
5146,2010-01-07,12683791,1.0,9.0,6.0,BMY,0.0,0.000500,0.000500,0.000500,...,0.000500,0.000500,0.000500,0.000500,0.000500,0.000500,0.000500,0.000500,-0.012810,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8309,2019-03-18,16356895,4.0,5.0,19.0,LLY,1.0,0.000625,0.000625,0.000625,...,0.000625,0.063122,0.000625,0.000625,0.000625,0.000625,0.000625,0.000625,0.057051,1.0
8003,2019-03-28,16367778,1.0,5.0,1.0,JNJ,1.0,0.000303,0.122921,0.000303,...,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,-0.035293,0.0
7467,2019-04-04,16375047,1.0,6.0,10.0,GSK,0.0,0.001250,0.001250,0.001250,...,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,-0.025408,0.0
5060,2019-04-15,16384333,61.0,5.0,18.0,BIIB,0.0,0.000417,0.000417,0.042078,...,0.000417,0.000417,0.000417,0.000417,0.000417,0.000417,0.000417,0.000417,-0.028185,0.0


In [21]:
X_patent

Unnamed: 0,date,app_number,cited_patent_number,num_inventor,patent_num_claims,ticker,num_app_prior,topic0,topic1,topic2,...,topic92,topic93,topic94,topic95,topic96,topic97,topic98,topic99,y_ret,y_bin
4170,2013-01-29,12651654,6.0,2.0,18.0,MRK,0.0,0.000370,0.000370,0.000370,...,0.000370,0.000370,0.000370,0.000370,0.000370,0.000370,0.000370,0.000370,-0.050917,0.0
8690,2013-09-03,12651639,73.0,12.0,9.0,VRTX,0.0,0.043535,0.000222,0.000222,...,0.000222,0.000222,0.000222,0.000222,0.025563,0.000222,0.000222,0.000222,-0.006481,0.0
1052,2017-02-07,12651782,13.0,3.0,25.0,NVO,0.0,0.000833,0.000833,0.000833,...,0.000833,0.000833,0.000833,0.084172,0.000833,0.000833,0.000833,0.000833,-0.003373,0.0
8624,2011-02-08,12652152,8.0,4.0,7.0,VRTX,1.0,0.000175,0.000175,0.000175,...,0.076860,0.000175,0.000175,0.038320,0.000175,0.000175,0.000175,0.000175,-0.011648,0.0
1472,2014-03-18,12652837,37.0,5.0,3.0,LLY,0.0,0.000250,0.067045,0.036304,...,0.000250,0.000250,0.000250,0.000250,0.000250,0.000250,0.000250,0.000250,-0.037035,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8309,2019-11-12,16356895,4.0,5.0,19.0,LLY,1.0,0.000625,0.000625,0.000625,...,0.000625,0.063122,0.000625,0.000625,0.000625,0.000625,0.000625,0.000625,0.013002,1.0
8003,2019-09-17,16367778,1.0,5.0,1.0,JNJ,1.0,0.000303,0.122921,0.000303,...,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.012689,1.0
7467,2019-11-26,16375047,1.0,6.0,10.0,GSK,0.0,0.001250,0.001250,0.001250,...,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.001250,0.023256,1.0
5060,2019-10-22,16384333,61.0,5.0,18.0,BIIB,0.0,0.000417,0.000417,0.042078,...,0.000417,0.000417,0.000417,0.000417,0.000417,0.000417,0.000417,0.000417,0.334228,1.0


# Save data

In [22]:
filename = 'data/model/app_io.csv.gz'
X_app.to_csv(filename, compression='gzip', index=False)

In [23]:
filename = 'data/model/patent_io.csv.gz'
X_patent.to_csv(filename, compression='gzip', index=False)