In [60]:
# import standard libraries
from datetime import timedelta
import numpy as np
# import third-party libraries
import gensim
from gensim import corpora
import pandas as pd
# import local libraries

# Feature engineering

In [5]:
# Read the cleaned, gzip-compressed patent table and turn each patent's
# space-separated text into a list of tokens.
patents = pd.read_csv('data/patents/clean/patents.csv.gz', compression='gzip')
patents['text'] = [document.split(' ') for document in patents['text']]

In [15]:
# Stock returns: the first CSV column is the index and is parsed into datetimes.
stocks = pd.read_csv('data/returns/clean/stock_returns.csv', index_col=0)
stocks = stocks.set_index(pd.to_datetime(stocks.index))
# Market (S&P 500) returns, loaded the same way.
snp500 = pd.read_csv('data/returns/clean/market_returns.csv', index_col=0)
snp500 = snp500.set_index(pd.to_datetime(snp500.index))

In [56]:
# Load the previously saved LDA model; the file name encodes the topic count.
num_topics = 15
ldamodel = gensim.models.LdaModel.load(f'data/nlp/model{num_topics}.gensim')

In [20]:
# columns of the patents table that are kept for feature engineering
keep_col = [
    'app_date',
    'app_number',
    'cited_patent_number',
    'num_inventor',
    'patent_num_claims',
    'ticker',
    'text',
]

In [21]:
# Feature table: the selected patent columns.
# Take an explicit copy so that the column assignments and `.at` writes in the
# following cells modify X itself and never a slice of `patents`
# (avoids SettingWithCopyWarning / writes that silently do not stick).
X = patents[keep_col].copy()

In [30]:
# Parse the application dates and order the patents chronologically.
# `assign` replaces the whole column, so the result is guaranteed to be a
# datetime column (an in-place `.loc[:, col] = ...` write may keep the old
# object dtype on recent pandas). Both steps return a new frame instead of
# mutating in place, so the cell can be re-run safely.
X = X.assign(app_date=pd.to_datetime(X['app_date'])).sort_values(by='app_date')

In [51]:
# create feature with number of patent applications filed in the last 60 days.
delay = 60 # days


def count_prior_applications(app_dates, window_days):
    """Count, for every date, how many dates fall in the preceding window.

    For each entry ``d`` of ``app_dates`` the result is the number of entries
    ``e`` of the same series with ``d - window_days <= e < d``. Entries on the
    same day as ``d`` (including ``d`` itself) are therefore not counted.

    Parameters
    ----------
    app_dates : pd.Series
        Application dates (anything ``pd.to_datetime`` accepts).
    window_days : int or float
        Length of the look-back window in days.

    Returns
    -------
    pd.Series
        Float counts, indexed like ``app_dates``. Missing dates get 0 and are
        never counted for other rows.
    """
    dates = pd.to_datetime(app_dates)
    # sorted, NaT-free reference array that the binary searches run against
    sorted_dates = np.sort(dates.dropna().to_numpy())
    window = pd.Timedelta(days=window_days).to_timedelta64()
    queries = dates.to_numpy()
    # side='left' -> number of reference dates strictly before the query
    before_date = np.searchsorted(sorted_dates, queries, side='left')
    before_window = np.searchsorted(sorted_dates, queries - window, side='left')
    counts = (before_date - before_window).astype(float)
    counts[dates.isna().to_numpy()] = 0.0
    return pd.Series(counts, index=app_dates.index)


# Rows whose ticker is missing are skipped by groupby and keep NaN.
X['num_app_prior'] = np.nan
for _, ticker_patents in X.groupby('ticker'):
    # one vectorised pass per company instead of a boolean mask per patent
    X.loc[ticker_patents.index, 'num_app_prior'] = count_prior_applications(
        ticker_patents['app_date'], delay)

In [65]:
# create one feature column per LDA topic, initialised to zero
for topic_id in range(num_topics):
    X[f'topic{topic_id}'] = 0.0

In [61]:
# Build the gensim token <-> id dictionary from the tokenised patent texts.
# NOTE(review): the dictionary is rebuilt here rather than taken from the
# loaded model; presumably it reproduces the vocabulary the LDA model was
# trained with -- confirm, otherwise the token ids will not line up.
dictionary = corpora.Dictionary(patents['text'].tolist())

In [70]:
# Infer the topic mixture of every patent and write it into the topic columns.
for row_label, tokens in X['text'].items():
    bow = dictionary.doc2bow(tokens)
    # topics missing from the model's answer keep the value already stored
    for topic_id, probability in ldamodel.get_document_topics(bow):
        X.at[row_label, f'topic{topic_id}'] = probability

In [75]:
# the token lists are no longer needed once the topic features exist
X = X.drop(labels=['text'], axis='columns')

In [76]:
# display the feature table built above (the notebook renders a truncated view)
X

Unnamed: 0,app_date,app_number,cited_patent_number,num_inventor,patent_num_claims,ticker,num_app_prior,topic0,topic1,topic2,...,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14
7568,2010-01-04,12651639,73.0,12.0,9.0,VRTX,0.0,0.566571,0.082374,0.0,...,0.000000,0.000000,0.036103,0.028766,0.053571,0.000000,0.000000,0.000000,0.000000,0.195348
2540,2010-01-04,12651654,6.0,2.0,18.0,MRK,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.055628,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1430,2010-01-04,12651782,13.0,3.0,25.0,NVO,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.420574,0.093662,0.000000,0.000000,0.000000,0.000000,0.345369
7507,2010-01-05,12652152,8.0,4.0,7.0,VRTX,1.0,0.415666,0.000000,0.0,...,0.000000,0.260008,0.000000,0.029020,0.000000,0.068315,0.000000,0.000000,0.000000,0.000000
7718,2010-01-06,12652837,37.0,5.0,3.0,LLY,0.0,0.000000,0.000000,0.0,...,0.358177,0.059831,0.000000,0.000000,0.030276,0.000000,0.000000,0.479128,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7138,2019-03-18,16356895,4.0,5.0,19.0,LLY,1.0,0.803053,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.142778,0.000000,0.000000,0.000000
7137,2019-03-18,16356882,4.0,4.0,16.0,LLY,1.0,0.754726,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.178605,0.000000,0.000000,0.000000
2781,2019-03-28,16367778,1.0,5.0,1.0,JNJ,1.0,0.000000,0.000000,0.0,...,0.308916,0.000000,0.000000,0.032342,0.000000,0.000000,0.000000,0.084904,0.102846,0.000000
4879,2019-04-15,16384333,61.0,5.0,18.0,BIIB,0.0,0.000000,0.000000,0.0,...,0.308028,0.000000,0.000000,0.000000,0.000000,0.154428,0.323510,0.000000,0.103669,0.000000


In [None]:
# generate labels
