In [27]:
import pandas as pd
import numpy as np
import re
import os
import time


In [5]:

os.chdir('Patent Claims')

Exploratory\ Data\ Analysis.ipynb  citations.csv
Patent\ Claims			   office_actions.csv
Patent\ ICL\ ICC.ipynb		   pat_to_year.txt
Scraper				   patent_claims_fulltext.csv
Text\ Cleaner.ipynb		   rejections.csv
Untitled.ipynb			   stanford_patent_data.csv
Untitled1.ipynb


In [6]:
!dir

claim_2000_2014.csv  claim_2000_2014_v001.csv  desktop.ini


In [8]:
df = pd.read_csv('claim_2000_2014_v001.csv')

In [14]:

df.head()

347 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8216150 entries, 0 to 8216149
Data columns (total 3 columns):
pat_no       object
claim_no     int64
claim_txt    object
dtypes: int64(1), object(2)
memory usage: 188.1+ MB


In [15]:

df.isna().sum()

1.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Here we are going to drop rows with no claim text.

We are also going to drop rows that are dependent claims and only focus on claims that are independent claims. All independent claims are marked with a 0


In [12]:
df.dropna(subset=['claim_txt'],inplace=True)

In [13]:
df.head()

Unnamed: 0,pat_no,claim_no,claim_txt
0,8697278,17,17. Battery comprising an interior of the batt...
1,7385756,81,81. A catadioptric projection objective for im...
2,7387146,1,1. A heavy duty tire comprising a tread portio...
3,7387253,43,43. A system comprising: (a) a optical reader ...
4,7387278,17,17. A parachute ripcord pin for holding a para...


Looks like there are some text cleaning that will need to be done.

In [30]:
start = time.time()
df['claim_txt']= df['claim_txt'].apply(lambda x : re.sub('\d{1,}\.\s',"",x))
end = time.time()
print(end - start)

418.2266128063202


In [29]:

start = time.time()

df['claim_txt']= df['claim_txt'].apply(lambda x : x.lower())

end = time.time()
print(end - start)

300.4651348590851


In [31]:
df.head()

Unnamed: 0,pat_no,claim_no,claim_txt
0,8697278,17,battery comprising an interior of the battery ...
1,7385756,81,a catadioptric projection objective for imagin...
2,7387146,1,"a heavy duty tire comprising a tread portion, ..."
3,7387253,43,a system comprising: (a) a optical reader incl...
4,7387278,17,a parachute ripcord pin for holding a parachut...


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8216031 entries, 0 to 8216149
Data columns (total 3 columns):
pat_no       object
claim_no     int64
claim_txt    object
dtypes: int64(1), object(2)
memory usage: 250.7+ MB


Now it's time to determine the ICC or independant claim count( this simply counts the number of independant claims per patent

Next will be the (ICL) or the claim length for the shortest independent claim

In [35]:
#import spacy
#import gensim
import lexnlp


#nlp = spacy.load('en_core_web_lg')

In [52]:
print(lexnlp.nlp.en.__file__)

/home/zdwhite/anaconda3/envs/py36/lib/python3.6/site-packages/lexnlp/nlp/en/__init__.py


In [29]:
## Cleaning text in claims.
## This function is deprecated as i'm not using NLP to tokenize each claim.
## This tokenization would take forever
def txt_clean(claim):
    sent=[]
    for token in nlp(claim):
        if (token.is_stop == False) and (token.pos_!='PUNCT'):
            sent.append(token.text)
    sent = "".join(sent)
    return sent
        

In [62]:
## This is a list of stop words that is taken from the "lexNLP package"
## I saved their list locally as their module was having a tonne of issues this will save you a major headache
legal_STOP = list(pd.read_csv('stopwords.csv')['0'])

In [64]:
# Remove stop words
start = time.time()
df['claim_txt'] = [" ".join([word for word in document.lower().split() if word not in legal_STOP])
       for document in df['claim_txt']]
end = time.time()
print(end - start)

2444.904415369034


In [67]:
## Calculate and append claim length
start = time.time()

df['ICL'] = [len(claim.split()) for claim in df['claim_txt']]

end = time.time()
print(end - start)

39.00309872627258


In [69]:
## Calculate and append number of independent claims
start = time.time()
ICC = df.groupby(by="pat_no").count().reset_index().loc[:,['pat_no','ICL']]
ICC.columns=['pat_no', 'ICC']

df = pd.merge(df,ICC,how='outer',on='pat_no')

end = time.time()
print(end - start)

25.110410690307617


In [70]:
df.head()

Unnamed: 0,pat_no,claim_no,claim_txt,ICL,ICC
0,8697278,17,battery comprising interior battery active ele...,106,2
1,8697278,1,battery cell casing comprising first casing el...,97,2
2,7385756,81,catadioptric projection objective imaging patt...,108,33
3,7385756,94,catadioptric projection objective imaging patt...,116,33
4,7385756,79,catadioptric projection objective imaging patt...,103,33


In [119]:
# I was able to download a complete dataset from the stanford law school I had to clean that dataset

## read in all patent numbers that have been associated with a court case since 2000
df_spl=pd.read_csv('spl_final.csv')
# clean the list
pat_spl = []
df_spl['patents']=df_spl['patents'].astype(str)
for row in df_spl['patents']:
    pat_spl.extend(row.split(';'))
    
pat_spl = list(set(pat_spl))

# clean the pat numbers
pat_spl= [re.sub('\s',"",x) for x in pat_spl]
pat_spl = pd.DataFrame(pat_spl,columns=['pat_no'])
# setting the dependent variable
pat_spl['litigation']=1

In [253]:
# Merge litigated patents as our dependent variable
# drop patents not in our larger dataset
start = time.time()
df = pd.merge(df,pat_spl,how='outer',on='pat_no')
df['litigation'].fillna(0,inplace=True)
df.dropna(axis=0,inplace=True)
end = time.time()
print(end - start)

16.859753608703613


In [254]:
# How imbalanced is our dataset at this point
# HOLY MOLY ~1.4% of all patents issued between 2000 and 2014 were in a litigation case
# just over a 1 in 100 chance of litigation 
df['litigation'].value_counts(normalize=True)

0.0    0.985667
1.0    0.014333
Name: litigation, dtype: float64

In [255]:
df.head()

Unnamed: 0,pat_no,claim_no,claim_txt,ICL,ICC,litigation
0,8697278,17.0,battery comprising interior battery active ele...,106.0,2.0,0.0
1,8697278,1.0,battery cell casing comprising first casing el...,97.0,2.0,0.0
2,7385756,81.0,catadioptric projection objective imaging patt...,108.0,33.0,0.0
3,7385756,94.0,catadioptric projection objective imaging patt...,116.0,33.0,0.0
4,7385756,79.0,catadioptric projection objective imaging patt...,103.0,33.0,0.0


In [257]:
# Compress data types pat number has strings embded in some numbers so can't be casted to str
df['claim_no']=df['claim_no'].astype('int')
df['ICL']=df['ICL'].astype('int')
df['ICC']=df['ICC'].astype('int')
df['litigation']=df['litigation'].astype('int')

In [258]:
# Save the final datset to be modeled
start = time.time()
df.to_csv('claim_2000_2014_cleaned.csv',index=False)
end = time.time()
print(end - start)

137.33904910087585


In [202]:
# This encodes the year that a patent was issued relative to it's patent number
## Possible future work to append the year of the patents issue with the year the patent is litigated

pat_no_year = pd.read_csv('pat_to_year.txt')

In [203]:
pat_no_year.tail(20)

Unnamed: 0,Issue_Year,First_Utility,First_Design,First_Plant,First_Reissue,First_Statutory_Invention_Registration
21,1997,5590420,D377107,PP09776,RE35418,H001623
22,1998,5704062,D388585,PP10172,RE35708,H001701
23,1999,5855021,D403485,PP10743,RE36021,H001766
24,2000,6009555,D418273,PP11169,RE36479,H001826
25,2001,6167569,D435713,PP11728,RE37006,H001930
26,2002,6334220,D452599,PP12314,RE37489,H002008
27,2003,6502244,D468073,PP13447,RE37954,H002057
28,2004,6671884,D484671,PP14441,RE38377,H002093
29,2005,6836899,D500396,PP15460,RE38680,H002113
30,2006,6981282,D513356,PP16176,RE38928,H002137
