## Competitive Intelligence: What has Tech Company X been up to?
### (Patent Analysis)

In [None]:
## Numeric / tabular tooling
import numpy as np
import pandas as pd
import string
## NLTK pieces used for text cleaning (stopword list, tokenizer, stemmer)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
## In the visible part of this notebook the stemmer is referenced only by the
## commented-out stemming step inside cleantext()
stemmer = PorterStemmer()
import nltk
## Fetch the NLTK "popular" data bundle; presumably this supplies the stopword
## corpus and tokenizer models that cleantext() needs -- confirm
nltk.download("popular")

## Patent document - example
http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=8&f=G&l=50&co1=AND&d=PTXT&s1=skorokhod.INNM.&s2=xerox.ASNM.&OS=IN/skorokhod+AND+AN/xerox&RS=IN/skorokhod+AND+AN/xerox

#### We have about 400 abstracts from patents filed by "Tech Company X" over the past couple of years

In [None]:
## Read the abstracts CSV from the case-study repository and keep only the
## `abstract` column as a NumPy array
abstract_csv_url = 'https://raw.githubusercontent.com/wsko/Text_Analytics_Case_Studies/master/abstract.csv'
patent_table = pd.read_csv(abstract_csv_url)
abstractRaw = patent_table.abstract.values
## Show the first five raw abstracts
abstractRaw[:5]

#### Let's vectorize the abstracts using a process similar to the one from the previous case study

In [None]:
def cleantext(s):
    """Tokenize an abstract and drop uninformative tokens.

    The text is split with NLTK's ``word_tokenize``, every token is
    lower-cased, and tokens that are English stopwords or belong to a small
    custom noise list are removed.

    Parameters
    ----------
    s : str
        Raw abstract text.

    Returns
    -------
    str
        The surviving lower-cased tokens joined by single spaces.
    """
    # Build both lookup sets once per call instead of fetching the stopword
    # list again for every token; set membership is also O(1).
    english_stopwords = set(stopwords.words('english'))
    # "provid" is a stemmed form, so it can only match if the optional
    # stemming step below is switched on.
    custom_noise = {".", ",", " ", "first", "second", "may", "provid", "least"}

    tokens = [token.lower() for token in word_tokenize(s)]
    tokens = [token for token in tokens if token not in english_stopwords]
    # Optional stemming step (disabled):
    # tokens = [stemmer.stem(token) for token in tokens]
    tokens = [token for token in tokens if token not in custom_noise]
    return ' '.join(tokens)

In [None]:
## Show the first abstract before and after cleaning, separated by a spacer line
first_abstract = abstractRaw[0]
print(first_abstract, "  ", cleantext(first_abstract), sep="\n")

In [None]:
## Clean every abstract into a separate array so the raw texts in abstractRaw
## stay untouched
abstract = abstractRaw.copy()
for i, raw_text in enumerate(abstractRaw):
    abstract[i] = cleantext(raw_text)

In [None]:
## Peek at the first five cleaned abstracts
abstract[:5]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
## Learn the vocabulary from the cleaned abstracts ...
vec = CountVectorizer().fit(abstract)
## ... then count term occurrences per abstract: the document-term matrix
dtm = vec.transform(abstract)

In [None]:
## Table of vocabulary terms ('Key') and their column position in the
## document-term matrix ('Number'), ordered by column position.
## The row index is deliberately left as-is after sorting.
term_to_column = vec.vocabulary_
vocab = pd.DataFrame(
    {'Number': list(term_to_column.values()), 'Key': list(term_to_column.keys())}
).sort_values(by='Number')

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
## Fit the TF-IDF weighting on the raw counts
tfidf = TfidfTransformer()
tfidf.fit(dtm)
## Re-weight the counts, then convert the result to a dense array for the
## steps that follow
TFIDF = tfidf.transform(dtm)
X = TFIDF.toarray()

In [None]:
## Dimensions of the dense TF-IDF array built from the document-term matrix
X.shape

In [None]:
## Total TF-IDF weight of each vocabulary term across all abstracts,
## heaviest terms first
corpus_term_weight = X.sum(axis=0)
topWords = pd.DataFrame({"Keys": vocab.Key, 'Freq': corpus_term_weight})
topWords = topWords.sort_values(by='Freq', ascending=False)
## TF-IDF matrix as a DataFrame with the terms as column labels
DTM = pd.DataFrame(X, columns=vocab.Key.values)
## First ten abstracts, restricted to the twenty heaviest terms
top_twenty_terms = topWords.Keys.values[:20]
DTM[top_twenty_terms].head(10)

#### Next, find document clusters

In [None]:
from sklearn.cluster import KMeans
## Fit one 5-cluster model and read the assignments from that same model, so
## `kmeans` and `ClusterID` describe the same clustering (and the data is only
## clustered once).
## random_state=None leaves the initialisation random: cluster numbering and
## membership can differ from run to run.
kmeans = KMeans(n_clusters=5, random_state=None).fit(X)
ClusterID = kmeans.labels_

In [None]:
## Count how many rows of X were assigned to each cluster ID
pd.Series(ClusterID).value_counts()

In [None]:
## For each of the five clusters, list the eight terms with the largest total
## TF-IDF weight among the abstracts assigned to it
for cluster in range(5):
    in_cluster = ClusterID == cluster
    cluster_weight = X[in_cluster].sum(axis=0)
    ranked_terms = pd.DataFrame({"Keys": vocab.Key, 'Freq': cluster_weight})
    ranked_terms = ranked_terms.sort_values(by='Freq', ascending=False)
    print("ClusterID", cluster, ranked_terms['Keys'].values[:8])

#### Finally, visualize each patent cluster as a word cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt 

In [None]:
## Cluster to visualise
N = 2
## Rank every vocabulary term by its total TF-IDF weight inside cluster N.
## The ranking is computed once and reused for the printout and the cloud.
cluster_terms = pd.DataFrame({"Keys": vocab.Key, 'Freq': X[ClusterID == N].sum(axis=0)})
cluster_terms = cluster_terms.sort_values(by='Freq', ascending=False)
## The word cloud is drawn from the 100 heaviest terms (term -> weight)
top_terms = cluster_terms[:100]
weights = dict(zip(top_terms['Keys'], top_terms['Freq'].tolist()))
print("ClusterID", N, cluster_terms['Keys'].values[:8])
wordcloud = WordCloud(width=400, height=400, background_color='white', min_font_size=10).generate_from_frequencies(weights)
plt.figure(figsize=(6, 6), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
#plt.tight_layout(pad = 0)
plt.show()

#### Bonus: how to scrape patent abstracts from the web

In [None]:
import requests
from bs4 import BeautifulSoup
## Patent number to look up
pnum = "10645525"
## The patent number is inserted three times into the search URL
## NOTE(review): the legacy USPTO PatFT service appears to have been retired;
## confirm this endpoint still responds before relying on it.
url_1 = 'http://patft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1='
url_2 = '.PN.&OS=PN/'
url_3 = '&RS=PN/'
url = url_1 + pnum + url_2 + pnum + url_3 + pnum
## A timeout stops the cell from hanging forever on an unresponsive server;
## the context manager releases the connection even if parsing fails.
with requests.get(url, timeout=30) as page:
    ## Fail loudly on an HTTP error instead of parsing an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
## Take the first text fragment of the 15th top-level node of the parsed page;
## presumably that is where this page layout carries the abstract -- the fixed
## position is brittle, verify against the actual markup.
list(list(soup.children)[14].stripped_strings)[0]