NLP on words without stemming

In [1]:
# Import Packages
import pandas as pd
from nltk import word_tokenize
from gensim import models, corpora
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
import re
from gensim import similarities
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
import pyLDAvis.sklearn
output_notebook()

In [2]:
# Import the cleaned data
data = pd.read_csv("cleaned_data.csv")

In [3]:
# Remove first column
data =data.iloc[:,1:3]

In [4]:
# Let's look at the shape of the dataset
data.shape

(3890, 2)

In [5]:
# Tokenized the words for modeling
data['sentence'] = data['sentence'].apply(lambda x: word_tokenize(x) )

### LDA Model 
-> for Topic Modeling

In [6]:
# Create dictionary on the tokenzied data
dictionary = corpora.Dictionary(data['sentence'])

# Create corpus on the data
corpus = [dictionary.doc2bow(w) for w in data['sentence']]

# Set number of topics to 15
topics = 15

# Create LDA model
lda= models.LdaModel(corpus=corpus, num_topics=topics, id2word=dictionary)

In [7]:
for i in range(topics):
    print("[Topic %s] :" % i, lda.print_topic(i, 15))

[Topic 0] : 0.011*"act" + 0.009*"evidence" + 0.008*"court" + 0.008*"would" + 0.008*"said" + 0.007*"notice" + 0.007*"made" + 0.006*"may" + 0.006*"applicant" + 0.005*"also" + 0.005*"case" + 0.005*"order" + 0.005*"time" + 0.005*"first" + 0.005*"claim"
[Topic 1] : 0.014*"court" + 0.012*"application" + 0.010*"claim" + 0.010*"would" + 0.009*"made" + 0.009*"act" + 0.007*"may" + 0.006*"applicant" + 0.005*"said" + 0.005*"appeal" + 0.005*"whether" + 0.005*"scheme" + 0.005*"evidence" + 0.005*"case" + 0.005*"order"
[Topic 2] : 0.038*"tribunal" + 0.020*"appellant" + 0.013*"decision" + 0.012*"applicant" + 0.010*"evidence" + 0.009*"application" + 0.008*"court" + 0.008*"would" + 0.008*"appeal" + 0.007*"made" + 0.007*"act" + 0.007*"review" + 0.007*"information" + 0.007*"hearing" + 0.007*"federal"
[Topic 3] : 0.014*"court" + 0.013*"act" + 0.011*"would" + 0.008*"made" + 0.008*"may" + 0.007*"order" + 0.007*"respondent" + 0.006*"application" + 0.006*"evidence" + 0.005*"first" + 0.005*"claim" + 0.005*"case"

In [18]:
# Let's try some sentence from the document and see which topic it belongs to
def clean(text):
    stop = stopwords.words('english')
    text = text.lower()
    text = word_tokenize(text)
    return text

In [19]:
# Got this text from 06_13.xml which has name: Skymaker Holdings Pty Ltd v Jadjet Pty Ltd [2006] FCA 13 (20 January 2006)
text = """The Brochure represented that the child care business had a weekly turnover of $7250,
was strong and profitable in trading and had the potential to expand.In particular,
I held that it was open to the respondents to submit that the Commonwealth was precluded from recovering its loss for the reasons outlined in their letters to the applicants and to contest the recoverability and quantification of any loss specified by the applicants."""

In [20]:
text = re.sub(r'[^\w\s]','',text)

In [25]:
text

'The Brochure represented that the child care business had a weekly turnover of 7250\nwas strong and profitable in trading and had the potential to expandIn particular\nI held that it was open to the respondents to submit that the Commonwealth was precluded from recovering its loss for the reasons outlined in their letters to the applicants and to contest the recoverability and quantification of any loss specified by the applicants'

In [21]:
bow = dictionary.doc2bow(clean(text)) 
print(lda[bow]) 

[(5, 0.3610212), (6, 0.5977088)]


This means this paragrah belongs to the topic 11 with probability of 95.56%
Let's see the similar document with this one and give the names of the most 5 similar documents

In [22]:
# Get the similar topics 
lda_index = similarities.MatrixSimilarity(lda[corpus])

similarities = lda_index[lda[bow]]

# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda i: -i[1])

In [23]:
# Let's see what's the most 5 similar documents from the original documents
for i in similarities[:5]:
    print(data.iloc[i[0],0])
    print("=="*30)

E amp J Gallo Winery v Lion Nathan Australia Pty Limited 2008 FCA 934 20 June 2008
Television Food Network GP v Food Channel Network Pty Ltd No 2 2009 FCA 271 27 March 2009
Global Brand Marketing Inc v YD Pty Ltd 2008 FCA 605 7 May 2008
Colorado Group Limited v Strandbags Group Pty Limited No 2 2006 FCA 880 7 July 2006
Bing Software Pty Ltd v Bing Technologies Pty Limited No 1 2008 FCA 1760 25 November 2008


In [53]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_df = 0.9, min_df = 0.01, stop_words = 'english', lowercase = True,
                      token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
vectdata= vect.fit_transform(clean_data['sentence'])

In [54]:
trunc= TruncatedSVD(n_components=2)
t_data = vectdata.T
trunc_word = trunc.fit_transform(t_data)
 
temp = pd.DataFrame(columns=['x', 'y', 'text'])

temp['text'] = vect.get_feature_names()
temp['x'] = trunc_word[:,0]
temp['y'] = trunc_word[:, 1]
source = ColumnDataSource(ColumnDataSource.from_df(temp))

plt = figure(plot_width= 600, plot_height=550)
plt.circle("x", "y", size=10, source=source, fill_alpha=0.5)
label = LabelSet(x = 'x', y = 'y', text = 'text', 
                 text_font_size = '10pt', text_align = 'center', 
                 source = source, y_offset = 5)
plt.add_layout(label)
show(plt, notebook_handle=True)

pyLDAvis.sklearn package is good for visualizing the Topic Model. Let's use it to show

Because this package need the LDA model created by sklearn. So I need to create new one now

In [86]:
df = pd.read_csv("cleaned_data.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,name,sentence
0,0,University of Western Australia v Gray No 21 2...,application brought university western gray me...
1,1,Parker v Parker 2009 FCA 930 25 August 2009,introduction applicant review decision adminis...


In [89]:
df = pd.read_csv("cleaned_data.csv")
vect = CountVectorizer(min_df=0.01, max_df=0.9)
vect_data = vect.fit_transform(df.iloc[:,2])
 
# Build LDA model
n_topics= 15
lda= LatentDirichletAllocation(n_components=n_topics, max_iter=10, learning_method='online')
lda_Z = lda.fit_transform(vect_data)

In [90]:
# Dash board with relative words is showed below
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, vect_data, vect, mds='tsne')

panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
