In [1]:
import re
from math import log
# import Python packages as needed
# NOTE: you can choose to install/use external packages

# Question 1: word association mining

## Basic statistics of the corpus

In [2]:
review_filepath = './amazon_reviews.txt'

In [3]:
# calculate the number of reviews
num_reviews = 0
num_positive_reviews = 0
num_negative_reviews = 0
with open(review_filepath) as f:
    f.readline() # skip header (the first line) 
    for line in f:
        num_reviews += 1
        if line.strip().split()[0] == '1':
            num_positive_reviews += 1
        else:
            num_negative_reviews += 1
print ('total number of reviews:', num_reviews)
print ('total number of positive reviews:', num_positive_reviews)
print ('total number of negative reviews:', num_negative_reviews)

total number of reviews: 30000
total number of positive reviews: 15091
total number of negative reviews: 14909


## Count frequency of single words

In [4]:
def process_text(text):
    # replace punctuations with blank spaces
    for punctuations in [',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']']:
        text = text.replace(punctuations, ' ')
    # eliminate duplicated whitespaces using wildcards
    text = re.sub('\s+', ' ', text)
    # convert to lowercases
    text = text.lower().strip()
    return text

In [5]:
# Count the frequency of single words (aka. unigrams) in the corpus
# Parameter:
#       filepath: file path of amazon_review.txt
# Return: 
#       a dictionary, key = word, value = word frequency

def get_single_word_frequency(filepath):
    word_freq = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            # split tabs(arrows) at the beginning and the following reviews
            review_text = process_text(line.split('\t')[1])
            # split reviews into single words
            for word in review_text.split():
                if word not in word_freq:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1
    return word_freq

In [6]:
# list every single word and its frequency
word_freq = get_single_word_frequency(review_filepath)

# By using a key for sorted, we can sort the values according to the result of applying the key function to each value. Since lambda creates a callable (specifically, a function), we can use one for the key. Here the result is sorted in terms of the frequency, which the index is 1.
for word, freq in sorted(word_freq.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(word, freq)

the 119835
and 64619
i 63045
a 60750
to 57968
it 47813
of 47382
this 44363
is 41185
in 27962


In [7]:
total_num_words = sum(word_freq.values())
print ('number of unique words:', len(word_freq))
print ('total number of word occurrences:', total_num_words)

number of unique words: 69037
total number of word occurrences: 2384094


## Count frequency of ordered pair of words in a text window

In [8]:
# Count the number of text windows that contain an ordered pair of words
# Parameter:
#       filepath: file path of amazon_review.txt
#       window_size: the size of a text window (measured in number of words)
# Return: 
#       a dictionary, key = ordered word pair (a tuple), 
#                     value = number of text windows containing this pair

def get_ordered_word_pair_frequency(filepath, window_size):
    pair_freq = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            review_text = process_text(line.split('\t')[1])
            word_list = review_text.split()
            for i in range(len(word_list)):
                for j in range(i + 1, len(word_list)):
                    # only consider pairs of words no more than window_size apart  
                    if j - i + 1 >= window_size:
                        break
                    # put this ordered word pair into a tuple
                    order_word_pair = (word_list[i], word_list[j])
                    # accumulate counts
                    if order_word_pair not in pair_freq:
                        pair_freq[order_word_pair] = 1
                    else:
                        pair_freq[order_word_pair] += 1
    return pair_freq

In [9]:
TEXT_WINDOW_SIZE = 5
pair_freq = get_ordered_word_pair_frequency(review_filepath, TEXT_WINDOW_SIZE)
for pair, freq in sorted(pair_freq.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(pair, freq)

('of', 'the') 15208
('the', 'of') 12728
('to', 'the') 11803
('this', 'is') 10832
('the', 'the') 10615
('and', 'the') 10479
('in', 'the') 9092
('the', 'and') 8728
('the', 'is') 8420
('is', 'a') 8106


## Calculate pointwise mutual information for each ordered pair

In [10]:
# calculate the pointwise mutual information for this pair of words 
# PMI(xi, yj) = log P(X = xi, Y = yj)/ P(X = xi)P(Y = yj)

WORD_PAIR_FREQUENCY_THRESHOLD = 50
pmi_per_pair = {}
for pair, freq in pair_freq.items():
    if freq < WORD_PAIR_FREQUENCY_THRESHOLD: 
    # filter out infrequent word pairs
        continue
        
    if pair[0] in word_freq and pair[1] in word_freq:
        # freq: frequency of this word pair
        # word_freq[pair[0]]: frequency of the first word in the pair, notice it is not pair_freq
        # word_freq[pair[1]]: frequency of the second word in the pair, notice it is not pair_freq
        # total_num_words: total number of words in the corpus (i.e. corpus size), notice it is word occurrence
        Pxy = freq/total_num_words
        Px = word_freq[pair[0]]/total_num_words
        Py = word_freq[pair[1]]/total_num_words
        pmi_per_pair[pair] = log(Pxy/(Px*Py))
        continue 

In [11]:
# sort word pairs in pmi_per_pair by their PMI from highest to lowest. Show the top 10 pairs.
print('---Pointwise Mutual Information---')
for pair, pmi in sorted(pmi_per_pair.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(pair, pmi)

---Pointwise Mutual Information---
('blah', 'blah') 9.949864089900075
('sci', 'fi') 9.696379184529272
('hip', 'hop') 9.670191811272797
('harry', 'potter') 9.626615142058197
('stainless', 'steel') 9.42888259103066
('blu', 'ray') 8.925332853777995
('buyer', 'beware') 8.688618897930821
('windows', 'xp') 8.453848288341772
('tech', 'support') 7.970746600541408
('web', 'site') 7.969335218781741


**Comment**: from the result above I would say most of them make sense.

# Question 2: feature selection using Chi-square statistic

## For each word, count how many positive (negative) documents it appears in

In [12]:
# Count the number of documents that has a specified sentiment and contain a single word  
# Parameter:
#       filepath: file path of amazon_review.txt
#       label: string '0' (negative) or '1' (positive).   
# Return: 
#       a dictionary, key = word, value = word frequency

def get_single_word_doc_frequency_per_label(filepath, label):
    word_freq_per_label = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            sentiment_label = line.split('\t')[0].strip()
            if sentiment_label == label:
                review_text = process_text(line.split('\t')[1])
                for word in set(review_text.split()):
                    if word not in word_freq_per_label:
                        word_freq_per_label[word] = 1
                    else:
                        word_freq_per_label[word] += 1
    return word_freq_per_label

In [13]:
# number of positive documents that contain a word 
positive_word_freq = get_single_word_doc_frequency_per_label(review_filepath, '1')
for word, freq in sorted(positive_word_freq.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(word, freq)

the 13245
and 12526
a 11891
to 11156
this 11082
i 10334
is 10106
of 9998
it 9793
in 8182


In [14]:
# number of negative documents that contain a word 
negative_word_freq = get_single_word_doc_frequency_per_label(review_filepath, '0')
for word, freq in sorted(negative_word_freq.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(word, freq)

the 13615
and 11743
a 11675
to 11463
i 11415
this 11379
it 10358
of 10128
is 9420
not 8236


## Calculate Chi-square statistic for each word

In [15]:
# contingency table per word:
#                                             sentiment
#                       positive                            negative
#               ------------------------------------------------------------------------
#       present | word present, positive sentiment | word present, negative sentiment |  
# word          ------------------------------------------------------------------------
#       absent  | word absent,  positive sentiment | word absent, negative sentiment  |  
#               ------------------------------------------------------------------------
#     

In [16]:
chisqr_per_word = {}
for word, freq in word_freq.items():
    # filter infrequent words
    if freq < 10:
        continue
    if word in positive_word_freq and word in negative_word_freq:        
        # calculate the Chi-square statistic for this word
        # use the following variables
        # positive_word_freq[word]: number of positive reviews where this word is present
        # negative_word_freq[word]: number of negative reviews where this word is present 
        # num_positive_reviews: number of positive reviews in total 
        # num_negative_reviews: number of negative reviews in total
        # num_reviews: total number of reviews in the corpus
        
        # Observed count of cells
        Obs_1_1 = positive_word_freq[word]
        Obs_1_0 = negative_word_freq[word]
        Obs_0_1 = num_positive_reviews - Obs_1_1
        Obs_0_0 = num_negative_reviews - Obs_1_0
        
        # Expected count of cells
        exp_1_1 = num_positive_reviews * (Obs_1_1 + Obs_1_0)/num_reviews
        exp_1_0 = num_negative_reviews * (Obs_1_1 + Obs_1_0)/num_reviews
        exp_0_1 = num_positive_reviews * (Obs_0_1 + Obs_0_0)/num_reviews
        exp_0_0 = num_negative_reviews * (Obs_0_1 + Obs_0_0)/num_reviews
        
        # Summation 
        chi_1_1 = (Obs_1_1 - exp_1_1)**2/exp_1_1
        chi_1_0 = (Obs_1_0 - exp_1_0)**2/exp_1_0
        chi_0_1 = (Obs_0_1 - exp_0_1)**2/exp_0_1
        chi_0_0 = (Obs_0_0 - exp_0_0)**2/exp_0_0
        
        chisqr_per_word[word] = chi_1_1 + chi_1_0 + chi_0_1 + chi_0_0
        
        continue
        

In [17]:
# sort words in chi2_per_word by their Chi-square value from highest to lowest. Show the top 10 words.
# words that clearly associate with positive or negative sentiment
print('---Chi-Square---')
for word, chisqr in sorted(chisqr_per_word.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(word, chisqr)

---Chi-Square---
great 2259.208635662306
not 2016.4733034985088
waste 1300.8223017245875
money 1204.9485759018141
love 769.4901479541076
best 757.4894506044782
poor 667.987842854487
worst 663.642219565316
excellent 646.8180379179539
disappointed 636.8233634415676


In [18]:
# words that do not clearly associate with positive or negative sentiment
print('---Chi-Square---')
for word, chisqr in sorted(chisqr_per_word.items(), key = lambda x: x[1] < 3.841, reverse = True)[:10]:
    print(word, chisqr)

---Chi-Square---
all 2.6200474577313497
seals 0.10348902656726948
a 1.0553774185217923
old 1.764604534350523
one 2.185089539818895
neighbors 1.3179408693942627
neighbor 2.0476351063038174
worth 0.7866421059068137
chose 0.002385381442201088
first 2.5235155922170582


null hypothesis：the certain word has no correlation with the sentiment.

alternative hypothesis: the certain word has correlation with the sentiment.

degree of freedom = 1 * 1 = 1, we set α = 0.05, the corresponding value of χ² = 3.841

For words with χ² > 3.841, the null hypothesis is not true. we have 95% confidence to say they are correlated with the sentiment.

For words with χ² < 3.841, we can not turn down the null hypothesis. we have 95% confidence to say they are not correlated with the sentiment.

In [19]:
# another way to do the chisqr test by using the stats package 
# (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)
from scipy import stats

chisqr_per_word = {}
for word, freq in word_freq.items():
    # filter infrequent words
    if freq < 10:
        continue
    if word in positive_word_freq and word in negative_word_freq:          
        # Observed count of cells
        Obs_1_1 = positive_word_freq[word]
        Obs_1_0 = negative_word_freq[word]
        Obs_0_1 = num_positive_reviews - Obs_1_1
        Obs_0_0 = num_negative_reviews - Obs_1_0
        
        observed = [[Obs_1_1,Obs_1_0],[Obs_0_1,Obs_0_0]]
        out = stats.chi2_contingency(observed=observed,correction=False)
        chisqr_per_word[word] = out[0]
        continue

print('---Chi-Square---')
for word, chisqr in sorted(chisqr_per_word.items(), key = lambda x: x[1], reverse = True)[:10]:
    print(word, chisqr)

---Chi-Square---
great 2259.208635662306
not 2016.4733034985088
waste 1300.8223017245875
money 1204.9485759018141
love 769.4901479541076
best 757.4894506044782
poor 667.987842854487
worst 663.642219565316
excellent 646.8180379179539
disappointed 636.8233634415676


# Question 3: spell correction using letter n-grams

In [20]:
# import the file
a_list_filepath = './enwiktionary.a.list'

In [21]:
# add a new object at the end of the list, remove line breaks from data 
a_list = []
with open(a_list_filepath) as f:
    for line in f:
        a_list.append(line.strip())

In [22]:
print ('number of words/phrases in the list:', len(a_list))

number of words/phrases in the list: 305868


In [23]:
# Represent a string with a set of n-grams
def chunk_word_into_letter_ngrams(word, n):
    ngrams = []
    for i in range(len(word)-n+1):
        ngrams.append( word[i : i+n] )
    return set(ngrams)

In [24]:
print (chunk_word_into_letter_ngrams('hello world', 3))

{'o w', 'wor', 'rld', 'lo ', 'llo', ' wo', 'hel', 'orl', 'ell'}


In [25]:
# You need a function that calculates the edit distance for any pair of words
# (You can use an external package to calculate edit distance, e.g. the "editdistance" package)
import editdistance

def editdistance_sort(string,wordlist):
    sort = {}
    for word in wordlist:
        sort[word] = editdistance.eval(string,word)
    return sorted(sort.items(), key = lambda x: x[1])

In [26]:
# For each given string, you need to find a list of 10 correctly-spelled words from enwiktionary.a.list 
# that have the _lowest_ edit distance to the given word

print('---abreviation(Edit_tri)---')
print(editdistance_sort('abreviation',a_list)[:10])
print('\n---abstrictiveness(Edit_tri)---')
print(editdistance_sort('abstrictiveness',a_list)[:10]) 
print('\n---accanthopterigious(Edit_tri)---')
print(editdistance_sort('accanthopterigious',a_list)[:10]) 
print('\n---artifitial inteligwnse(Edit_tri)---')
print(editdistance_sort('artifitial inteligwnse',a_list)[:10])  
print('\n---agglumetation(Edit_tri)---')
print(editdistance_sort('agglumetation',a_list)[:10]) 

---abreviation(Edit_tri)---
[('abbreviation', 1), ('abbreviatio', 2), ('abbreviations', 2), ('alleviation', 2), ('abbreviationi', 2), ('abbreviatione', 2), ('abreviaron', 2), ('adbreviatio', 2), ('adbreviationi', 2), ('adbreviatione', 2)]

---abstrictiveness(Edit_tri)---
[('abstractiveness', 1), ('absorptiveness', 3), ('attractiveness', 3), ('abortiveness', 4), ('abstersiveness', 4), ('abstractedness', 4), ('abstractness', 4), ('assertiveness', 4), ('abstrictions', 4), ('attributiveness', 4)]

---accanthopterigious(Edit_tri)---
[('acanthopterygious', 2), ('acanthopterous', 4), ('acanthopterygians', 4), ('acanthopterygian', 5), ('acanthopterygii', 5), ('acanthopodious', 6), ('acanthophorous', 6), ('acanthopteri', 6), ('acanthopterans', 6), ('acanthocarpous', 7)]

---artifitial inteligwnse(Edit_tri)---
[('artificial intelligence', 4), ('artificial intelligences', 5), ('artificial life', 9), ('artificial insemination', 9), ('artificialities', 9), ('artificial horizons', 9), ('artificial p

In [27]:
# You also need a function that can calculate the Jaccard similarity for any pair of words
def Jaccard_Similarity(A,B):
    intersection = A & B
    union = A | B
    J = len(intersection)/len(union)
    return J

In [28]:
# For each given string, you need to find a list of 10 correctly-spelled words from enwiktionary.a.list
# that have the _highest_ n-gram Jaccard similarity to the given word
# Different lengths of the n-grams (i.e., different n) will likely produce a different list 

**Comment**: from the result below I would say the trigrams work best. (4-grams and 5-grams do not get the right result for the 5th word "agglutination".

In [29]:
# Jaccard Similarity-trigrams
def Jaccard_tri(string, wordlist):
    trigram_str = chunk_word_into_letter_ngrams(string,3)
    sort = {}
    for word in wordlist:
        trigram_word = chunk_word_into_letter_ngrams(word, 3)
        sort[word] = Jaccard_Similarity(trigram_str,trigram_word)
    return sorted(sort.items(),key = lambda x: x[1],reverse = True) 

In [30]:
print('---abreviation(Jacc_tri)---')
print(Jaccard_tri('abreviation',a_list)[:10]) 
print('\n---abstrictiveness(Jacc_tri)---')
print(Jaccard_tri('abstrictiveness',a_list)[:10]) 
print('\n---accanthopterigious(Jacc_tri)---')
print(Jaccard_tri('accanthopterigious',a_list)[:10]) 
print('\n---artifitial inteligwnse(Jacc_tri)---')
print(Jaccard_tri('artifitial inteligwnse',a_list)[:10])  
print('\n---agglumetation(Jacc_tri)---')
print(Jaccard_tri('agglumetation',a_list)[:10]) 

---abreviation(Jacc_tri)---
[('abbreviation', 0.7272727272727273), ('abbreviations', 0.6666666666666666), ('abbreviationi', 0.6666666666666666), ('abbreviatione', 0.6666666666666666), ('adbreviationi', 0.6666666666666666), ('adbreviatione', 0.6666666666666666), ('abbreviatio', 0.6363636363636364), ('adbreviatio', 0.6363636363636364), ('abreviativo', 0.6363636363636364), ('abreviativa', 0.6363636363636364)]

---abstrictiveness(Jacc_tri)---
[('abstractiveness', 0.625), ('activeness', 0.5), ('addictiveness', 0.5), ('astrictive', 0.5), ('abstriction', 0.4666666666666667), ('abstricting', 0.4666666666666667), ('astrictives', 0.4666666666666667), ('abstrict', 0.46153846153846156), ('abstrictions', 0.4375), ('activenesses', 0.4375)]

---accanthopterigious(Jacc_tri)---
[('acanthopterygious', 0.55), ('acanthopteri', 0.5294117647058824), ('acanthopterous', 0.47368421052631576), ('acanthopteran', 0.42105263157894735), ('acanthopterans', 0.4), ('acanthopterygii', 0.38095238095238093), ('acanthopte

In [31]:
# Jaccard Similarity-bigrams
def Jaccard_bi(string, wordlist):
    bigram_str = chunk_word_into_letter_ngrams(string,2)
    sort = {}
    for word in wordlist:
        bigram_word = chunk_word_into_letter_ngrams(word,2)
        sort[word] = Jaccard_Similarity(bigram_str,bigram_word)
    return sorted(sort.items(),key = lambda x: x[1],reverse = True)

In [32]:
print('---abreviation(Jacc_bi)---')
print(Jaccard_bi('abreviation',a_list)[:10]) 
print('\n---abstrictiveness(Jacc_bi)---')
print(Jaccard_bi('abstrictiveness',a_list)[:10]) 
print('\n---accanthopterigious(Jacc_bi)---')
print(Jaccard_bi('accanthopterigious',a_list)[:10]) 
print('\n---artifitial inteligwnse(Jacc_bi)---')
print(Jaccard_bi('artifitial inteligwnse',a_list)[:10])  
print('\n---agglumetation(Jacc_bi)---')
print(Jaccard_bi('agglumetation',a_list)[:10])   

---abreviation(Jacc_bi)---
[('abbreviation', 0.9090909090909091), ('abbreviations', 0.8333333333333334), ('abbreviationi', 0.8333333333333334), ('abbreviatione', 0.8333333333333334), ('abbreviatio', 0.8181818181818182), ('abbreviationis', 0.7692307692307693), ('abbreviationem', 0.7692307692307693), ('abbreviationes', 0.7692307692307693), ('abbreviationum', 0.7692307692307693), ('abbreviati', 0.7272727272727273)]

---abstrictiveness(Jacc_bi)---
[('abstractiveness', 0.75), ('astrictives', 0.6), ('activeness', 0.5333333333333333), ('abstivesen', 0.5333333333333333), ('astrictive', 0.5333333333333333), ('abstivesse', 0.5333333333333333), ('abstivesses', 0.5333333333333333), ('addictiveness', 0.5294117647058824), ('absorptiveness', 0.5), ('abstersiveness', 0.5)]

---accanthopterigious(Jacc_bi)---
[('acanthopterygious', 0.7368421052631579), ('acanthopterous', 0.6666666666666666), ('acanthopteri', 0.6470588235294118), ('acanthopteran', 0.5555555555555556), ('acanthopterygian', 0.55), ('acanth

In [33]:
# Jaccard Similarity-4grams
def Jaccard_four(string, wordlist):
    fourgram_str = chunk_word_into_letter_ngrams(string,4)
    sort = {}
    for word in wordlist:
        fourgram_word = chunk_word_into_letter_ngrams(word, 4)
        sort[word] = Jaccard_Similarity(fourgram_str,fourgram_word)
    return sorted(sort.items(),key = lambda x: x[1],reverse = True)

In [34]:
print('---abreviation(Jacc_four)---')
print(Jaccard_four('abreviation',a_list)[:10]) 
print('\n---abstrictiveness(Jacc_four)---')
print(Jaccard_four('abstrictiveness',a_list)[:10]) 
print('\n---accanthopterigious(Jacc_four)---')
print(Jaccard_four('accanthopterigious',a_list)[:10]) 
print('\n---artifitial inteligwnse(Jacc_four)---')
print(Jaccard_four('artifitial inteligwnse',a_list)[:10]) 
print('\n---agglumetation(Jacc_four)---')
print(Jaccard_four('agglumetation',a_list)[:10]) 

---abreviation(Jacc_four)---
[('abbreviation', 0.7), ('abbreviations', 0.6363636363636364), ('abbreviationi', 0.6363636363636364), ('abbreviatione', 0.6363636363636364), ('adbreviationi', 0.6363636363636364), ('adbreviatione', 0.6363636363636364), ('abbreviatio', 0.6), ('adbreviatio', 0.6), ('abreviativo', 0.6), ('abreviativa', 0.6)]

---abstrictiveness(Jacc_four)---
[('abstractiveness', 0.5), ('addictiveness', 0.4666666666666667), ('activeness', 0.46153846153846156), ('astrictive', 0.46153846153846156), ('abstriction', 0.42857142857142855), ('abstricting', 0.42857142857142855), ('astrictives', 0.42857142857142855), ('abstrict', 0.4166666666666667), ('abstrictions', 0.4), ('activenesses', 0.4)]

---accanthopterigious(Jacc_four)---
[('acanthopteri', 0.5), ('acanthopterygious', 0.45), ('acanthopteran', 0.3888888888888889), ('acanthopterous', 0.3684210526315789), ('acanthopterans', 0.3684210526315789), ('acanthopterygii', 0.35), ('acanthopterygian', 0.3333333333333333), ('acanthopterygian

In [35]:
# Jaccard Similarity-5grams
def Jaccard_five(string, wordlist):
    fivegram_str = chunk_word_into_letter_ngrams(string,5)
    sort = {}
    for word in wordlist:
        fivegram_word = chunk_word_into_letter_ngrams(word,5)
        sort[word] = Jaccard_Similarity(fivegram_str,fivegram_word)
    return sorted(sort.items(),key = lambda x: x[1],reverse = True)

In [36]:
print('---abreviation(Jacc_five)---')
print(Jaccard_five('abreviation',a_list)[:10]) 
print('\n---abstrictiveness(Jacc_five)---')
print(Jaccard_five('abstrictiveness',a_list)[:10]) 
print('\n---accanthopterigious(Jacc_five)---')
print(Jaccard_five('accanthopterigious',a_list)[:10]) 
print('\n---artifitial inteligwnse(Jacc_five)---')
print(Jaccard_five('artifitial inteligwnse',a_list)[:10])  
print('\n---agglumetation(Jacc_five)---')
print(Jaccard_five('agglumetation',a_list)[:10])  

---abreviation(Jacc_five)---
[('abbreviation', 0.6666666666666666), ('abbreviations', 0.6), ('abbreviationi', 0.6), ('abbreviatione', 0.6), ('adbreviationi', 0.6), ('adbreviatione', 0.6), ('abbreviatio', 0.5555555555555556), ('adbreviatio', 0.5555555555555556), ('abreviativo', 0.5555555555555556), ('abreviativa', 0.5555555555555556)]

---abstrictiveness(Jacc_five)---
[('addictiveness', 0.42857142857142855), ('activeness', 0.4166666666666667), ('astrictive', 0.4166666666666667), ('abstriction', 0.38461538461538464), ('abstricting', 0.38461538461538464), ('astrictives', 0.38461538461538464), ('abstractiveness', 0.375), ('abstrict', 0.36363636363636365), ('abstrictions', 0.35714285714285715), ('activenesses', 0.35714285714285715)]

---accanthopterigious(Jacc_five)---
[('acanthopteri', 0.4666666666666667), ('acanthopteran', 0.35294117647058826), ('acanthopterygious', 0.35), ('acanthopterous', 0.3333333333333333), ('acanthopterans', 0.3333333333333333), ('acanthopterygii', 0.315789473684210