In [16]:
# Loading all required packages
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
from gensim.models import KeyedVectors
import nltk
import numpy as np

In [17]:
# Loading Google Word2Vec model 
def read_word2vec(path):
    # Using First 50000 Word2Vecs
    word2vec = KeyedVectors.load_word2vec_format(path, binary=True, limit=50000)
    return word2vec

In [18]:
# Creating and normalizing word vectors
def average_word_vectors(words, model, vocabulary, num_features):
    # Creating an array of zero values.
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    # Adding word vectors
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    # normalizing word vectors
    if nwords:
        feature_vector = np.divide(feature_vector, nwords) 
        # Output is an array wherein we have one vector corresponding to each corpus
    return feature_vector

In [19]:
def averaged_word_vectorizer(corpus, model, num_features):
    # Unique set of words inside the corpus
    vocabulary = set(model.wv.index2word)
    # Reading word vectors for each sentence and each word of the sentence
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [20]:
#      """ Reading excel file containing text to be processed and classified
#     It will be compatible as follows:
#     Args:
#         path (str): path to file
#         sheet (str: sheet name
#         columns (list): column names containing text to be processed

#     Returns:
#         list of feedbacks, where one feedback can contain multiple sentences

#     Example:
#         feedback_list = get_feedback_text('path/to/file.xlsx', 'Sheet Name', ['col1', 'col2'])
#     """
def read_file(path):
    return pd.read_excel(path)

In [21]:
def preprocess(df, l_c):
    # Consider two columns of feedbacks and topics corresponding to eeach feedback
    data = pd.DataFrame(columns=['text', 'topic'])
    for c in l_c:
        for index, row in df.iterrows():
            text = str(row[c])
            if((text!=' ')and(text!='nan')):
                i = len(data)
                # replacing all special char with space
                text = re.sub('[^a-zA-Z0-9\n]', ' ', text)
                # replacing multiple spaces with single space
                text = re.sub('\s+', ' ', text)
                # bring whole text to same lower-case scale.
                data.at[i, 'text'] = text.lower()
                data.at[i, 'topic'] = c
    data.text.dropna(inplace = True)
    data = data.replace({'topic':{l_c[0]: 't0', l_c[1]: 't1', l_c[2]: 't2', l_c[3]: 't3'}})
    return data

In [22]:
# Extract tokens from document
def tokenized_text(data):
    wpt = nltk.WordPunctTokenizer()
    tokenized_corpus = [wpt.tokenize(document) for document in data.text]
    return tokenized_corpus

In [23]:
# K-means clustering model 
def fit_model(X, true_k):
    # true_k is equal to number of clusters
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=1, random_state=1 )
    model.fit(X)
    data.loc[:,'labels'] = model.labels_
    return model

In [24]:
# Finding top topics in each cluster
def terms_per_cluster(trained, terms):
    print("Top terms per cluster:")
    order_centroids = trained.cluster_centers_.argsort()[:, ::-1]
    for i in range(true_k):
        print("Cluster %d:" % i),
        for ind in order_centroids[i, :15]:
            print(' %s' % terms[ind]),
        print('\n')

In [25]:
# Comparing results
def check_result(res, clusters):
    for k in range(clusters):
        for i in range(clusters):
            topic = 't' + str(i)
            acc = len(res[(res.labels == k) & (res.topic == topic)])/len(res[res.labels == k])
            print(topic + ' and ' + str(k) + ' : ' + str(acc))

In [26]:
def test_sent(test_text, trained_model, dicc, word2vec, feature_size):
    test_text = str(test_text)
    if((test_text!=' ')and(test_text!='nan')):
        # replacing all special char with space
        test_text = re.sub('[^a-zA-Z]', ' ', test_text)
        # replacing multiple spaces with single space
        test_text = re.sub('\s+', ' ', test_text)
        # bring whole text to same lower-case scale.
        test_text = test_text.lower()
        wpt = nltk.WordPunctTokenizer()
        sample = []
        for word in test_text.split():
            try:
                sample.append(word)
            except:
                pass
        sample = np.reshape(sample, (-1, len(sample)))
        sample = pd.DataFrame(averaged_word_vectorizer(corpus = sample, model = word2vec,
                                                       num_features = feature_size))
        sent = trained_model.predict(sample)
        result_topic = dicc[sent[0]]
        return result_topic
    else:
        return 'unkown'

In [27]:
path = 'Sample_1.xlsm'
l_col = [ 'you have any comments about 1 - character, please provide them here.',
      'you have any comments about 2 - centres, please provide them here.',
      'you have any comments about 3 - around, please provide them here.',
      'you have any comments about 4 - thing green, please provide them here.' 
      ]
true_k = 4

In [28]:
df = read_file(path)
data = preprocess(df, l_col)

In [29]:
data

Unnamed: 0,text,topic
0,block that bounders and st where 3 storeys are...,t0
1,rapid development of with large apartment buil...,t0
2,i think large apartment developments should be...,t0
3,i believe that allowing 5 storey unit developm...,t0
4,our logic is that prehouses with gardens is th...,t0
...,...,...
838,to need to be the greenspace not more building...,t3
839,you not have development near or cb park,t3
840,trees need better care as many die as not wate...,t3
841,the number of young families and couples movin...,t3


In [30]:
tokenized_corpus = tokenized_text(data)
tokenized_corpus

[['block',
  'that',
  'bounders',
  'and',
  'st',
  'where',
  '3',
  'storeys',
  'are',
  'proposed',
  'should',
  'be',
  'rezoned',
  '5',
  'storey',
  'density',
  'will',
  'be',
  'no',
  'financial',
  'incentive',
  'for',
  'developers',
  'to',
  'buy',
  'out',
  'those',
  'owners',
  'of',
  'the',
  'ugly',
  'six',
  'packs',
  'to',
  'move',
  'on',
  'to',
  'redevelop',
  'into',
  '3',
  'storeys',
  'blocks',
  'already',
  'recently',
  'redeveloped',
  'will',
  'not',
  'change',
  '20',
  'years',
  'from',
  'now',
  'nothing',
  'will',
  'change',
  'unless',
  '5',
  'storey',
  'medium',
  'density',
  'is',
  'allowed',
  'in',
  'this',
  'block'],
 ['rapid',
  'development',
  'of',
  'with',
  'large',
  'apartment',
  'buildings',
  'and',
  'infills',
  'is',
  'very',
  'poor',
  'planning',
  'and',
  'has',
  'dramatically',
  'changed',
  'the',
  'suburb',
  'and',
  'its',
  'character',
  'a',
  'lot',
  'of',
  'it',
  'is',
  'ugly',
  

In [31]:
#read_pretrained model
filename = r'D:\Blackbook.ai\BCC_Neighborhood_NLP\GoogleNews-vectors-negative300.bin'
word2vec = read_word2vec(filename)

In [32]:
index2word = {token_index:token for token_index, token in enumerate(word2vec.index2word)} 
word2index = {token:token_index for token_index, token in enumerate(word2vec.index2word)} 

In [33]:
index2word

{0: '</s>',
 1: 'in',
 2: 'for',
 3: 'that',
 4: 'is',
 5: 'on',
 6: '##',
 7: 'The',
 8: 'with',
 9: 'said',
 10: 'was',
 11: 'the',
 12: 'at',
 13: 'not',
 14: 'as',
 15: 'it',
 16: 'be',
 17: 'from',
 18: 'by',
 19: 'are',
 20: 'I',
 21: 'have',
 22: 'he',
 23: 'will',
 24: 'has',
 25: '####',
 26: 'his',
 27: 'an',
 28: 'this',
 29: 'or',
 30: 'their',
 31: 'who',
 32: 'they',
 33: 'but',
 34: '$',
 35: 'had',
 36: 'year',
 37: 'were',
 38: 'we',
 39: 'more',
 40: '###',
 41: 'up',
 42: 'been',
 43: 'you',
 44: 'its',
 45: 'one',
 46: 'about',
 47: 'would',
 48: 'which',
 49: 'out',
 50: 'can',
 51: 'It',
 52: 'all',
 53: 'also',
 54: 'two',
 55: 'after',
 56: 'first',
 57: 'He',
 58: 'do',
 59: 'time',
 60: 'than',
 61: 'when',
 62: 'We',
 63: 'over',
 64: 'last',
 65: 'new',
 66: 'other',
 67: 'her',
 68: 'people',
 69: 'into',
 70: 'In',
 71: 'our',
 72: 'there',
 73: 'A',
 74: 'she',
 75: 'could',
 76: 'just',
 77: 'years',
 78: 'some',
 79: 'U.S.',
 80: 'three',
 81: 'million'

In [34]:
word2index

{'</s>': 0,
 'in': 1,
 'for': 2,
 'that': 3,
 'is': 4,
 'on': 5,
 '##': 6,
 'The': 7,
 'with': 8,
 'said': 9,
 'was': 10,
 'the': 11,
 'at': 12,
 'not': 13,
 'as': 14,
 'it': 15,
 'be': 16,
 'from': 17,
 'by': 18,
 'are': 19,
 'I': 20,
 'have': 21,
 'he': 22,
 'will': 23,
 'has': 24,
 '####': 25,
 'his': 26,
 'an': 27,
 'this': 28,
 'or': 29,
 'their': 30,
 'who': 31,
 'they': 32,
 'but': 33,
 '$': 34,
 'had': 35,
 'year': 36,
 'were': 37,
 'we': 38,
 'more': 39,
 '###': 40,
 'up': 41,
 'been': 42,
 'you': 43,
 'its': 44,
 'one': 45,
 'about': 46,
 'would': 47,
 'which': 48,
 'out': 49,
 'can': 50,
 'It': 51,
 'all': 52,
 'also': 53,
 'two': 54,
 'after': 55,
 'first': 56,
 'He': 57,
 'do': 58,
 'time': 59,
 'than': 60,
 'when': 61,
 'We': 62,
 'over': 63,
 'last': 64,
 'new': 65,
 'other': 66,
 'her': 67,
 'people': 68,
 'into': 69,
 'In': 70,
 'our': 71,
 'there': 72,
 'A': 73,
 'she': 74,
 'could': 75,
 'just': 76,
 'years': 77,
 'some': 78,
 'U.S.': 79,
 'three': 80,
 'million': 81

In [35]:
feature_size = 300    # Word vector dimensionality  
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=word2vec,
                                             num_features=feature_size)# get document level embeddings
sentence_with_embedding = pd.DataFrame(w2v_feature_array)
sentence_with_embedding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.023214,-0.006714,0.064401,0.073133,-0.059627,-0.043599,0.047528,-0.053001,0.051799,0.095397,...,-0.117057,0.020943,-0.068164,0.029496,0.024905,0.031825,-0.035953,-0.063669,0.013590,-0.054310
1,0.051753,0.039621,0.025340,0.069039,-0.053083,-0.037171,0.024907,-0.084676,0.065206,0.068440,...,-0.090238,0.014225,-0.054791,0.063705,-0.001520,0.020743,0.022604,-0.035064,0.050186,-0.041322
2,0.014664,0.038765,0.034824,0.125324,-0.067249,-0.029823,0.033368,-0.062650,0.030625,0.073065,...,-0.083568,0.030815,-0.076470,0.023152,-0.022078,-0.013197,-0.015569,-0.051680,0.062030,-0.064512
3,-0.010701,0.030304,0.054109,0.081180,-0.077948,-0.004409,0.034984,-0.051645,0.074295,0.064262,...,-0.082243,0.000819,-0.045814,0.025555,0.020977,0.036075,-0.052010,-0.056929,0.066396,-0.048546
4,0.027871,0.032892,0.038572,0.074852,-0.080632,-0.027350,0.046168,-0.059409,0.061953,0.061391,...,-0.122690,0.025577,-0.067139,0.008679,-0.026923,0.009250,-0.004940,-0.050601,0.053983,-0.027275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
838,0.022604,0.017110,0.014440,0.104725,-0.055223,-0.009985,0.053157,-0.073784,0.061795,0.053846,...,-0.078055,0.037884,-0.063528,0.034226,-0.000171,-0.013184,-0.003839,-0.045739,0.031994,-0.040155
839,0.039272,-0.068778,0.065273,0.137626,-0.092390,-0.018851,0.042737,-0.066232,-0.004551,0.055965,...,-0.086600,0.096034,-0.070326,0.035749,0.044528,-0.034616,-0.047435,-0.126866,0.014714,-0.035505
840,0.036872,0.064374,0.028909,0.112728,-0.091183,0.002245,0.076359,-0.025385,0.109442,0.062135,...,-0.045421,0.020182,-0.101699,0.032830,-0.040882,0.062439,-0.043146,-0.021549,0.083983,-0.038553
841,0.042835,0.018250,0.036113,0.074287,-0.080746,-0.037904,0.002342,-0.056982,0.043012,0.045480,...,-0.105935,0.015379,-0.091042,0.045043,0.038266,0.003242,-0.003562,-0.025918,0.032084,-0.038244


In [36]:
model = fit_model(sentence_with_embedding, true_k)
terms_per_cluster(model, index2word)

Top terms per cluster:
Cluster 0:
 expected
 her
 there
 U.S.
 like
 police
 put
 just
 took
 come
 area
 years
 that
 added
 while


Cluster 1:
 against
 added
 business
 top
 They
 Tuesday
 team
 are
 took
 early
 'm
 been
 under
 place
 #-#


Cluster 2:
 season
 against
 put
 expected
 Tuesday
 how
 off
 too
 Monday
 team
 They
 early
 five
 It
 been


Cluster 3:
 expected
 put
 there
 her
 police
 added
 took
 U.S.
 top
 like
 season
 how
 too
 They
 while




In [37]:
check_result(data, true_k)

t0 and 0 : 0.34231378763866877
t1 and 0 : 0.24088748019017434
t2 and 0 : 0.2202852614896989
t3 and 0 : 0.196513470681458
t0 and 1 : 0.0
t1 and 1 : 0.0
t2 and 1 : 0.0
t3 and 1 : 1.0
t0 and 2 : 0.0
t1 and 2 : 0.375
t2 and 2 : 0.5
t3 and 2 : 0.125
t0 and 3 : 0.38613861386138615
t1 and 3 : 0.18316831683168316
t2 and 3 : 0.18811881188118812
t3 and 3 : 0.24257425742574257


In [41]:
dic = {
    0:'t0' ,
    1:'t3' ,
    2:'t2' ,
    3:'t1' 
}

In [42]:
data.labels = data.labels.replace(dic)

In [43]:
data

Unnamed: 0,text,topic,labels
0,block that bounders and st where 3 storeys are...,t0,t0
1,rapid development of with large apartment buil...,t0,t0
2,i think large apartment developments should be...,t0,t0
3,i believe that allowing 5 storey unit developm...,t0,t0
4,our logic is that prehouses with gardens is th...,t0,t0
...,...,...,...
838,to need to be the greenspace not more building...,t3,t0
839,you not have development near or cb park,t3,t2
840,trees need better care as many die as not wate...,t3,t0
841,the number of young families and couples movin...,t3,t0


In [44]:
len(data[data.labels == data.topic])/len(data)

0.30723606168446027

In [45]:
test_text = """
i think large apartment developments should be built along major road e g street street etc i do not agree with building them along areas with large parks as the people that live here enjoy the being able to look out their windows and see the park this is why they live here
"""
test_sent(test_text, model, dic, word2vec, feature_size)

't0'

In [46]:
df_test = pd.read_excel('File_Test1.xlsx')
df_test.head()

Unnamed: 0,ID number,Comments
0,1,its heading in the right direction. completely...
1,2,for the environment. extremely small residenti...
2,3,"to preserving the bushland, commitment to ensu..."
3,4,N/A. bus service for dont want town houses at ...
4,5,"there is a strategy, and subsequently a plan....."


In [47]:
# Predicting topics for each sentences in the test dataset
for index, row in df_test.iterrows():
    df_test.at[index, 'label'] = test_sent(row.Comments, model, dic, word2vec, feature_size)

In [48]:
df_test.label.value_counts()

t0    116
t1     60
Name: label, dtype: int64

In [49]:
df_test.head()

Unnamed: 0,ID number,Comments,label
0,1,its heading in the right direction. completely...,t0
1,2,for the environment. extremely small residenti...,t0
2,3,"to preserving the bushland, commitment to ensu...",t1
3,4,N/A. bus service for dont want town houses at ...,t0
4,5,"there is a strategy, and subsequently a plan.....",t0
