In [1]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict
import random
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk
from gensim.models import Word2Vec
#nltk.download('stopwords')
#nltk.download('punkt')

In [5]:
data = pd.read_excel(r"C:\Users\anike\Desktop\R system\nlp\Book1.xlsx")

In [6]:
data.head()

Unnamed: 0,OSAT Comment,Tag1,Tag2,Tag3,Tag4
0,John came on the day and time allotted. He ca...,Punctual/Arrived Promptly,Issue resolved,Communicative,
1,Andrew is pleasant and friendly on meeting the...,Pleasant,Helpful/Friendly Staff,Respectful/Patient,Clear Explanation
2,"He was very helpful,explain everything that he...",Helpful/Friendly Staff,Clear Explanation,,
3,"Very professional, knowledgeable and kind - th...",Professional Staff,Knowledgeable/Skillful/Experienced/Confident,Careful/Understandable/Kind,
4,He was quick and efficient,Quick Resolution/Response,Efficient/Effective,,


In [4]:
data.shape

(601, 6)

In [7]:
comments_data = data.loc[:,'OSAT Comment']

In [8]:
comments_data.head()

0    John came on the day and time allotted.  He ca...
1    Andrew is pleasant and friendly on meeting the...
2    He was very helpful,explain everything that he...
3    Very professional, knowledgeable and kind - th...
4                           He was quick and efficient
Name: OSAT Comment, dtype: object

In [9]:
type(comments_data)

pandas.core.series.Series

# Removing Punctuations

In [10]:
# removing punctuations from the comments
comments_data = comments_data.apply(lambda x: re.sub('[^\w\s]','', x))

In [11]:
comments_data.head()

0    John came on the day and time allotted  He cam...
1    Andrew is pleasant and friendly on meeting the...
2    He was very helpfulexplain everything that he ...
3    Very professional knowledgeable and kind  the ...
4                           He was quick and efficient
Name: OSAT Comment, dtype: object

# Converting to lower case

In [12]:
comments_data = comments_data.apply(lambda x: x.lower())

In [13]:
comments_data.head()

0    john came on the day and time allotted  he cam...
1    andrew is pleasant and friendly on meeting the...
2    he was very helpfulexplain everything that he ...
3    very professional knowledgeable and kind  the ...
4                           he was quick and efficient
Name: OSAT Comment, dtype: object

# Stopwords removal

In [14]:
all_stopwords = stopwords.words('english')
print(all_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
required_stopwords = all_stopwords[:all_stopwords.index('ain')]

In [16]:
val = ['not','don','don\'t']
required_stopwords = [stopword for stopword in required_stopwords if stopword not in val]
required_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
comments_data = comments_data.apply(lambda x: ' '.join([y for y in x.split() if y not in required_stopwords]))

In [18]:
comments_data.head()

0    john came day time allotted came house test er...
1    andrew pleasant friendly meeting client also a...
2    helpfulexplain everything doingso understood n...
3                 professional knowledgeable kind best
4                                      quick efficient
Name: OSAT Comment, dtype: object

# Generate N-grams

In [19]:
def create_grams(text, n = 2):
    
    words = text.split()
    temp=zip(*[words[i:] for i in range(0,n)])
    
    ans=[' '.join(n) for n in temp]
    return ans

In [20]:
x = create_grams('john came day time allotted came house test', n=2)
print(x)

['john came', 'came day', 'day time', 'time allotted', 'allotted came', 'came house', 'house test']


In [21]:
comments_grams = comments_data.apply(create_grams)

In [22]:
comments_grams

0      [john came, came day, day time, time allotted,...
1      [andrew pleasant, pleasant friendly, friendly ...
2      [helpfulexplain everything, everything doingso...
3      [professional knowledgeable, knowledgeable kin...
4                                      [quick efficient]
                             ...                        
596    [punctual friendly, friendly hard, hard workin...
597                          [polite hard, hard working]
598                                     [prompt service]
599                                [helpful informative]
600                                     [polite helpful]
Name: OSAT Comment, Length: 601, dtype: object

In [23]:
grams_dict = defaultdict(int)
for comment in comments_grams:
    for grams in comment:
        grams_dict[grams] += 1
print(grams_dict)

defaultdict(<class 'int'>, {'john came': 1, 'came day': 1, 'day time': 1, 'time allotted': 1, 'allotted came': 1, 'came house': 1, 'house test': 1, 'test error': 1, 'error broadband': 1, 'broadband internet': 1, 'internet problems': 2, 'problems went': 1, 'went checked': 1, 'checked main': 1, 'main phone': 1, 'phone wire': 1, 'wire near': 1, 'near property': 1, 'property came': 1, 'came back': 2, 'back told': 1, 'told error': 1, 'error end': 1, 'end would': 1, 'would go': 1, 'go box': 1, 'box would': 1, 'would fix': 2, 'fix error': 1, 'error really': 1, 'really bad': 1, 'bad day': 1, 'day weather': 1, 'weather wise': 1, 'wise pouring': 1, 'pouring rain': 1, 'rain fix': 1, 'fix problem': 32, 'problem phoned': 1, 'phoned let': 1, 'let know': 3, 'know went': 1, 'went way': 4, 'andrew pleasant': 1, 'pleasant friendly': 3, 'friendly meeting': 1, 'meeting client': 1, 'client also': 1, 'also actually': 1, 'actually listens': 1, 'listens say': 1, 'say explains': 1, 'explains enough': 1, 'enoug

In [24]:
grams_dict

defaultdict(int,
            {'john came': 1,
             'came day': 1,
             'day time': 1,
             'time allotted': 1,
             'allotted came': 1,
             'came house': 1,
             'house test': 1,
             'test error': 1,
             'error broadband': 1,
             'broadband internet': 1,
             'internet problems': 2,
             'problems went': 1,
             'went checked': 1,
             'checked main': 1,
             'main phone': 1,
             'phone wire': 1,
             'wire near': 1,
             'near property': 1,
             'property came': 1,
             'came back': 2,
             'back told': 1,
             'told error': 1,
             'error end': 1,
             'end would': 1,
             'would go': 1,
             'go box': 1,
             'box would': 1,
             'would fix': 2,
             'fix error': 1,
             'error really': 1,
             'really bad': 1,
             'bad day': 1,
    

In [25]:
grams_df = pd.Series(grams_dict)
grams_df = grams_df.reset_index()
grams_df.columns = ['bigrams', 'frequency']
grams_df.sort_values(by = 'frequency', ascending = False, inplace = True)
grams_df.reset_index(inplace = True, drop = True)
grams_df.head()

Unnamed: 0,bigrams,frequency
0,great service,41
1,fix problem,32
2,good communication,22
3,polite friendly,19
4,polite helpful,18


In [24]:
grams_df

Unnamed: 0,bigrams,frequency
0,great service,41
1,fix problem,32
2,good communication,22
3,polite friendly,19
4,polite helpful,18
...,...,...
3895,early efficient,1
3896,efficient personable,1
3897,informative also,1
3898,quick proud,1


In [26]:
grams_df.to_csv('trigrams_frequency.csv')

# Finding top tags(N-grams)

In [27]:
comments_data.head()

0    john came day time allotted came house test er...
1    andrew pleasant friendly meeting client also a...
2    helpfulexplain everything doingso understood n...
3                 professional knowledgeable kind best
4                                      quick efficient
Name: OSAT Comment, dtype: object

In [28]:
comments_tokenized = comments_data.apply(lambda x: nltk.word_tokenize(x))

In [29]:
comments_tokenized.head()

0    [john, came, day, time, allotted, came, house,...
1    [andrew, pleasant, friendly, meeting, client, ...
2    [helpfulexplain, everything, doingso, understo...
3            [professional, knowledgeable, kind, best]
4                                   [quick, efficient]
Name: OSAT Comment, dtype: object

In [30]:
comments_grams.head()

0    [john came, came day, day time, time allotted,...
1    [andrew pleasant, pleasant friendly, friendly ...
2    [helpfulexplain everything, everything doingso...
3    [professional knowledgeable, knowledgeable kin...
4                                    [quick efficient]
Name: OSAT Comment, dtype: object

In [31]:
comments_word_vectors = Word2Vec(comments_tokenized, vector_size=50)

In [32]:
comments_word_vectors1 = comments_word_vectors.wv

In [33]:
comments_word_vectors1['pleasant']

array([-0.00277891, -0.01057127, -0.01643177, -0.00991831,  0.03525567,
       -0.0554473 ,  0.02078449,  0.07581734, -0.05024346, -0.03817106,
        0.00490142, -0.07395313,  0.004579  ,  0.03963151, -0.00690912,
        0.03282344,  0.03655024, -0.00639071, -0.04651167, -0.04171829,
        0.01682314,  0.0172086 ,  0.04149372, -0.01473533,  0.05274713,
        0.00268791, -0.02824595,  0.02305246, -0.03404203, -0.01056755,
        0.01767786,  0.00650467, -0.01134054,  0.00013037, -0.05319519,
        0.01165644,  0.06652086,  0.01562475,  0.02353692, -0.00587001,
        0.06581862, -0.00931695, -0.00558975, -0.01050377,  0.0612694 ,
        0.02042017, -0.03372247, -0.00386441,  0.02891462,  0.02891601],
      dtype=float32)

In [33]:
grams_df

Unnamed: 0,bigrams,frequency
0,great service,41
1,fix problem,32
2,good communication,22
3,polite friendly,19
4,polite helpful,18
...,...,...
3895,early efficient,1
3896,efficient personable,1
3897,informative also,1
3898,quick proud,1


In [34]:
def generate_vectors_for_ngrams(data):
    new_data = data.copy()
    new_data['vectors'] = new_data['bigrams'].apply(lambda x: nltk.word_tokenize(x))
    new_data['avg_vectors'] = new_data['vectors']
    #print(new_data.head())
    
    for i in range(len(new_data)):
        for k in range(len(new_data.iloc[i,2])):
            if new_data.iloc[i,2][k] not in comments_word_vectors1:
                new_data.iloc[i,2][k] = np.zeros(50)
            else:
                new_data.iloc[i,2][k] = comments_word_vectors1[new_data.iloc[i,2][k]]
        vec = np.asarray(new_data.iloc[i,2])
        #print(vec)
        avg_vec = vec.mean(axis = 0)
        new_data['avg_vectors'][i] = avg_vec
        
    return new_data

ngrams_vec = generate_vectors_for_ngrams(data = grams_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['avg_vectors'][i] = avg_vec


In [35]:
ngrams_vec.head()

Unnamed: 0,bigrams,frequency,vectors,avg_vectors
0,great service,41,"[[0.015589557, 0.0018338881, -0.020247156, -0....","[0.01342584, 3.93847e-05, -0.017523328, -0.043..."
1,fix problem,32,"[[0.022351824, -0.00039258986, 0.015675824, -0...","[0.008265625, 0.011611663, 0.007302192, -0.044..."
2,good communication,22,"[[0.0216104, -0.006992211, 0.01734015, -0.0009...","[0.015815035, 0.0057261297, 0.0069640637, -0.0..."
3,polite friendly,19,"[[0.009424436, 0.0054805884, 0.008775975, -0.0...","[0.0004877993, 0.008992991, -0.00035146717, -0..."
4,polite helpful,18,"[[0.009424436, 0.0054805884, 0.008775975, -0.0...","[0.0076359934, 0.007206866, -0.0029129456, -0...."


In [36]:
ngrams_vec.head(100)

Unnamed: 0,bigrams,frequency,vectors,avg_vectors
0,great service,41,"[[0.015589557, 0.0018338881, -0.020247156, -0....","[0.01342584, 3.93847e-05, -0.017523328, -0.043..."
1,fix problem,32,"[[0.022351824, -0.00039258986, 0.015675824, -0...","[0.008265625, 0.011611663, 0.007302192, -0.044..."
2,good communication,22,"[[0.0216104, -0.006992211, 0.01734015, -0.0009...","[0.015815035, 0.0057261297, 0.0069640637, -0.0..."
3,polite friendly,19,"[[0.009424436, 0.0054805884, 0.008775975, -0.0...","[0.0004877993, 0.008992991, -0.00035146717, -0..."
4,polite helpful,18,"[[0.009424436, 0.0054805884, 0.008775975, -0.0...","[0.0076359934, 0.007206866, -0.0029129456, -0...."
...,...,...,...,...
95,helpful efficient,4,"[[0.005847551, 0.008933144, -0.014601866, -0.0...","[-0.0030030962, 0.0096694315, -0.0024842634, -..."
96,engineer took,4,"[[-0.0054649618, 0.005278886, -0.015409239, -0...","[0.0048055737, 0.0066169556, -0.0030304994, -0..."
97,went way,4,"[[0.00021664191, -0.009440921, 0.014661997, -0...","[-0.000743998, -0.012376807, 0.0048500476, -0...."
98,efficient service,4,"[[-0.011853743, 0.010405719, 0.009633339, -0.0...","[-0.0002958104, 0.0043253005, -0.0025830804, -..."


In [37]:
def similar_documents(mytext, df):
    df_new = df.copy()
    #input_vect = vectorizer.transform([mytext])
    input_vect = df_new.loc[df_new['bigrams']==mytext,'avg_vectors']
    input_vect = input_vect.iloc[0]
    input_vect = input_vect.reshape(1,-1)
    #print(input_vect.shape)
    df_new['similarity'] = df_new['avg_vectors'].apply(lambda x: cosine_similarity(input_vect,x.reshape(1,-1)))
    df_new['similarity'] = df_new['similarity'].apply(lambda x: float(x))
    #df_ngram['similarity'] = cosine_similarity(input_vect, X).flatten()
    return (df_new.sort_values(by='similarity', ascending=False)[['bigrams', 'similarity']])
 
 

In [38]:
all_bigrams = ngrams_vec['bigrams'].to_list()

In [39]:
#calculating cosine similarity
all_bigrams_copy = all_bigrams.copy()
#random.shuffle(all_bigrams_copy)
all_bigrams_copy = all_bigrams_copy[:100]
final_tags = []
idx = 0
while idx < len(all_bigrams_copy)-1:
    similarity_df = similar_documents(all_bigrams_copy[idx], ngrams_vec)
    similarity_df_req = similarity_df[similarity_df['similarity'] >= 0.95]
    
    lst = similarity_df_req['bigrams'].to_list()
    lst1 = lst[1:]
    print('ngram :', lst[0])
    print(len(lst1))
    all_bigrams_copy = all_bigrams_copy[:idx] + [x for x in all_bigrams_copy[idx:] if x not in lst1]
    final_tags.append(all_bigrams_copy[idx])
    
    print(idx, end = ' ')
    idx += 1


ngram : great service
281
0 ngram : communication good
23
1 ngram : explained everything
232
2 ngram : polite efficient
591
3 ngram : hard working
19
4 ngram : knowledgeable friendly
111
5 ngram : went beyond
35
6 ngram : social distancing
19
7 ngram : covid rules
70
8 ngram : extremely helpful
79
9 ngram : respectful property
111
10 ngram : positive attitude
30
11 ngram : found fault
259
12 ngram : answered questions
2
13 ngram : job efficiently
99
14 ngram : arrived promptly
32
15 ngram : followed covid
57
16 ngram : really nice
40
17 ngram : clean tidy
11
18 ngram : turned time
62
19 ngram : communicated well
0
20 ngram : easy talk
1
21 ngram : observed covid
40
22 ngram : credit openreach
1
23 ngram : problem solved
143
24 ngram : make sure
18
25 ngram : wore face
12
26 ngram : professional manner
63
27 ngram : wearing mask
58
28 ngram : able fix
65
29 

In [41]:
print('length of all_bigrams_copy:',len(all_bigrams_copy))
print('length of final_tags:',len(final_tags))



length of all_bigrams_copy: 31
length of final_tags: 30


In [42]:
print(all_bigrams_copy)

['great service', 'polite friendly', 'explained everything', 'polite efficient', 'hard working', 'kept informed', 'went beyond', 'social distancing', 'covid rules', 'extremely helpful', 'respectful property', 'positive attitude', 'found fault', 'answered questions', 'job efficiently', 'arrived promptly', 'followed covid', 'really nice', 'clean tidy', 'turned time', 'communicated well', 'easy talk', 'observed covid', 'credit openreach', 'safety conscious', 'make sure', 'wore face', 'professional manner', 'wearing mask', 'able fix', 'service received']


# Tagging the verbatims

In [43]:
# verify manually for 25 comments:
comments_grams.head()

0    [john came, came day, day time, time allotted,...
1    [andrew pleasant, pleasant friendly, friendly ...
2    [helpfulexplain everything, everything doingso...
3    [professional knowledgeable, knowledgeable kin...
4                                    [quick efficient]
Name: OSAT Comment, dtype: object

<gensim.models.keyedvectors.KeyedVectors at 0x15bc64dc6d0>

In [51]:
comments_grams_copy = comments_grams.copy()
#comments_grams_copy = comments_grams_copy[:5]
#comments_grams_copy = pd.DataFrame(comments_grams_copy)
grams_vec_ser = pd.Series(name = 'vectors_all', dtype = float)
for idx,lst in enumerate(comments_grams_copy):
    print(idx, end = ' ')
    gram_avg_vec_lst = []
    for gram in lst:
        tokens  = nltk.word_tokenize(gram)
        gram_vec = []
        for word in tokens:
            if word not in comments_word_vectors1:
                gram_vec.append(np.zeros(50))
            else:
                gram_vec.append(comments_word_vectors1[word])
        
        gram_vec_avg = np.asarray(gram_vec).mean(axis = 0)
        gram_avg_vec_lst.append(gram_vec_avg.tolist())
        
    #print(gram_avg_vec_lst)
    #print(len(gram_avg_vec_lst))
    #print(type(gram_avg_vec_lst[0]))
    #print('##################')
    #grams_vec_ser = pd.concat([grams_vec_ser,pd.Series(gram_avg_vec_lst)], axis =1)
    grams_vec_ser.loc[idx] = gram_avg_vec_lst
    #print(grams_vec_ser)

comments_grams_copy = pd.concat([comments_grams_copy,grams_vec_ser], axis = 1)    
      

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 27

In [130]:
comments_grams_copy

Unnamed: 0,OSAT Comment,vectors_all
0,"[john came, came day, day time, time allotted,...","[[0.01334892213344574, 0.009084227494895458, 0..."
1,"[andrew pleasant, pleasant friendly, friendly ...","[[-0.0013894560979679227, -0.00528563605621457..."
2,"[helpfulexplain everything, everything doingso...","[[-0.0009466104093007743, 0.000368529174011200..."
3,"[professional knowledgeable, knowledgeable kin...","[[0.0034444970078766346, 0.0016729412600398064..."
4,[quick efficient],"[[-0.0011313194409012794, 0.000640564132481813..."
...,...,...
596,"[punctual friendly, friendly hard, hard workin...","[[-0.0003798645921051502, 0.004209750331938267..."
597,"[polite hard, hard working]","[[0.004958176054060459, 0.0005674920976161957,..."
598,[prompt service],"[[0.015336825512349606, -0.006925190798938274,..."
599,[helpful informative],"[[0.00014675455167889595, -0.00188855407759547..."


In [134]:
tags_gram_vec = ngrams_vec[ngrams_vec['bigrams'].isin(all_bigrams_copy)]
tags_gram_vec.reset_index(drop = True, inplace = True)

In [135]:
tags_gram_vec.head()

Unnamed: 0,bigrams,frequency,vectors,avg_vectors
0,great service,41,"[[0.015589557, 0.0018338892, -0.020247156, -0....","[0.01342584, 3.938569e-05, -0.017523328, -0.04..."
1,polite friendly,19,"[[0.009424437, 0.0054805893, 0.008775977, -0.0...","[0.00048779882, 0.008992991, -0.0003514653, -0..."
2,explained everything,17,"[[-0.010104227, 0.0073551536, -0.002324775, -0...","[-0.0059987237, 0.004046106, -0.009104758, -0...."
3,polite efficient,17,"[[0.009424437, 0.0054805893, 0.008775977, -0.0...","[-0.0012146533, 0.007943154, 0.00920466, -0.01..."
4,hard working,14,"[[0.00049191545, -0.004345605, -0.01701518, -0...","[-0.0021414813, -0.0083319135, -0.008154917, -..."


In [148]:
tags_gram_vec.shape

(31, 4)

In [180]:
tags_gram_vec_copy = tags_gram_vec[['bigrams', 'avg_vectors']].copy()
tags_gram_vec_copy['similarity'] = tags_gram_vec['frequency']
comments_grams_copy_1 = comments_grams_copy.iloc[:,:].copy()
comments_data_copy = comments_data.copy()
comments_data_copy = comments_data_copy.reset_index()
comments_data_copy.columns = ['Tags', 'OSAT Comment']
comments_data_copy = comments_data_copy[['OSAT Comment', 'Tags']]
for idx,lst in enumerate(comments_grams_copy_1['vectors_all']):
    print(idx, end = ' ')
    tags_lst = []
    for gram_vec in lst:
        for k, tag_vec in enumerate(tags_gram_vec['avg_vectors']):
            tags_gram_vec_copy.loc[k,'similarity'] = float(cosine_similarity(tag_vec.reshape(1,-1),np.asarray(gram_vec).reshape(1,-1)))
        tags_gram_vec_copy.sort_values(by = 'similarity',ascending=False)
        tags_lst.extend(tags_gram_vec_copy[tags_gram_vec_copy['similarity'] > 0.95]['bigrams'].tolist())
    tags_lst = list(set(tags_lst))
    #print(tags_lst)
    comments_data_copy.loc[idx,'Tags'] = ', '.join(tags_lst)        

for i,tag in enumerate(comments_data_copy['Tags']):
    if tag == '':
        comments_data_copy.loc[i,'Tags'] = comments_data_copy.loc[i,'OSAT Comment']
        
comments_data_copy.head()

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 27

Unnamed: 0,OSAT Comment,Tags
0,john came day time allotted came house test er...,"polite efficient, respectful property, went be..."
1,andrew pleasant friendly meeting client also a...,"polite efficient, found fault, able fix, polit..."
2,helpfulexplain everything doingso understood n...,"polite efficient, great service, polite friend..."
3,professional knowledgeable kind best,professional knowledgeable kind best
4,quick efficient,polite efficient


In [179]:
comments_data_copy.head(10)

Unnamed: 0,OSAT Comment,Tags
0,john came day time allotted came house test er...,"polite efficient, respectful property, went be..."
1,andrew pleasant friendly meeting client also a...,"polite efficient, found fault, able fix, polit..."
2,helpfulexplain everything doingso understood n...,"polite efficient, great service, polite friend..."
3,professional knowledgeable kind best,professional knowledgeable kind best
4,quick efficient,polite efficient
5,steve really helpful would give 10 not wear mask,"wearing mask, polite friendly, extremely helpf..."
6,polite friendly patient reassuring problem sol...,"polite efficient, found fault, polite friendly..."
7,efficient helpful completely sorted problem,"polite efficient, great service, extremely hel..."
8,clear communication issue speedy resolution,clear communication issue speedy resolution
9,job efficiently observed good hygiene polite h...,"polite efficient, arrived promptly, job effici..."


In [181]:
comments_data_copy[comments_data_copy['Tags'] == '']

Unnamed: 0,OSAT Comment,Tags
